Example #1
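The examples below are excerpts from a tensorqtl-style cis-QTL mapping module and omit the file header. A minimal sketch of the imports they rely on is given here; the package-internal helpers (Residualizer, SimpleLogger, impute_mean, genotypeio, eigenmt, the calculate_* functions, etc.) are assumed to come from the same package, and their exact module paths are an assumption, so they are shown as comments only.

import os
import time
from collections import OrderedDict

import numpy as np
import pandas as pd
import torch
from scipy import stats

# Assumed package-internal helpers (module layout is an assumption, hence commented out):
# from tensorqtl import genotypeio, eigenmt
# from tensorqtl.core import (SimpleLogger, Residualizer, impute_mean, calculate_maf,
#                             filter_maf_interaction, output_dtype_dict)
# calculate_cis_nominal, calculate_cis_permutations, calculate_interaction_nominal,
# calculate_beta_approx_pval, prepare_cis_output, _process_group_permutations and coloc
# are defined elsewhere in the same module/package.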
def map_nominal(genotype_df,
                variant_df,
                phenotype_df,
                phenotype_pos_df,
                covariates_df,
                prefix,
                interaction_s=None,
                maf_threshold_interaction=0.05,
                group_s=None,
                window=1000000,
                run_eigenmt=False,
                output_dir='.',
                write_top=True,
                write_stats=True,
                logger=None,
                verbose=True):
    """
    cis-QTL mapping: nominal associations for all variant-phenotype pairs

    Association results for each chromosome are written to parquet files
    in the format <output_dir>/<prefix>.cis_qtl_pairs.<chr>.parquet

    If interaction_s is provided, the top association per phenotype is
    written to <output_dir>/<prefix>.cis_qtl_top_assoc.txt.gz unless
    write_top is set to False, in which case it is returned as a DataFrame
    """
    assert np.all(phenotype_df.columns == covariates_df.index)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if logger is None:
        logger = SimpleLogger()
    if group_s is not None:
        group_dict = group_s.to_dict()

    logger.write(
        'cis-QTL mapping: nominal associations for all variant-phenotype pairs'
    )
    logger.write('  * {} samples'.format(phenotype_df.shape[1]))
    logger.write('  * {} phenotypes'.format(phenotype_df.shape[0]))
    logger.write('  * {} covariates'.format(covariates_df.shape[1]))
    logger.write('  * {} variants'.format(variant_df.shape[0]))
    if interaction_s is not None:
        assert np.all(interaction_s.index == covariates_df.index)
        logger.write('  * including interaction term')
        if maf_threshold_interaction > 0:
            logger.write('    * using {:.2f} MAF threshold'.format(
                maf_threshold_interaction))

    covariates_t = torch.tensor(covariates_df.values,
                                dtype=torch.float32).to(device)
    residualizer = Residualizer(covariates_t)
    del covariates_t

    genotype_ix = np.array(
        [genotype_df.columns.tolist().index(i) for i in phenotype_df.columns])
    genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
    if interaction_s is None:
        dof = phenotype_df.shape[1] - 2 - covariates_df.shape[1]
    else:
        dof = phenotype_df.shape[1] - 4 - covariates_df.shape[1]
        interaction_t = torch.tensor(interaction_s.values.reshape(1, -1),
                                     dtype=torch.float32).to(device)
        if maf_threshold_interaction > 0:
            interaction_mask_t = torch.BoolTensor(
                interaction_s >= interaction_s.median()).to(device)
        else:
            interaction_mask_t = None

    igc = genotypeio.InputGeneratorCis(genotype_df,
                                       variant_df,
                                       phenotype_df,
                                       phenotype_pos_df,
                                       group_s=group_s,
                                       window=window)
    # iterate over chromosomes
    best_assoc = []
    start_time = time.time()
    k = 0
    logger.write('  * Computing associations')
    for chrom in igc.chrs:
        logger.write('    Mapping chromosome {}'.format(chrom))
        # allocate arrays
        n = 0
        if group_s is None:
            for i in igc.phenotype_pos_df[igc.phenotype_pos_df['chr'] ==
                                          chrom].index:
                j = igc.cis_ranges[i]
                n += j[1] - j[0] + 1
        else:
            for i in igc.group_s[igc.phenotype_pos_df['chr'] ==
                                 chrom].drop_duplicates().index:
                j = igc.cis_ranges[i]
                n += j[1] - j[0] + 1

        chr_res = OrderedDict()
        chr_res['phenotype_id'] = []
        chr_res['variant_id'] = []
        chr_res['tss_distance'] = np.empty(n, dtype=np.int32)
        chr_res['maf'] = np.empty(n, dtype=np.float32)
        chr_res['ma_samples'] = np.empty(n, dtype=np.int32)
        chr_res['ma_count'] = np.empty(n, dtype=np.int32)
        if interaction_s is None:
            chr_res['pval_nominal'] = np.empty(n, dtype=np.float64)
            chr_res['slope'] = np.empty(n, dtype=np.float32)
            chr_res['slope_se'] = np.empty(n, dtype=np.float32)
        else:
            chr_res['pval_g'] = np.empty(n, dtype=np.float64)
            chr_res['b_g'] = np.empty(n, dtype=np.float32)
            chr_res['b_g_se'] = np.empty(n, dtype=np.float32)
            chr_res['pval_i'] = np.empty(n, dtype=np.float64)
            chr_res['b_i'] = np.empty(n, dtype=np.float32)
            chr_res['b_i_se'] = np.empty(n, dtype=np.float32)
            chr_res['pval_gi'] = np.empty(n, dtype=np.float64)
            chr_res['b_gi'] = np.empty(n, dtype=np.float32)
            chr_res['b_gi_se'] = np.empty(n, dtype=np.float32)

        start = 0
        if group_s is None:
            for k, (phenotype, genotypes, genotype_range,
                    phenotype_id) in enumerate(
                        igc.generate_data(chrom=chrom, verbose=verbose),
                        k + 1):
                # copy genotypes to GPU
                phenotype_t = torch.tensor(phenotype,
                                           dtype=torch.float).to(device)
                genotypes_t = torch.tensor(genotypes,
                                           dtype=torch.float).to(device)
                genotypes_t = genotypes_t[:, genotype_ix_t]
                impute_mean(genotypes_t)

                variant_ids = variant_df.index[
                    genotype_range[0]:genotype_range[-1] + 1]
                tss_distance = np.int32(
                    variant_df['pos'].
                    values[genotype_range[0]:genotype_range[-1] + 1] -
                    igc.phenotype_tss[phenotype_id])

                if interaction_s is None:
                    res = calculate_cis_nominal(genotypes_t, phenotype_t,
                                                residualizer)
                    tstat, slope, slope_se, maf, ma_samples, ma_count = [
                        i.cpu().numpy() for i in res
                    ]
                    n = len(variant_ids)
                else:
                    genotypes_t, mask_t = filter_maf_interaction(
                        genotypes_t,
                        interaction_mask_t=interaction_mask_t,
                        maf_threshold_interaction=maf_threshold_interaction)
                    if genotypes_t.shape[0] > 0:
                        res = calculate_interaction_nominal(
                            genotypes_t,
                            phenotype_t.unsqueeze(0),
                            interaction_t,
                            residualizer,
                            return_sparse=False)
                        tstat, b, b_se, maf, ma_samples, ma_count = [
                            i.cpu().numpy() for i in res
                        ]
                        mask = mask_t.cpu().numpy()
                        variant_ids = variant_ids[mask]
                        tss_distance = tss_distance[mask]
                        n = len(variant_ids)

                        # top association
                        ix = np.nanargmax(np.abs(tstat[:, 2]))
                        top_s = pd.Series(
                            [phenotype_id, variant_ids[ix], tss_distance[ix],
                             maf[ix], ma_samples[ix], ma_count[ix],
                             tstat[ix, 0], b[ix, 0], b_se[ix, 0],
                             tstat[ix, 1], b[ix, 1], b_se[ix, 1],
                             tstat[ix, 2], b[ix, 2], b_se[ix, 2]],
                            index=chr_res.keys())
                        if run_eigenmt:  # compute eigenMT correction
                            top_s['tests_emt'] = eigenmt.compute_tests(
                                genotypes_t,
                                var_thresh=0.99,
                                variant_window=200)
                        best_assoc.append(top_s)
                    else:  # all genotypes in window were filtered out
                        n = 0

                if n > 0:
                    chr_res['phenotype_id'].extend([phenotype_id] * n)
                    chr_res['variant_id'].extend(variant_ids)
                    chr_res['tss_distance'][start:start + n] = tss_distance
                    chr_res['maf'][start:start + n] = maf
                    chr_res['ma_samples'][start:start + n] = ma_samples
                    chr_res['ma_count'][start:start + n] = ma_count
                    if interaction_s is None:
                        chr_res['pval_nominal'][start:start + n] = tstat
                        chr_res['slope'][start:start + n] = slope
                        chr_res['slope_se'][start:start + n] = slope_se
                    else:
                        chr_res['pval_g'][start:start + n] = tstat[:, 0]
                        chr_res['b_g'][start:start + n] = b[:, 0]
                        chr_res['b_g_se'][start:start + n] = b_se[:, 0]
                        chr_res['pval_i'][start:start + n] = tstat[:, 1]
                        chr_res['b_i'][start:start + n] = b[:, 1]
                        chr_res['b_i_se'][start:start + n] = b_se[:, 1]
                        chr_res['pval_gi'][start:start + n] = tstat[:, 2]
                        chr_res['b_gi'][start:start + n] = b[:, 2]
                        chr_res['b_gi_se'][start:start + n] = b_se[:, 2]
                start += n  # update pointer
        else:  # groups
            for k, (phenotypes, genotypes, genotype_range, phenotype_ids,
                    group_id) in enumerate(
                        igc.generate_data(chrom=chrom, verbose=verbose),
                        k + 1):

                # copy genotypes to GPU
                genotypes_t = torch.tensor(genotypes,
                                           dtype=torch.float).to(device)
                genotypes_t = genotypes_t[:, genotype_ix_t]
                impute_mean(genotypes_t)

                variant_ids = variant_df.index[
                    genotype_range[0]:genotype_range[-1] + 1]
                # assuming that the TSS for all grouped phenotypes is the same
                tss_distance = np.int32(
                    variant_df['pos'].
                    values[genotype_range[0]:genotype_range[-1] + 1] -
                    igc.phenotype_tss[phenotype_ids[0]])

                if interaction_s is not None:
                    genotypes_t, mask_t = filter_maf_interaction(
                        genotypes_t,
                        interaction_mask_t=interaction_mask_t,
                        maf_threshold_interaction=maf_threshold_interaction)
                    mask = mask_t.cpu().numpy()
                    variant_ids = variant_ids[mask]
                    tss_distance = tss_distance[mask]

                n = len(variant_ids)

                if genotypes_t.shape[0] > 0:
                    # process first phenotype in group
                    phenotype_id = phenotype_ids[0]
                    phenotype_t = torch.tensor(phenotypes[0],
                                               dtype=torch.float).to(device)

                    if interaction_s is None:
                        res = calculate_cis_nominal(genotypes_t, phenotype_t,
                                                    residualizer)
                        tstat, slope, slope_se, maf, ma_samples, ma_count = [
                            i.cpu().numpy() for i in res
                        ]
                    else:
                        res = calculate_interaction_nominal(
                            genotypes_t,
                            phenotype_t.unsqueeze(0),
                            interaction_t,
                            residualizer,
                            return_sparse=False)
                        tstat, b, b_se, maf, ma_samples, ma_count = [
                            i.cpu().numpy() for i in res
                        ]
                    px = [phenotype_id] * n

                    # iterate over remaining phenotypes in group
                    for phenotype, phenotype_id in zip(phenotypes[1:],
                                                       phenotype_ids[1:]):
                        phenotype_t = torch.tensor(
                            phenotype, dtype=torch.float).to(device)
                        if interaction_s is None:
                            res = calculate_cis_nominal(
                                genotypes_t, phenotype_t, residualizer)
                            tstat0, slope0, slope_se0, maf, ma_samples, ma_count = [
                                i.cpu().numpy() for i in res
                            ]
                        else:
                            res = calculate_interaction_nominal(
                                genotypes_t,
                                phenotype_t.unsqueeze(0),
                                interaction_t,
                                residualizer,
                                return_sparse=False)
                            tstat0, b0, b_se0, maf, ma_samples, ma_count = [
                                i.cpu().numpy() for i in res
                            ]

                        # find associations that are stronger for current phenotype
                        if interaction_s is None:
                            ix = np.where(np.abs(tstat0) > np.abs(tstat))[0]
                        else:
                            ix = np.where(
                                np.abs(tstat0[:, 2]) > np.abs(tstat[:, 2]))[0]

                        # update relevant positions
                        for j in ix:
                            px[j] = phenotype_id
                        if interaction_s is None:
                            tstat[ix] = tstat0[ix]
                            slope[ix] = slope0[ix]
                            slope_se[ix] = slope_se0[ix]
                        else:
                            tstat[ix] = tstat0[ix]
                            b[ix] = b0[ix]
                            b_se[ix] = b_se0[ix]

                    chr_res['phenotype_id'].extend(px)
                    chr_res['variant_id'].extend(variant_ids)
                    chr_res['tss_distance'][start:start + n] = tss_distance
                    chr_res['maf'][start:start + n] = maf
                    chr_res['ma_samples'][start:start + n] = ma_samples
                    chr_res['ma_count'][start:start + n] = ma_count
                    if interaction_s is None:
                        chr_res['pval_nominal'][start:start + n] = tstat
                        chr_res['slope'][start:start + n] = slope
                        chr_res['slope_se'][start:start + n] = slope_se
                    else:
                        chr_res['pval_g'][start:start + n] = tstat[:, 0]
                        chr_res['b_g'][start:start + n] = b[:, 0]
                        chr_res['b_g_se'][start:start + n] = b_se[:, 0]
                        chr_res['pval_i'][start:start + n] = tstat[:, 1]
                        chr_res['b_i'][start:start + n] = b[:, 1]
                        chr_res['b_i_se'][start:start + n] = b_se[:, 1]
                        chr_res['pval_gi'][start:start + n] = tstat[:, 2]
                        chr_res['b_gi'][start:start + n] = b[:, 2]
                        chr_res['b_gi_se'][start:start + n] = b_se[:, 2]

                    # top association for the group
                    if interaction_s is not None:
                        ix = np.nanargmax(np.abs(tstat[:, 2]))
                        top_s = pd.Series([
                            chr_res['phenotype_id'][start:start + n][ix],
                            variant_ids[ix], tss_distance[ix], maf[ix],
                            ma_samples[ix], ma_count[ix], tstat[ix, 0],
                            b[ix, 0], b_se[ix, 0], tstat[ix, 1], b[ix, 1],
                            b_se[ix, 1], tstat[ix, 2], b[ix, 2], b_se[ix, 2]
                        ],
                                          index=chr_res.keys())
                        top_s['num_phenotypes'] = len(phenotype_ids)
                        if run_eigenmt:  # compute eigenMT correction
                            top_s['tests_emt'] = eigenmt.compute_tests(
                                genotypes_t,
                                var_thresh=0.99,
                                variant_window=200)
                        best_assoc.append(top_s)

                start += n  # update pointer

        logger.write('    time elapsed: {:.2f} min'.format(
            (time.time() - start_time) / 60))

        # convert to dataframe, compute p-values and write current chromosome
        if start < len(chr_res['maf']):
            for x in chr_res:
                chr_res[x] = chr_res[x][:start]

        if write_stats:
            chr_res_df = pd.DataFrame(chr_res)
            if interaction_s is None:
                m = chr_res_df['pval_nominal'].notnull()
                chr_res_df.loc[m, 'pval_nominal'] = 2 * stats.t.cdf(
                    -chr_res_df.loc[m, 'pval_nominal'].abs(), dof)
            else:
                m = chr_res_df['pval_gi'].notnull()
                chr_res_df.loc[m, 'pval_g'] = 2 * stats.t.cdf(
                    -chr_res_df.loc[m, 'pval_g'].abs(), dof)
                chr_res_df.loc[m, 'pval_i'] = 2 * stats.t.cdf(
                    -chr_res_df.loc[m, 'pval_i'].abs(), dof)
                chr_res_df.loc[m, 'pval_gi'] = 2 * stats.t.cdf(
                    -chr_res_df.loc[m, 'pval_gi'].abs(), dof)
            print('    * writing output')
            chr_res_df.to_parquet(
                os.path.join(
                    output_dir,
                    '{}.cis_qtl_pairs.{}.parquet'.format(prefix, chrom)))

    if interaction_s is not None:
        best_assoc = pd.concat(
            best_assoc, axis=1,
            sort=False).T.set_index('phenotype_id').infer_objects()
        m = best_assoc['pval_g'].notnull()
        best_assoc.loc[m, 'pval_g'] = 2 * stats.t.cdf(
            -best_assoc.loc[m, 'pval_g'].abs(), dof)
        best_assoc.loc[m, 'pval_i'] = 2 * stats.t.cdf(
            -best_assoc.loc[m, 'pval_i'].abs(), dof)
        best_assoc.loc[m, 'pval_gi'] = 2 * stats.t.cdf(
            -best_assoc.loc[m, 'pval_gi'].abs(), dof)
        if run_eigenmt:
            if group_s is None:
                best_assoc['pval_emt'] = np.minimum(
                    best_assoc['tests_emt'] * best_assoc['pval_gi'], 1)
            else:
                best_assoc['pval_emt'] = np.minimum(
                    best_assoc['num_phenotypes'] * best_assoc['tests_emt'] *
                    best_assoc['pval_gi'], 1)
            best_assoc['pval_adj_bh'] = eigenmt.padjust_bh(
                best_assoc['pval_emt'])
        if write_top:
            best_assoc.to_csv(os.path.join(
                output_dir, '{}.cis_qtl_top_assoc.txt.gz'.format(prefix)),
                              sep='\t',
                              float_format='%.6g')
        else:
            return best_assoc
    logger.write('done.')
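A minimal call sketch for this function, assuming the five input tables are already loaded as pandas DataFrames and aligned (genotype_df columns and covariates_df index must match the phenotype sample columns); the prefix and output directory are hypothetical.

# genotype_df: variants x samples; variant_df: indexed by variant_id, with a 'pos' column;
# phenotype_df: phenotypes x samples; phenotype_pos_df: indexed by phenotype_id, with chromosome and TSS;
# covariates_df: samples x covariates.
map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df,
            prefix='study1', window=1000000, output_dir='results')
# writes results/study1.cis_qtl_pairs.<chr>.parquet, one file per chromosome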
Example #2
def map_cis(genotype_df,
            variant_df,
            phenotype_df,
            phenotype_pos_df,
            covariates_df,
            group_s=None,
            beta_approx=True,
            nperm=10000,
            window=1000000,
            logger=None,
            seed=None,
            verbose=True):
    """Run cis-QTL mapping"""

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    assert np.all(phenotype_df.columns == covariates_df.index)
    if logger is None:
        logger = SimpleLogger()

    logger.write('cis-QTL mapping: empirical p-values for phenotypes')
    logger.write('  * {} samples'.format(phenotype_df.shape[1]))
    logger.write('  * {} phenotypes'.format(phenotype_df.shape[0]))
    if group_s is not None:
        logger.write('  * {} phenotype groups'.format(len(group_s.unique())))
        group_dict = group_s.to_dict()
    logger.write('  * {} covariates'.format(covariates_df.shape[1]))
    logger.write('  * {} variants'.format(genotype_df.shape[0]))

    covariates_t = torch.tensor(covariates_df.values,
                                dtype=torch.float32).to(device)
    residualizer = Residualizer(covariates_t)
    del covariates_t

    genotype_ix = np.array(
        [genotype_df.columns.tolist().index(i) for i in phenotype_df.columns])
    genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
    dof = phenotype_df.shape[1] - 2 - covariates_df.shape[1]

    # permutation indices
    n_samples = phenotype_df.shape[1]
    ix = np.arange(n_samples)
    if seed is not None:
        logger.write('  * using seed {}'.format(seed))
        np.random.seed(seed)
    permutation_ix_t = torch.LongTensor(
        np.array([np.random.permutation(ix) for i in range(nperm)])).to(device)

    res_df = []
    igc = genotypeio.InputGeneratorCis(genotype_df,
                                       variant_df,
                                       phenotype_df,
                                       phenotype_pos_df,
                                       group_s=group_s,
                                       window=window)
    start_time = time.time()
    logger.write('  * computing permutations')
    if group_s is None:
        for k, (phenotype, genotypes, genotype_range,
                phenotype_id) in enumerate(igc.generate_data(verbose=verbose),
                                           1):
            # copy genotypes to GPU
            genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
            genotypes_t = genotypes_t[:, genotype_ix_t]
            impute_mean(genotypes_t)
            phenotype_t = torch.tensor(phenotype, dtype=torch.float).to(device)

            res = calculate_cis_permutations(genotypes_t, phenotype_t,
                                             residualizer, permutation_ix_t)
            r_nominal, std_ratio, var_ix, r2_perm, g = [
                i.cpu().numpy() for i in res
            ]
            var_ix = genotype_range[var_ix]
            variant_id = variant_df.index[var_ix]
            tss_distance = variant_df['pos'].values[
                var_ix] - igc.phenotype_tss[phenotype_id]
            res_s = prepare_cis_output(r_nominal,
                                       r2_perm,
                                       std_ratio,
                                       g,
                                       genotypes.shape[0],
                                       dof,
                                       variant_id,
                                       tss_distance,
                                       phenotype_id,
                                       nperm=nperm)
            if beta_approx:
                res_s[[
                    'pval_beta', 'beta_shape1', 'beta_shape2', 'true_df',
                    'pval_true_df'
                ]] = calculate_beta_approx_pval(r2_perm, r_nominal * r_nominal,
                                                dof)
            res_df.append(res_s)
    else:
        for k, (phenotypes, genotypes, genotype_range, phenotype_ids,
                group_id) in enumerate(igc.generate_data(verbose=verbose), 1):
            # copy genotypes to GPU
            genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
            genotypes_t = genotypes_t[:, genotype_ix_t]
            impute_mean(genotypes_t)

            # iterate over phenotypes
            buf = []
            for phenotype, phenotype_id in zip(phenotypes, phenotype_ids):
                phenotype_t = torch.tensor(phenotype,
                                           dtype=torch.float).to(device)
                res = calculate_cis_permutations(genotypes_t, phenotype_t,
                                                 residualizer,
                                                 permutation_ix_t)
                res = [i.cpu().numpy() for i in res
                       ]  # r_nominal, std_ratio, var_ix, r2_perm, g
                res[2] = genotype_range[res[2]]
                buf.append(res + [genotypes.shape[0], phenotype_id])
            res_s = _process_group_permutations(
                buf,
                variant_df,
                igc.phenotype_tss[phenotype_ids[0]],
                dof,
                group_id,
                nperm=nperm)
            res_df.append(res_s)

    res_df = pd.concat(res_df, axis=1, sort=False).T
    res_df.index.name = 'phenotype_id'
    logger.write('  Time elapsed: {:.2f} min'.format(
        (time.time() - start_time) / 60))
    logger.write('done.')
    return res_df.astype(output_dtype_dict).infer_objects()
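A minimal call sketch, assuming the same aligned inputs as above; the seed is arbitrary. The returned per-phenotype DataFrame of empirical p-values is typically annotated with q-values before downstream use (the map_independent docstring in the next example refers to a calculate_qvalues helper for this).

cis_df = map_cis(genotype_df, variant_df, phenotype_df, phenotype_pos_df,
                 covariates_df, nperm=10000, seed=123456)
# cis_df is indexed by phenotype_id (or group) and includes columns such as
# pval_perm and pval_beta (see the cols list in map_independent below)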
Example #3
def map_independent(genotype_df,
                    variant_df,
                    cis_df,
                    phenotype_df,
                    phenotype_pos_df,
                    covariates_df,
                    group_s=None,
                    fdr=0.05,
                    fdr_col='qval',
                    nperm=10000,
                    window=1000000,
                    logger=None,
                    seed=None,
                    verbose=True):
    """
    Run independent cis-QTL mapping (forward-backward regression)

    cis_df: output from map_cis, annotated with q-values (calculate_qvalues)
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    assert np.all(phenotype_df.index == phenotype_pos_df.index)
    assert np.all(covariates_df.index == phenotype_df.columns)
    if logger is None:
        logger = SimpleLogger()

    signif_df = cis_df[cis_df[fdr_col] <= fdr].copy()
    cols = [
        'num_var',
        'beta_shape1',
        'beta_shape2',
        'true_df',
        'pval_true_df',
        'variant_id',
        'tss_distance',
        'ma_samples',
        'ma_count',
        'maf',
        'ref_factor',
        'pval_nominal',
        'slope',
        'slope_se',
        'pval_perm',
        'pval_beta',
    ]
    if group_s is not None:
        cols += ['group_id', 'group_size']
    signif_df = signif_df[cols]
    signif_threshold = signif_df['pval_beta'].max()
    # subset significant phenotypes
    if group_s is None:
        ix = phenotype_df.index[phenotype_df.index.isin(signif_df.index)]
    else:
        ix = group_s[phenotype_df.index].loc[group_s[phenotype_df.index].isin(
            signif_df['group_id'])].index

    logger.write('cis-QTL mapping: conditionally independent variants')
    logger.write('  * {} samples'.format(phenotype_df.shape[1]))
    if group_s is None:
        logger.write('  * {}/{} significant phenotypes'.format(
            signif_df.shape[0], cis_df.shape[0]))
    else:
        logger.write('  * {}/{} significant groups'.format(
            signif_df.shape[0], cis_df.shape[0]))
        logger.write('    {}/{} phenotypes'.format(len(ix),
                                                   phenotype_df.shape[0]))
        group_dict = group_s.to_dict()
    logger.write('  * {} covariates'.format(covariates_df.shape[1]))
    logger.write('  * {} variants'.format(genotype_df.shape[0]))
    # print('Significance threshold: {}'.format(signif_threshold))
    phenotype_df = phenotype_df.loc[ix]
    phenotype_pos_df = phenotype_pos_df.loc[ix]

    genotype_ix = np.array(
        [genotype_df.columns.tolist().index(i) for i in phenotype_df.columns])
    genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
    dof = phenotype_df.shape[1] - 2 - covariates_df.shape[1]
    ix_dict = {i: k for k, i in enumerate(genotype_df.index)}

    # permutation indices
    n_samples = phenotype_df.shape[1]
    ix = np.arange(n_samples)
    if seed is not None:
        logger.write('  * using seed {}'.format(seed))
        np.random.seed(seed)
    permutation_ix_t = torch.LongTensor(
        np.array([np.random.permutation(ix) for i in range(nperm)])).to(device)

    res_df = []
    igc = genotypeio.InputGeneratorCis(genotype_df,
                                       variant_df,
                                       phenotype_df,
                                       phenotype_pos_df,
                                       group_s=group_s,
                                       window=window)
    logger.write('  * computing independent QTLs')
    start_time = time.time()
    if group_s is None:
        for k, (phenotype, genotypes, genotype_range,
                phenotype_id) in enumerate(igc.generate_data(verbose=verbose),
                                           1):
            # copy genotypes to GPU
            phenotype_t = torch.tensor(phenotype, dtype=torch.float).to(device)
            genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
            genotypes_t = genotypes_t[:, genotype_ix_t]
            impute_mean(genotypes_t)

            # 1) forward pass
            forward_df = [signif_df.loc[phenotype_id]
                          ]  # initialize results with top variant
            covariates = covariates_df.values.copy()  # initialize covariates
            dosage_dict = {}
            while True:
                # add variant to covariates
                variant_id = forward_df[-1]['variant_id']
                ig = genotype_df.values[ix_dict[variant_id], genotype_ix]
                dosage_dict[variant_id] = ig
                covariates = np.hstack([covariates,
                                        ig.reshape(-1, 1)]).astype(np.float32)
                dof = phenotype_df.shape[1] - 2 - covariates.shape[1]
                covariates_t = torch.tensor(covariates,
                                            dtype=torch.float32).to(device)
                residualizer = Residualizer(covariates_t)
                del covariates_t

                res = calculate_cis_permutations(genotypes_t, phenotype_t,
                                                 residualizer,
                                                 permutation_ix_t)
                r_nominal, std_ratio, var_ix, r2_perm, g = [
                    i.cpu().numpy() for i in res
                ]
                x = calculate_beta_approx_pval(r2_perm, r_nominal * r_nominal,
                                               dof)
                # add to list if empirical p-value passes significance threshold
                if x[0] <= signif_threshold:
                    var_ix = genotype_range[var_ix]
                    variant_id = variant_df.index[var_ix]
                    tss_distance = variant_df['pos'].values[
                        var_ix] - igc.phenotype_tss[phenotype_id]
                    res_s = prepare_cis_output(r_nominal,
                                               r2_perm,
                                               std_ratio,
                                               g,
                                               genotypes.shape[0],
                                               dof,
                                               variant_id,
                                               tss_distance,
                                               phenotype_id,
                                               nperm=nperm)
                    res_s[[
                        'pval_beta', 'beta_shape1', 'beta_shape2', 'true_df',
                        'pval_true_df'
                    ]] = x
                    forward_df.append(res_s)
                else:
                    break
            forward_df = pd.concat(forward_df, axis=1, sort=False).T
            dosage_df = pd.DataFrame(dosage_dict)

            # 2) backward pass
            if forward_df.shape[0] > 1:
                back_df = []
                variant_set = set()
                for k, i in enumerate(forward_df['variant_id'], 1):
                    covariates = np.hstack([
                        covariates_df.values,
                        dosage_df[np.setdiff1d(forward_df['variant_id'],
                                               i)].values,
                    ])
                    dof = phenotype_df.shape[1] - 2 - covariates.shape[1]
                    covariates_t = torch.tensor(covariates,
                                                dtype=torch.float32).to(device)
                    residualizer = Residualizer(covariates_t)
                    del covariates_t

                    res = calculate_cis_permutations(genotypes_t, phenotype_t,
                                                     residualizer,
                                                     permutation_ix_t)
                    r_nominal, std_ratio, var_ix, r2_perm, g = [
                        i.cpu().numpy() for i in res
                    ]
                    var_ix = genotype_range[var_ix]
                    variant_id = variant_df.index[var_ix]
                    x = calculate_beta_approx_pval(r2_perm,
                                                   r_nominal * r_nominal, dof)
                    if x[0] <= signif_threshold and variant_id not in variant_set:
                        tss_distance = variant_df['pos'].values[
                            var_ix] - igc.phenotype_tss[phenotype_id]
                        res_s = prepare_cis_output(r_nominal,
                                                   r2_perm,
                                                   std_ratio,
                                                   g,
                                                   genotypes.shape[0],
                                                   dof,
                                                   variant_id,
                                                   tss_distance,
                                                   phenotype_id,
                                                   nperm=nperm)
                        res_s[[
                            'pval_beta', 'beta_shape1', 'beta_shape2',
                            'true_df', 'pval_true_df'
                        ]] = x
                        res_s['rank'] = k
                        back_df.append(res_s)
                        variant_set.add(variant_id)
                if len(back_df) > 0:
                    res_df.append(pd.concat(back_df, axis=1, sort=False).T)
            else:  # single independent variant
                forward_df['rank'] = 1
                res_df.append(forward_df)

    else:  # grouped phenotypes
        for k, (phenotypes, genotypes, genotype_range, phenotype_ids,
                group_id) in enumerate(igc.generate_data(verbose=verbose), 1):
            # copy genotypes to GPU
            genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
            genotypes_t = genotypes_t[:, genotype_ix_t]
            impute_mean(genotypes_t)

            # 1) forward pass
            forward_df = [
                signif_df[signif_df['group_id'] == group_id].iloc[0]
            ]  # initialize results with top variant
            covariates = covariates_df.values.copy()  # initialize covariates
            dosage_dict = {}
            while True:
                # add variant to covariates
                variant_id = forward_df[-1]['variant_id']
                ig = genotype_df.values[ix_dict[variant_id], genotype_ix]
                dosage_dict[variant_id] = ig
                covariates = np.hstack([covariates,
                                        ig.reshape(-1, 1)]).astype(np.float32)
                dof = phenotype_df.shape[1] - 2 - covariates.shape[1]
                covariates_t = torch.tensor(covariates,
                                            dtype=torch.float32).to(device)
                residualizer = Residualizer(covariates_t)
                del covariates_t

                # iterate over phenotypes
                buf = []
                for phenotype, phenotype_id in zip(phenotypes, phenotype_ids):
                    phenotype_t = torch.tensor(phenotype,
                                               dtype=torch.float).to(device)
                    res = calculate_cis_permutations(genotypes_t, phenotype_t,
                                                     residualizer,
                                                     permutation_ix_t)
                    res = [i.cpu().numpy() for i in res
                           ]  # r_nominal, std_ratio, var_ix, r2_perm, g
                    res[2] = genotype_range[res[2]]
                    buf.append(res + [genotypes.shape[0], phenotype_id])
                res_s = _process_group_permutations(
                    buf,
                    variant_df,
                    igc.phenotype_tss[phenotype_ids[0]],
                    dof,
                    group_id,
                    nperm=nperm)

                # add to list if significant
                if res_s['pval_beta'] <= signif_threshold:
                    forward_df.append(res_s)
                else:
                    break
            forward_df = pd.concat(forward_df, axis=1, sort=False).T
            dosage_df = pd.DataFrame(dosage_dict)

            # 2) backward pass
            if forward_df.shape[0] > 1:
                back_df = []
                variant_set = set()
                for k, variant_id in enumerate(forward_df['variant_id'], 1):
                    covariates = np.hstack([
                        covariates_df.values,
                        dosage_df[np.setdiff1d(forward_df['variant_id'],
                                               variant_id)].values,
                    ])
                    dof = phenotype_df.shape[1] - 2 - covariates.shape[1]
                    covariates_t = torch.tensor(covariates,
                                                dtype=torch.float32).to(device)
                    residualizer = Residualizer(covariates_t)
                    del covariates_t

                    # iterate over phenotypes
                    buf = []
                    for phenotype, phenotype_id in zip(phenotypes,
                                                       phenotype_ids):
                        phenotype_t = torch.tensor(
                            phenotype, dtype=torch.float).to(device)
                        res = calculate_cis_permutations(
                            genotypes_t, phenotype_t, residualizer,
                            permutation_ix_t)
                        res = [i.cpu().numpy() for i in res
                               ]  # r_nominal, std_ratio, var_ix, r2_perm, g
                        res[2] = genotype_range[res[2]]
                        buf.append(res + [genotypes.shape[0], phenotype_id])
                    res_s = _process_group_permutations(
                        buf,
                        variant_df,
                        igc.phenotype_tss[phenotype_ids[0]],
                        dof,
                        group_id,
                        nperm=nperm)

                    if (res_s['pval_beta'] <= signif_threshold
                            and variant_id not in variant_set):
                        res_s['rank'] = k
                        back_df.append(res_s)
                        variant_set.add(variant_id)
                if len(back_df) > 0:
                    res_df.append(pd.concat(back_df, axis=1, sort=False).T)
            else:  # single independent variant
                forward_df['rank'] = 1
                res_df.append(forward_df)

    res_df = pd.concat(res_df, axis=0, sort=False)
    res_df.index.name = 'phenotype_id'
    logger.write('  Time elapsed: {:.2f} min'.format(
        (time.time() - start_time) / 60))
    logger.write('done.')
    return res_df.reset_index().astype(output_dtype_dict)
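A minimal call sketch. cis_df is the output of map_cis from the previous example, annotated with q-values as required by the docstring; the fdr_col column must be present before calling.

# cis_df must carry the fdr_col column (default 'qval'), e.g. added with the package's
# calculate_qvalues helper referenced in the docstring (not shown in these excerpts).
indep_df = map_independent(genotype_df, variant_df, cis_df,
                           phenotype_df, phenotype_pos_df, covariates_df,
                           fdr=0.05, nperm=10000, seed=123456)
# one row per conditionally independent signal, with a 'rank' column per phenotype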
Example #4
def map_nominal(genotype_df,
                variant_df,
                phenotype_df,
                phenotype_pos_df,
                covariates_df,
                prefix,
                interaction_s=None,
                maf_threshold_interaction=0.05,
                group_s=None,
                window=1000000,
                run_eigenmt=False,
                output_dir='.',
                logger=None,
                verbose=True):
    """
    cis-QTL mapping: nominal associations for all variant-phenotype pairs

    Association results for each chromosome are written to parquet files
    in the format <output_dir>/<prefix>.cis_qtl_pairs.<chr>.parquet
    """
    assert np.all(phenotype_df.columns == covariates_df.index)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if logger is None:
        logger = SimpleLogger()
    if group_s is not None:
        group_dict = group_s.to_dict()

    logger.write(
        'cis-QTL mapping: nominal associations for all variant-phenotype pairs'
    )
    logger.write('  * {} samples'.format(phenotype_df.shape[1]))
    logger.write('  * {} phenotypes'.format(phenotype_df.shape[0]))
    logger.write('  * {} covariates'.format(covariates_df.shape[1]))
    logger.write('  * {} variants'.format(variant_df.shape[0]))
    if interaction_s is not None:
        assert np.all(interaction_s.index == covariates_df.index)
        logger.write('  * including interaction term')
        if maf_threshold_interaction > 0:
            logger.write('    * using {:.2f} MAF threshold'.format(
                maf_threshold_interaction))

    covariates_t = torch.tensor(covariates_df.values,
                                dtype=torch.float32).to(device)
    residualizer = Residualizer(covariates_t)
    del covariates_t

    genotype_ix = np.array(
        [genotype_df.columns.tolist().index(i) for i in phenotype_df.columns])
    genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
    if interaction_s is None:
        dof = phenotype_df.shape[1] - 2 - covariates_df.shape[1]
    else:
        dof = phenotype_df.shape[1] - 4 - covariates_df.shape[1]
        interaction_t = torch.tensor(interaction_s.values.reshape(1, -1),
                                     dtype=torch.float32).to(device)
        if maf_threshold_interaction > 0:
            interaction_mask_t = torch.BoolTensor(
                interaction_s >= interaction_s.median()).to(device)
        else:
            interaction_mask_t = None

    igc = genotypeio.InputGeneratorCis(genotype_df,
                                       variant_df,
                                       phenotype_df,
                                       phenotype_pos_df,
                                       window=window)
    # iterate over chromosomes
    best_assoc = []
    start_time = time.time()
    prev_phenotype_id = None
    k = 0
    logger.write('  * Computing associations')
    for chrom in igc.chrs:
        logger.write('    Mapping chromosome {}'.format(chrom))
        chr_res_df = []
        for k, (phenotype, genotypes, genotype_range,
                phenotype_id) in enumerate(
                    igc.generate_data(chrom=chrom, verbose=verbose), k + 1):
            # copy genotypes to GPU
            phenotype_t = torch.tensor(phenotype, dtype=torch.float).to(device)
            genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
            genotypes_t = genotypes_t[:, genotype_ix_t]
            impute_mean(genotypes_t)

            if interaction_s is None:
                res = calculate_cis_nominal(genotypes_t, phenotype_t,
                                            residualizer)
                tstat, slope, slope_se, maf, ma_samples, ma_count = [
                    i.cpu().numpy() for i in res
                ]
                variant_ids = variant_df.index[
                    genotype_range[0]:genotype_range[-1] + 1]
                tss_distance = np.int32(
                    variant_df['pos'].
                    values[genotype_range[0]:genotype_range[-1] + 1] -
                    igc.phenotype_tss[phenotype_id])
                res_df = pd.DataFrame(
                    OrderedDict([
                        ('phenotype_id', [phenotype_id] * len(variant_ids)),
                        ('variant_id', variant_ids),
                        ('tss_distance', tss_distance),
                        ('maf', maf),
                        ('ma_samples', ma_samples),
                        ('ma_count', ma_count),
                        ('pval_nominal', tstat),  # t-statistics; converted to p-values (on CPU) below
                        ('slope', slope),
                        ('slope_se', slope_se),
                    ]))
            else:
                genotypes_t, mask_t = filter_maf_interaction(
                    genotypes_t,
                    interaction_mask_t=interaction_mask_t,
                    maf_threshold_interaction=maf_threshold_interaction)
                if genotypes_t.shape[0] > 0:
                    res = calculate_interaction_nominal(
                        genotypes_t,
                        phenotype_t.unsqueeze(0),
                        interaction_t,
                        residualizer,
                        return_sparse=False)

                    if run_eigenmt:  # compute eigenMT correction
                        m_eff = eigenmt.compute_tests(genotypes_t,
                                                      var_thresh=0.99,
                                                      variant_window=200)

                    tstat, b, b_se, maf, ma_samples, ma_count = [
                        i.cpu().numpy() for i in res
                    ]
                    mask = mask_t.cpu().numpy()

                    r = igc.cis_ranges[phenotype_id]
                    variant_ids = variant_df.index[r[0]:r[-1] + 1]
                    tss_distance = np.int32(
                        variant_df['pos'].values[r[0]:r[-1] + 1] -
                        igc.phenotype_tss[phenotype_id])
                    variant_ids = variant_ids[mask]
                    tss_distance = tss_distance[mask]
                    nv = len(variant_ids)
                    res_df = pd.DataFrame(
                        OrderedDict([
                            ('phenotype_id', [phenotype_id] * nv),
                            ('variant_id', variant_ids),
                            ('tss_distance', tss_distance),
                            ('maf', maf),
                            ('ma_samples', ma_samples),
                            ('ma_count', ma_count),
                            ('pval_g', tstat[:, 0]),
                            ('b_g', b[:, 0]),
                            ('b_g_se', b_se[:, 0]),
                            ('pval_i', tstat[:, 1]),
                            ('b_i', b[:, 1]),
                            ('b_i_se', b_se[:, 1]),
                            ('pval_gi', tstat[:, 2]),
                            ('b_gi', b[:, 2]),
                            ('b_gi_se', b_se[:, 2]),
                        ]))

                    top_s = res_df.loc[res_df['pval_gi'].abs().idxmax()].copy()
                    if run_eigenmt:
                        top_s['tests_emt'] = m_eff

                    best_assoc.append(
                        top_s
                    )  # top variant only (pval_gi is t-statistic here, hence max)
                else:  # all genotypes in window were filtered out
                    res_df = None

            if group_s is not None and group_dict[
                    phenotype_id] == group_dict.get(prev_phenotype_id):
                # store the strongest association within each group
                if interaction_s is None:
                    ix = res_df['pval_nominal'] > chr_res_df[-1][
                        'pval_nominal']  # compare t-statistics
                else:
                    ix = res_df['pval_gi'] > chr_res_df[-1]['pval_gi']
                chr_res_df[-1].loc[ix] = res_df.loc[ix]
            else:
                chr_res_df.append(res_df)
            prev_phenotype_id = phenotype_id
        logger.write('    time elapsed: {:.2f} min'.format(
            (time.time() - start_time) / 60))

        # compute p-values and write current chromosome
        chr_res_df = pd.concat(chr_res_df, copy=False)
        if interaction_s is None:
            m = chr_res_df['pval_nominal'].notnull()
            chr_res_df.loc[m, 'pval_nominal'] = 2 * stats.t.cdf(
                -chr_res_df.loc[m, 'pval_nominal'].abs(), dof)
        else:
            m = chr_res_df['pval_gi'].notnull()
            chr_res_df.loc[m, 'pval_g'] = 2 * stats.t.cdf(
                -chr_res_df.loc[m, 'pval_g'].abs(), dof)
            chr_res_df.loc[m, 'pval_i'] = 2 * stats.t.cdf(
                -chr_res_df.loc[m, 'pval_i'].abs(), dof)
            chr_res_df.loc[m, 'pval_gi'] = 2 * stats.t.cdf(
                -chr_res_df.loc[m, 'pval_gi'].abs(), dof)
        print('  * writing output')
        chr_res_df.to_parquet(
            os.path.join(output_dir,
                         '{}.cis_qtl_pairs.{}.parquet'.format(prefix, chrom)))

    if interaction_s is not None:
        best_assoc = pd.concat(
            best_assoc, axis=1,
            sort=False).T.set_index('phenotype_id').infer_objects()
        m = best_assoc['pval_g'].notnull()
        best_assoc.loc[m, 'pval_g'] = 2 * stats.t.cdf(
            -best_assoc.loc[m, 'pval_g'].abs(), dof)
        best_assoc.loc[m, 'pval_i'] = 2 * stats.t.cdf(
            -best_assoc.loc[m, 'pval_i'].abs(), dof)
        best_assoc.loc[m, 'pval_gi'] = 2 * stats.t.cdf(
            -best_assoc.loc[m, 'pval_gi'].abs(), dof)
        if run_eigenmt:
            best_assoc['pval_emt'] = np.minimum(
                best_assoc['tests_emt'] * best_assoc['pval_gi'], 1)
            best_assoc['pval_adj_bh'] = eigenmt.padjust_bh(
                best_assoc['pval_emt'])
        best_assoc.to_csv(os.path.join(
            output_dir, '{}.cis_qtl_top_assoc.txt.gz'.format(prefix)),
                          sep='\t',
                          float_format='%.6g')
    logger.write('done.')
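This appears to be an alternative version of map_nominal from Example #1 that assembles per-phenotype DataFrames instead of preallocated arrays; either way, the per-chromosome parquet output can be read back with pandas (the path below is hypothetical).

import pandas as pd

pairs_df = pd.read_parquet('results/study1.cis_qtl_pairs.chr18.parquet')  # hypothetical path
print(pairs_df.columns.tolist())  # pval_nominal/slope columns, or the pval_g/pval_i/pval_gi set if interaction_s was used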
Example #5
def run_eigenmt(genotype_df,
                variant_df,
                phenotype_df,
                phenotype_pos_df,
                interaction_s=None,
                maf_threshold=0,
                var_thresh=0.99,
                variant_window=200,
                window=1000000,
                verbose=True,
                logger=None):
    """Standalone function for computing eigenMT correction.

    Returns the number of tests for each gene
    """

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if logger is None:
        logger = SimpleLogger()

    logger.write(
        'eigenMT: estimating number of independent variants tested for each phenotype'
    )

    logger.write('cis-QTL mapping: empirical p-values for phenotypes')
    logger.write(f'  * {phenotype_df.shape[1]} samples')
    logger.write(f'  * {phenotype_df.shape[0]} phenotypes')
    logger.write(f'  * {genotype_df.shape[0]} variants')

    if interaction_s is not None and maf_threshold > 0:
        interaction_mask_t = torch.BoolTensor(
            interaction_s >= interaction_s.median()).to(device)
    else:
        interaction_mask_t = None

    genotype_ix = np.array(
        [genotype_df.columns.tolist().index(i) for i in phenotype_df.columns])
    genotype_ix_t = torch.from_numpy(genotype_ix).to(device)

    igc = genotypeio.InputGeneratorCis(genotype_df,
                                       variant_df,
                                       phenotype_df,
                                       phenotype_pos_df,
                                       window=window)
    start_time = time.time()
    m_eff = OrderedDict()
    for k, (phenotype, genotypes, genotype_range,
            phenotype_id) in enumerate(igc.generate_data(verbose=verbose), 1):

        # copy genotypes to GPU
        genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
        genotypes_t = genotypes_t[:, genotype_ix_t]
        impute_mean(genotypes_t)

        if interaction_s is None:
            mask_t = calculate_maf(genotypes_t) >= maf_threshold
            genotypes_t = genotypes_t[mask_t]
        else:
            genotypes_t, mask_t = filter_maf_interaction(
                genotypes_t,
                interaction_mask_t=interaction_mask_t,
                maf_threshold_interaction=maf_threshold)

        m_eff[phenotype_id] = compute_tests(genotypes_t,
                                            var_thresh=var_thresh,
                                            variant_window=variant_window)

    logger.write(f'    time elapsed: {(time.time()-start_time)/60:.2f} min')
    return pd.Series(m_eff)
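A minimal call sketch; the returned Series gives the estimated effective number of independent tests per phenotype. The interaction branch of map_nominal above applies the same correction by multiplying the top p-value by this count and capping at 1.

tests_s = run_eigenmt(genotype_df, variant_df, phenotype_df, phenotype_pos_df)
# hypothetical follow-up, mirroring the correction in map_nominal:
# pval_emt_s = np.minimum(tests_s * top_pval_s, 1)  # top_pval_s: per-phenotype top p-values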
Example #6
def run_pairs(genotype_df,
              variant_df,
              phenotype1_df,
              phenotype2_df,
              phenotype_pos_df,
              covariates1_df=None,
              covariates2_df=None,
              p1=1e-4,
              p2=1e-4,
              p12=1e-5,
              mode='beta',
              maf_threshold=0,
              window=1000000,
              batch_size=10000,
              logger=None,
              verbose=True):
    """Compute COLOC for all phenotype pairs"""

    assert np.all(phenotype1_df.index == phenotype2_df.index)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if logger is None:
        logger = SimpleLogger()

    logger.write('Computing COLOC for all pairs of phenotypes')
    logger.write(f'  * {phenotype1_df.shape[0]} phenotypes')
    logger.write(f'  * phenotype group 1: {phenotype1_df.shape[1]} samples')
    logger.write(f'  * phenotype group 2: {phenotype2_df.shape[1]} samples')

    if covariates1_df is not None:
        assert np.all(phenotype1_df.columns == covariates1_df.index)
        logger.write(
            f'  * phenotype group 1: {covariates1_df.shape[1]} covariates')
        residualizer1 = Residualizer(
            torch.tensor(covariates1_df.values,
                         dtype=torch.float32).to(device))
    else:
        residualizer1 = None

    if covariates2_df is not None:
        assert np.all(phenotype2_df.columns == covariates2_df.index)
        logger.write(
            f'  * phenotype group 2: {covariates2_df.shape[1]} covariates')
        residualizer2 = Residualizer(
            torch.tensor(covariates2_df.values,
                         dtype=torch.float32).to(device))
    else:
        residualizer2 = None

    if maf_threshold > 0:
        logger.write(
            f'  * applying in-sample {maf_threshold} MAF filter (in at least one cohort)'
        )

    genotype1_ix = np.array(
        [genotype_df.columns.tolist().index(i) for i in phenotype1_df.columns])
    genotype1_ix_t = torch.from_numpy(genotype1_ix).to(device)
    genotype2_ix = np.array(
        [genotype_df.columns.tolist().index(i) for i in phenotype2_df.columns])
    genotype2_ix_t = torch.from_numpy(genotype2_ix).to(device)

    igc = genotypeio.InputGeneratorCis(genotype_df,
                                       variant_df,
                                       phenotype1_df,
                                       phenotype_pos_df,
                                       window=window)
    coloc_df = []
    start_time = time.time()
    logger.write('  * Computing pairwise colocalization')
    for phenotype1, genotypes, genotype_range, phenotype_id in igc.generate_data(
            verbose=verbose):
        phenotype2 = phenotype2_df.loc[phenotype_id]

        # copy to GPU
        phenotype1_t = torch.tensor(phenotype1, dtype=torch.float).to(device)
        phenotype2_t = torch.tensor(phenotype2, dtype=torch.float).to(device)
        genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
        genotypes1_t = genotypes_t[:, genotype1_ix_t]
        genotypes2_t = genotypes_t[:, genotype2_ix_t]
        del genotypes_t

        # filter monomorphic sites
        m = ((genotypes1_t == 0).all(1) | (genotypes1_t == 1).all(1) |
             (genotypes1_t == 2).all(1) | (genotypes2_t == 0).all(1) |
             (genotypes2_t == 1).all(1) | (genotypes2_t == 2).all(1))
        genotypes1_t = genotypes1_t[~m]
        genotypes2_t = genotypes2_t[~m]
        impute_mean(genotypes1_t)
        impute_mean(genotypes2_t)

        if maf_threshold > 0:
            maf1_t = calculate_maf(genotypes1_t)
            maf2_t = calculate_maf(genotypes2_t)
            mask_t = (maf1_t >= maf_threshold) | (maf2_t >= maf_threshold)
            genotypes1_t = genotypes1_t[mask_t]
            genotypes2_t = genotypes2_t[mask_t]

        coloc_t = coloc(genotypes1_t,
                        genotypes2_t,
                        phenotype1_t,
                        phenotype2_t,
                        residualizer1=residualizer1,
                        residualizer2=residualizer2,
                        p1=p1,
                        p2=p2,
                        p12=p12,
                        mode=mode)
        coloc_df.append(coloc_t.cpu().numpy())
    logger.write('    time elapsed: {:.2f} min'.format(
        (time.time() - start_time) / 60))
    coloc_df = pd.DataFrame(coloc_df,
                            columns=[f'pp_h{i}_abf' for i in range(5)],
                            index=phenotype1_df.index)
    logger.write('done.')
    return coloc_df
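
The returned coloc_df has one row per phenotype and the five COLOC posterior probabilities (pp_h0_abf through pp_h4_abf) as columns. A minimal downstream sketch for flagging phenotypes with strong evidence of a shared causal variant; the helper name and the 0.8 cutoff are illustrative and not part of the original code:

import pandas as pd

def flag_colocalized(coloc_df: pd.DataFrame, pp_h4_cutoff: float = 0.8) -> pd.DataFrame:
    # Keep phenotypes whose posterior probability of a shared causal variant
    # (pp_h4_abf) exceeds the cutoff, sorted by strength of evidence.
    hits = coloc_df[coloc_df['pp_h4_abf'] >= pp_h4_cutoff]
    return hits.sort_values('pp_h4_abf', ascending=False)
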
Example #7
def map_cis(genotype_df, variant_df, phenotype_df, phenotype_pos_df,
            covariates_df, mapper, prefix, 
            window=1000000, output_dir='.', 
            logger=None, verbose=True, interaction=False, kwargs={}, kwargs_interaction={},
            num_of_permutation=None, permutation_chunk_size=10):
    '''
    Wrapper for cis-QTL mapping.
    The QTL caller `mapper` must implement:
    * mapper.init(phenotypes, covariates)
    * mapper.map_one(genotypes, phenotype_idx)
    If interaction is True, it must also implement:
    * mapper.map_one_multi_x(X, phenotype_idx), with X generated as
      kwargs_interaction['transform_fun'](kwargs['design_matrix'] @ genotype, **kwargs_interaction['transform_fun_args'])
    mapper.map_one should return (bhat, pval); a minimal mapper sketch follows this function.
    '''
    
    assert np.all(phenotype_df.columns==covariates_df.index)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if logger is None:
        logger = SimpleLogger()

    logger.write('cis-QTL mapping')
    logger.write('  * {} samples'.format(phenotype_df.shape[1]))
    logger.write('  * {} phenotypes'.format(phenotype_df.shape[0]))
    logger.write('  * {} covariates'.format(covariates_df.shape[1]))
    logger.write('  * {} variants'.format(variant_df.shape[0]))
    

    covariates_t = torch.tensor(covariates_df.values, dtype=torch.float32).to(device)
    phenotypes_t = torch.tensor(phenotype_df.values, dtype=torch.float32).to(device)
    
    # FIXME: this is not ideal, since we may initialize phenotypes that have no cis genotypes.
    # This is not handled inside the caller yet, so make sure such phenotypes
    # are excluded from the input.
    ## mapper call
    mapper.init(phenotypes_t.T, covariates_t, **kwargs)
    phenotype_names = phenotype_df.index.to_list()

    
    igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, group_s=None, window=window)
    # iterate over chromosomes
    best_assoc = []
    start_time = time.time()
    k = 0
    logger.write('  * Computing associations')
    for chrom in igc.chrs:
        logger.write('    Mapping chromosome {}'.format(chrom))
        # allocate arrays
        n = 0
        for i in igc.phenotype_pos_df[igc.phenotype_pos_df['chr']==chrom].index:
            j = igc.cis_ranges[i]
            n += j[1] - j[0] + 1
        
        chr_res = OrderedDict()
        chr_res['phenotype_id'] = []
        chr_res['variant_id'] = []
        chr_res['tss_distance'] = np.empty(n, dtype=np.int32)
        chr_res['pval'] = np.empty(n, dtype=np.float64)
        chr_res['b'] = np.empty(n, dtype=np.float32)
        
        if num_of_permutation is not None:
            chr_res['pval_permutation'] = np.empty(n, dtype=np.float64)
        
        start = 0
        
        for k, (phenotype, genotypes, genotype_range, phenotype_id) in enumerate(igc.generate_data(chrom=chrom, verbose=verbose), k+1):
            # copy genotypes to GPU
            genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
            impute_mean(genotypes_t)

            variant_ids = variant_df.index[genotype_range[0]:genotype_range[-1]+1]
            
            n = len(variant_ids)
            if n <= 0:
                continue
            
            tss_distance = np.int32(variant_df['pos'].values[genotype_range[0]:genotype_range[-1]+1] - igc.phenotype_tss[phenotype_id])
            
            phenotype_idx = name_to_index(phenotype_names, phenotype_id)

            ## mapper call
            if interaction is False:
                res_i = mapper.map_one(genotypes_t.T, phenotype_idx)
            elif interaction is True:
                X = kwargs_interaction['transform_fun'](torch.Tensor(kwargs['design_matrix']) @ genotypes_t.T, **kwargs_interaction['transform_fun_args'])
                res_i = mapper.map_one_multi_x(X, phenotype_idx)
            
                
            ## take care of permutation
            if num_of_permutation is not None:
                list_pval_perm = []
                permutor = Permutor(num_of_permutation, chunk_size=permutation_chunk_size)
                if interaction is False:
                    for x, nchunk in permutor.gen_permuted_columns(genotypes_t.T):
                        _, pval_perm = mapper.map_one(x, phenotype_idx)
                        list_pval_perm.append(permutor.rearrange(torch.Tensor(pval_perm), nchunk))
                elif interaction is True:
                    traveler = permutor.gen_permuted_columns_interaction(
                        torch.Tensor(kwargs['design_matrix']) @ genotypes_t.T,
                        kwargs_interaction['permutation']['transform_fun'],
                        kwargs_interaction['permutation']['transform_fun_args']
                    )
                    for x, nchunk in traveler: 
                        _, pval_perm = mapper.map_one_multi_x(x, phenotype_idx)
                        list_pval_perm.append(permutor.rearrange(torch.Tensor(pval_perm), nchunk))
                else:
                    raise ValueError(f'The argument "interaction" must be True or False (got {interaction}).')
                pval_from_permutation = permutor.add_permutation_pval(torch.Tensor(res_i[1]), torch.cat(list_pval_perm, axis=0))
        
            chr_res['phenotype_id'].extend([phenotype_id]*n)
            chr_res['variant_id'].extend(variant_ids)
            chr_res['tss_distance'][start:start+n] = tss_distance
            chr_res['pval'][start:start+n] = res_i[1]
            chr_res['b'][start:start+n] = res_i[0]
            if num_of_permutation is not None:
                chr_res['pval_permutation'][start:start+n] = pval_from_permutation
            start += n  # update pointer
        

        logger.write('    time elapsed: {:.2f} min'.format((time.time()-start_time)/60))

        # prepare output
        if start < len(chr_res['tss_distance']):
            for x in chr_res:
                chr_res[x] = chr_res[x][:start]

        chr_res_df = pd.DataFrame(chr_res)
            
        logger.write('    * writing output')
        chr_res_df.to_parquet(os.path.join(output_dir, '{}.cis_qtl_pairs.{}.parquet'.format(prefix, chrom)))

    
    logger.write('done.')
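
For reference, a minimal mapper satisfying the interface described in the docstring above might look as follows. OLSMapper is hypothetical (not part of the original code), covariate residualization is omitted for brevity, and it is only a sketch of the contract the wrapper relies on: init() receives samples x phenotypes and samples x covariates matrices, and map_one() receives a samples x variants matrix and returns (bhat, pval).

import numpy as np
import scipy.stats as stats
import torch

class OLSMapper:
    """Hypothetical minimal mapper: per-variant ordinary least squares.
    A real caller would residualize phenotypes and genotypes in init()."""
    def init(self, phenotypes_t, covariates_t, **kwargs):
        # phenotypes_t: samples x phenotypes; covariates_t: samples x covariates
        self.phenotypes_t = phenotypes_t
        self.dof = phenotypes_t.shape[0] - 2 - covariates_t.shape[1]

    def map_one(self, genotypes_t, phenotype_idx):
        # genotypes_t: samples x variants (the wrapper passes genotypes_t.T)
        y = self.phenotypes_t[:, phenotype_idx]
        g = genotypes_t - genotypes_t.mean(0)   # center each variant
        yc = y - y.mean()
        bhat = (g * yc[:, None]).sum(0) / (g ** 2).sum(0)
        resid = yc[:, None] - g * bhat
        se = torch.sqrt((resid ** 2).sum(0) / self.dof / (g ** 2).sum(0))
        tstat = (bhat / se).cpu().numpy()
        pval = 2 * stats.t.sf(np.abs(tstat), self.dof)  # two-sided t-test
        return bhat.cpu().numpy(), pval
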
Example #8
def map(genotype_df,
        variant_df,
        phenotype_df,
        phenotype_pos_df,
        covariates_df,
        L=10,
        scaled_prior_variance=0.2,
        estimate_residual_variance=True,
        estimate_prior_variance=True,
        tol=1e-3,
        coverage=0.95,
        min_abs_corr=0.5,
        summary_only=True,
        maf_threshold=0,
        max_iter=100,
        window=1000000,
        logger=None,
        verbose=True,
        warn_monomorphic=False):
    """
    SuSiE fine-mapping: computes SuSiE model for all phenotypes
    """
    assert np.all(phenotype_df.columns == covariates_df.index)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if logger is None:
        logger = SimpleLogger()

    logger.write('SuSiE fine-mapping')
    logger.write(f'  * {phenotype_df.shape[1]} samples')
    logger.write(f'  * {phenotype_df.shape[0]} phenotypes')
    logger.write(f'  * {covariates_df.shape[1]} covariates')
    logger.write(f'  * {variant_df.shape[0]} variants')
    if maf_threshold > 0:
        logger.write(f'  * applying in-sample MAF >= {maf_threshold} filter')

    residualizer = Residualizer(
        torch.tensor(covariates_df.values, dtype=torch.float32).to(device))

    genotype_ix = np.array(
        [genotype_df.columns.tolist().index(i) for i in phenotype_df.columns])
    genotype_ix_t = torch.from_numpy(genotype_ix).to(device)

    igc = genotypeio.InputGeneratorCis(genotype_df,
                                       variant_df,
                                       phenotype_df,
                                       phenotype_pos_df,
                                       window=window)
    if igc.n_phenotypes == 0:
        raise ValueError('No valid phenotypes found.')

    start_time = time.time()
    logger.write('  * fine-mapping')
    copy_keys = ['pip', 'sets', 'converged', 'niter']
    if not summary_only:
        susie_res = {}
    else:
        susie_res = []
    for k, (phenotype, genotypes, genotype_range,
            phenotype_id) in enumerate(igc.generate_data(verbose=verbose), 1):
        # copy genotypes to GPU
        genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
        genotypes_t = genotypes_t[:, genotype_ix_t]
        impute_mean(genotypes_t)

        variant_ids = variant_df.index[genotype_range[0]:genotype_range[-1] +
                                       1]

        # filter monomorphic variants
        mask_t = ~(genotypes_t == genotypes_t[:, [0]]).all(1)
        if warn_monomorphic and not mask_t.all():
            logger.write(
                f'    * WARNING: excluding {(~mask_t).sum()} monomorphic variants'
            )
        if maf_threshold > 0:
            maf_t = calculate_maf(genotypes_t)
            mask_t &= maf_t >= maf_threshold
        if not mask_t.all():
            genotypes_t = genotypes_t[mask_t]
            mask = mask_t.cpu().numpy().astype(bool)
            variant_ids = variant_ids[mask]
            genotype_range = genotype_range[mask]

        if genotypes_t.shape[0] == 0:
            logger.write(
                f'WARNING: skipping {phenotype_id} (no valid variants)')
            continue

        phenotype_t = torch.tensor(phenotype, dtype=torch.float).to(device)
        genotypes_res_t = residualizer.transform(
            genotypes_t)  # variants x samples
        phenotype_res_t = residualizer.transform(phenotype_t.reshape(
            1, -1))  # phenotypes x samples

        res = susie(genotypes_res_t.T,
                    phenotype_res_t.T,
                    L=L,
                    scaled_prior_variance=scaled_prior_variance,
                    coverage=coverage,
                    min_abs_corr=min_abs_corr,
                    estimate_residual_variance=estimate_residual_variance,
                    estimate_prior_variance=estimate_prior_variance,
                    tol=tol,
                    max_iter=max_iter)

        af_t = genotypes_t.sum(1) / (2 * genotypes_t.shape[1])
        res['pip'] = pd.DataFrame({
            'pip': res['pip'],
            'af': af_t.cpu().numpy()
        },
                                  index=variant_ids)
        if not summary_only:
            susie_res[phenotype_id] = {k: res[k] for k in copy_keys}
        else:
            if res['sets']['cs'] is not None:
                # assert res['converged'] == True
                if res['converged']:
                    for c in sorted(res['sets']['cs'],
                                    key=lambda x: int(x.replace('L', ''))):
                        cs = res['sets']['cs'][c]  # indexes
                        p = res['pip'].iloc[cs].copy().reset_index()
                        p['cs_id'] = c.replace('L', '')
                        p.insert(0, 'phenotype_id', phenotype_id)
                        susie_res.append(p)
                else:
                    logger.write(f'    * WARNING: SuSiE did not converge for {phenotype_id}')

    logger.write(f'  Time elapsed: {(time.time()-start_time)/60:.2f} min')
    logger.write('done.')
    if summary_only and susie_res:
        susie_res = pd.concat(susie_res, axis=0).rename(columns={
            'snp': 'variant_id'
        }).reset_index(drop=True)
    return susie_res
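
With summary_only=True and at least one credible set found, the return value is a DataFrame with one row per variant in a credible set, including phenotype_id, pip, af, and cs_id columns (as constructed above). A small illustrative sketch, with a hypothetical helper name, for summarising credible-set sizes per phenotype:

import pandas as pd

def summarize_credible_sets(susie_df: pd.DataFrame) -> pd.DataFrame:
    # One row per (phenotype, credible set): set size and the largest PIP in the set.
    return (susie_df
            .groupby(['phenotype_id', 'cs_id'])
            .agg(n_variants=('pip', 'size'), top_pip=('pip', 'max'))
            .reset_index())
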
Example #9
def map(genotype_df,
        variant_df,
        phenotype_df,
        phenotype_pos_df,
        covariates_df,
        L=10,
        scaled_prior_variance=0.2,
        estimate_residual_variance=True,
        estimate_prior_variance=True,
        tol=1e-3,
        coverage=0.95,
        min_abs_corr=0.5,
        max_iter=100,
        window=1000000,
        logger=None,
        verbose=True):
    """
    SuSiE fine-mapping: computes SuSiE model for all phenotypes
    """
    assert np.all(phenotype_df.columns == covariates_df.index)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if logger is None:
        logger = SimpleLogger()

    logger.write('SuSiE fine-mapping')
    logger.write(f'  * {phenotype_df.shape[1]} samples')
    logger.write(f'  * {phenotype_df.shape[0]} phenotypes')
    logger.write(f'  * {covariates_df.shape[1]} covariates')
    logger.write(f'  * {variant_df.shape[0]} variants')

    residualizer = Residualizer(
        torch.tensor(covariates_df.values, dtype=torch.float32).to(device))

    genotype_ix = np.array(
        [genotype_df.columns.tolist().index(i) for i in phenotype_df.columns])
    genotype_ix_t = torch.from_numpy(genotype_ix).to(device)

    igc = genotypeio.InputGeneratorCis(genotype_df,
                                       variant_df,
                                       phenotype_df,
                                       phenotype_pos_df,
                                       window=window)
    if igc.n_phenotypes == 0:
        raise ValueError('No valid phenotypes found.')

    start_time = time.time()
    logger.write('  * fine-mapping')
    copy_keys = ['pip', 'sets', 'converged', 'niter']
    susie_res = {}
    for k, (phenotype, genotypes, genotype_range,
            phenotype_id) in enumerate(igc.generate_data(verbose=verbose), 1):
        # copy genotypes to GPU
        genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
        genotypes_t = genotypes_t[:, genotype_ix_t]

        # filter monomorphic variants
        # genotypes_t = genotypes_t[~(genotypes_t == genotypes_t[:, [0]]).all(1)]
        # if genotypes_t.shape[0] == 0:
        #     logger.write(f'WARNING: skipping {phenotype_id} (no valid variants)')
        #     continue

        impute_mean(genotypes_t)

        phenotype_t = torch.tensor(phenotype, dtype=torch.float).to(device)
        genotypes_res_t = residualizer.transform(
            genotypes_t)  # variants x samples
        phenotype_res_t = residualizer.transform(phenotype_t.reshape(
            1, -1))  # phenotypes x samples

        res = susie(genotypes_res_t.T,
                    phenotype_res_t.T,
                    L=L,
                    scaled_prior_variance=scaled_prior_variance,
                    coverage=coverage,
                    min_abs_corr=min_abs_corr,
                    estimate_residual_variance=estimate_residual_variance,
                    estimate_prior_variance=estimate_prior_variance,
                    tol=tol,
                    max_iter=max_iter)

        af_t = genotypes_t.sum(1) / (2 * genotypes_t.shape[1])
        variant_ids = variant_df.index[genotype_range[0]:genotype_range[-1] +
                                       1]
        res['pip'] = pd.DataFrame({
            'pip': res['pip'],
            'af': af_t.cpu().numpy()
        },
                                  index=variant_ids)
        susie_res[phenotype_id] = {k: res[k] for k in copy_keys}

    logger.write(f'  Time elapsed: {(time.time()-start_time)/60:.2f} min')
    logger.write('done.')
    return susie_res
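
Here the result is a dict keyed by phenotype_id, where each entry holds the keys in copy_keys ('pip', 'sets', 'converged', 'niter') and 'pip' is a variant-indexed DataFrame with 'pip' and 'af' columns. A hypothetical helper for flattening the per-phenotype PIP tables into one long-format DataFrame:

import pandas as pd

def collate_pips(susie_res: dict) -> pd.DataFrame:
    # Stack the per-phenotype 'pip' DataFrames, carrying the phenotype ID
    # and convergence flag alongside each variant's PIP and allele frequency.
    frames = []
    for phenotype_id, res in susie_res.items():
        df = res['pip'].copy()
        df['phenotype_id'] = phenotype_id
        df['converged'] = res['converged']
        frames.append(df)
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0)
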
Example #10
def map_cis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df=None,
            group_s=None, maf_threshold=0, beta_approx=True, nperm=10000,
            window=1000000, random_tiebreak=False, logger=None, seed=None,
            verbose=True, warn_monomorphic=True):
    """Run cis-QTL mapping"""

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if logger is None:
        logger = SimpleLogger()

    logger.write('cis-QTL mapping: empirical p-values for phenotypes')
    logger.write(f'  * {phenotype_df.shape[1]} samples')
    logger.write(f'  * {phenotype_df.shape[0]} phenotypes')
    if group_s is not None:
        logger.write(f'  * {len(group_s.unique())} phenotype groups')
        group_dict = group_s.to_dict()
    if covariates_df is not None:
        assert np.all(phenotype_df.columns==covariates_df.index)
        logger.write(f'  * {covariates_df.shape[1]} covariates')
        residualizer = Residualizer(torch.tensor(covariates_df.values, dtype=torch.float32).to(device))
        dof = phenotype_df.shape[1] - 2 - covariates_df.shape[1]
    else:
        residualizer = None
        dof = phenotype_df.shape[1] - 2
    logger.write(f'  * {genotype_df.shape[0]} variants')
    if maf_threshold > 0:
        logger.write(f'  * applying in-sample MAF >= {maf_threshold} filter')
    if random_tiebreak:
        logger.write(f'  * randomly selecting top variant in case of ties')

    genotype_ix = np.array([genotype_df.columns.tolist().index(i) for i in phenotype_df.columns])
    genotype_ix_t = torch.from_numpy(genotype_ix).to(device)

    # permutation indices
    n_samples = phenotype_df.shape[1]
    ix = np.arange(n_samples)
    if seed is not None:
        logger.write(f'  * using seed {seed}')
        np.random.seed(seed)
    permutation_ix_t = torch.LongTensor(np.array([np.random.permutation(ix) for i in range(nperm)])).to(device)

    res_df = []
    igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, group_s=group_s, window=window)
    if igc.n_phenotypes == 0:
        raise ValueError('No valid phenotypes found.')
    start_time = time.time()
    logger.write('  * computing permutations')
    if group_s is None:
        for k, (phenotype, genotypes, genotype_range, phenotype_id) in enumerate(igc.generate_data(verbose=verbose), 1):
            # copy genotypes to GPU
            genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
            genotypes_t = genotypes_t[:,genotype_ix_t]
            impute_mean(genotypes_t)

            if maf_threshold > 0:
                maf_t = calculate_maf(genotypes_t)
                mask_t = maf_t >= maf_threshold
                genotypes_t = genotypes_t[mask_t]
                mask = mask_t.cpu().numpy().astype(bool)
                genotype_range = genotype_range[mask]

            # filter monomorphic variants
            mono_t = (genotypes_t == genotypes_t[:, [0]]).all(1)
            if mono_t.any():
                genotypes_t = genotypes_t[~mono_t]
                genotype_range = genotype_range[~mono_t.cpu()]
                if warn_monomorphic:
                    logger.write(f'    * WARNING: excluding {mono_t.sum()} monomorphic variants')

            if genotypes_t.shape[0] == 0:
                logger.write(f'WARNING: skipping {phenotype_id} (no valid variants)')
                continue

            phenotype_t = torch.tensor(phenotype, dtype=torch.float).to(device)

            res = calculate_cis_permutations(genotypes_t, phenotype_t, permutation_ix_t,
                                             residualizer=residualizer, random_tiebreak=random_tiebreak)
            r_nominal, std_ratio, var_ix, r2_perm, g = [i.cpu().numpy() for i in res]
            var_ix = genotype_range[var_ix]
            variant_id = variant_df.index[var_ix]
            tss_distance = variant_df['pos'].values[var_ix] - igc.phenotype_tss[phenotype_id]
            res_s = prepare_cis_output(r_nominal, r2_perm, std_ratio, g, genotypes_t.shape[0], dof, variant_id, tss_distance, phenotype_id, nperm=nperm)
            if beta_approx:
                res_s[['pval_beta', 'beta_shape1', 'beta_shape2', 'true_df', 'pval_true_df']] = calculate_beta_approx_pval(r2_perm, r_nominal*r_nominal, dof)
            res_df.append(res_s)
    else:  # grouped mode
        for k, (phenotypes, genotypes, genotype_range, phenotype_ids, group_id) in enumerate(igc.generate_data(verbose=verbose), 1):
            # copy genotypes to GPU
            genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
            genotypes_t = genotypes_t[:,genotype_ix_t]
            impute_mean(genotypes_t)

            if maf_threshold > 0:
                maf_t = calculate_maf(genotypes_t)
                mask_t = maf_t >= maf_threshold
                genotypes_t = genotypes_t[mask_t]
                mask = mask_t.cpu().numpy().astype(bool)
                genotype_range = genotype_range[mask]

            # filter monomorphic variants
            mono_t = (genotypes_t == genotypes_t[:, [0]]).all(1)
            if mono_t.any():
                genotypes_t = genotypes_t[~mono_t]
                genotype_range = genotype_range[~mono_t.cpu()]
                if warn_monomorphic:
                    logger.write(f'    * WARNING: excluding {mono_t.sum()} monomorphic variants')

            if genotypes_t.shape[0] == 0:
                logger.write(f'WARNING: skipping group {group_id} (no valid variants)')
                continue

            # iterate over phenotypes
            buf = []
            for phenotype, phenotype_id in zip(phenotypes, phenotype_ids):
                phenotype_t = torch.tensor(phenotype, dtype=torch.float).to(device)
                res = calculate_cis_permutations(genotypes_t, phenotype_t, permutation_ix_t,
                                                 residualizer=residualizer, random_tiebreak=random_tiebreak)
                res = [i.cpu().numpy() for i in res]  # r_nominal, std_ratio, var_ix, r2_perm, g
                res[2] = genotype_range[res[2]]
                buf.append(res + [genotypes_t.shape[0], phenotype_id])
            res_s = _process_group_permutations(buf, variant_df, igc.phenotype_tss[phenotype_ids[0]], dof,
                                                group_id, nperm=nperm, beta_approx=beta_approx)
            res_df.append(res_s)

    res_df = pd.concat(res_df, axis=1, sort=False).T
    res_df.index.name = 'phenotype_id'
    logger.write(f'  Time elapsed: {(time.time()-start_time)/60:.2f} min')
    logger.write('done.')
    return res_df.astype(output_dtype_dict).infer_objects()
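
The returned DataFrame is indexed by phenotype_id and, with beta_approx=True, includes a beta-approximated empirical p-value (pval_beta). tensorqtl ships its own q-value post-processing for this output; the sketch below is only an illustrative alternative using a Benjamini-Hochberg correction from statsmodels, with a hypothetical helper name:

import pandas as pd
from statsmodels.stats.multitest import multipletests

def call_egenes(res_df: pd.DataFrame, fdr: float = 0.05) -> pd.DataFrame:
    # BH-adjust the beta-approximated empirical p-values and flag significant phenotypes.
    out = res_df.copy()
    reject, qval, _, _ = multipletests(out['pval_beta'], alpha=fdr, method='fdr_bh')
    out['qval_bh'] = qval
    out['significant'] = reject
    return out
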