# Standard-library / third-party imports used below. The tensorQTL-internal helpers
# (SimpleLogger, Residualizer, impute_mean, calculate_cis_nominal,
# calculate_interaction_nominal, filter_maf_interaction, genotypeio, eigenmt) are
# assumed to be provided by the surrounding package.
import os
import time
from collections import OrderedDict

import numpy as np
import pandas as pd
import torch
from scipy import stats


def map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df,
                covariates_df, prefix, interaction_s=None,
                maf_threshold_interaction=0.05, group_s=None, window=1000000,
                run_eigenmt=False, output_dir='.', write_top=True,
                write_stats=True, logger=None, verbose=True):
    """
    cis-QTL mapping: nominal associations for all variant-phenotype pairs

    Association results for each chromosome are written to parquet files
    in the format <output_dir>/<prefix>.cis_qtl_pairs.<chr>.parquet

    If interaction_s is provided, the top association per phenotype is written to
    <output_dir>/<prefix>.cis_qtl_top_assoc.txt.gz unless write_top is set to False,
    in which case it is returned as a DataFrame.
    """
    assert np.all(phenotype_df.columns == covariates_df.index)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if logger is None:
        logger = SimpleLogger()
    if group_s is not None:
        group_dict = group_s.to_dict()

    logger.write('cis-QTL mapping: nominal associations for all variant-phenotype pairs')
    logger.write(' * {} samples'.format(phenotype_df.shape[1]))
    logger.write(' * {} phenotypes'.format(phenotype_df.shape[0]))
    logger.write(' * {} covariates'.format(covariates_df.shape[1]))
    logger.write(' * {} variants'.format(variant_df.shape[0]))
    if interaction_s is not None:
        assert np.all(interaction_s.index == covariates_df.index)
        logger.write(' * including interaction term')
        if maf_threshold_interaction > 0:
            logger.write(' * using {:.2f} MAF threshold'.format(maf_threshold_interaction))

    covariates_t = torch.tensor(covariates_df.values, dtype=torch.float32).to(device)
    residualizer = Residualizer(covariates_t)
    del covariates_t

    genotype_ix = np.array([genotype_df.columns.tolist().index(i) for i in phenotype_df.columns])
    genotype_ix_t = torch.from_numpy(genotype_ix).to(device)

    if interaction_s is None:
        # residual dof: samples - 2 (intercept + genotype) - covariates
        dof = phenotype_df.shape[1] - 2 - covariates_df.shape[1]
    else:
        # residual dof: samples - 4 (intercept + genotype + interaction + genotype x interaction) - covariates
        dof = phenotype_df.shape[1] - 4 - covariates_df.shape[1]
        interaction_t = torch.tensor(interaction_s.values.reshape(1, -1), dtype=torch.float32).to(device)
        if maf_threshold_interaction > 0:
            interaction_mask_t = torch.BoolTensor(interaction_s >= interaction_s.median()).to(device)
        else:
            interaction_mask_t = None

    igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype_df,
                                       phenotype_pos_df, group_s=group_s, window=window)

    # iterate over chromosomes
    best_assoc = []
    start_time = time.time()
    k = 0
    logger.write(' * Computing associations')
    for chrom in igc.chrs:
        logger.write(' Mapping chromosome {}'.format(chrom))
        # allocate arrays
        n = 0
        if group_s is None:
            for i in igc.phenotype_pos_df[igc.phenotype_pos_df['chr'] == chrom].index:
                j = igc.cis_ranges[i]
                n += j[1] - j[0] + 1
        else:
            for i in igc.group_s[igc.phenotype_pos_df['chr'] == chrom].drop_duplicates().index:
                j = igc.cis_ranges[i]
                n += j[1] - j[0] + 1

        chr_res = OrderedDict()
        chr_res['phenotype_id'] = []
        chr_res['variant_id'] = []
        chr_res['tss_distance'] = np.empty(n, dtype=np.int32)
        chr_res['maf'] = np.empty(n, dtype=np.float32)
        chr_res['ma_samples'] = np.empty(n, dtype=np.int32)
        chr_res['ma_count'] = np.empty(n, dtype=np.int32)
        if interaction_s is None:
            chr_res['pval_nominal'] = np.empty(n, dtype=np.float64)
            chr_res['slope'] = np.empty(n, dtype=np.float32)
            chr_res['slope_se'] = np.empty(n, dtype=np.float32)
        else:
            chr_res['pval_g'] = np.empty(n, dtype=np.float64)
            chr_res['b_g'] = np.empty(n, dtype=np.float32)
            chr_res['b_g_se'] = np.empty(n, dtype=np.float32)
            chr_res['pval_i'] = np.empty(n, dtype=np.float64)
            chr_res['b_i'] = np.empty(n, dtype=np.float32)
            chr_res['b_i_se'] = np.empty(n, dtype=np.float32)
            chr_res['pval_gi'] = np.empty(n, dtype=np.float64)
            chr_res['b_gi'] = np.empty(n, dtype=np.float32)
            chr_res['b_gi_se'] = np.empty(n, dtype=np.float32)

        start = 0
        if group_s is None:
            for k, (phenotype, genotypes, genotype_range, phenotype_id) in enumerate(
                    igc.generate_data(chrom=chrom, verbose=verbose), k + 1):
                # copy genotypes to GPU
                phenotype_t = torch.tensor(phenotype, dtype=torch.float).to(device)
                genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
                genotypes_t = genotypes_t[:, genotype_ix_t]
                impute_mean(genotypes_t)

                variant_ids = variant_df.index[genotype_range[0]:genotype_range[-1] + 1]
                tss_distance = np.int32(variant_df['pos'].values[genotype_range[0]:genotype_range[-1] + 1]
                                        - igc.phenotype_tss[phenotype_id])

                if interaction_s is None:
                    res = calculate_cis_nominal(genotypes_t, phenotype_t, residualizer)
                    tstat, slope, slope_se, maf, ma_samples, ma_count = [i.cpu().numpy() for i in res]
                    n = len(variant_ids)
                else:
                    genotypes_t, mask_t = filter_maf_interaction(
                        genotypes_t, interaction_mask_t=interaction_mask_t,
                        maf_threshold_interaction=maf_threshold_interaction)
                    if genotypes_t.shape[0] > 0:
                        res = calculate_interaction_nominal(genotypes_t, phenotype_t.unsqueeze(0),
                                                            interaction_t, residualizer,
                                                            return_sparse=False)
                        tstat, b, b_se, maf, ma_samples, ma_count = [i.cpu().numpy() for i in res]
                        mask = mask_t.cpu().numpy()
                        variant_ids = variant_ids[mask]
                        tss_distance = tss_distance[mask]
                        n = len(variant_ids)

                        # top association
                        ix = np.nanargmax(np.abs(tstat[:, 2]))
                        top_s = pd.Series([phenotype_id, variant_ids[ix], tss_distance[ix], maf[ix],
                                           ma_samples[ix], ma_count[ix],
                                           tstat[ix, 0], b[ix, 0], b_se[ix, 0],
                                           tstat[ix, 1], b[ix, 1], b_se[ix, 1],
                                           tstat[ix, 2], b[ix, 2], b_se[ix, 2]],
                                          index=chr_res.keys())
                        if run_eigenmt:  # compute eigenMT correction
                            top_s['tests_emt'] = eigenmt.compute_tests(genotypes_t, var_thresh=0.99,
                                                                       variant_window=200)
                        best_assoc.append(top_s)
                    else:  # all genotypes in window were filtered out
                        n = 0

                if n > 0:
                    chr_res['phenotype_id'].extend([phenotype_id] * n)
                    chr_res['variant_id'].extend(variant_ids)
                    chr_res['tss_distance'][start:start + n] = tss_distance
                    chr_res['maf'][start:start + n] = maf
                    chr_res['ma_samples'][start:start + n] = ma_samples
                    chr_res['ma_count'][start:start + n] = ma_count
                    if interaction_s is None:
                        chr_res['pval_nominal'][start:start + n] = tstat
                        chr_res['slope'][start:start + n] = slope
                        chr_res['slope_se'][start:start + n] = slope_se
                    else:
                        chr_res['pval_g'][start:start + n] = tstat[:, 0]
                        chr_res['b_g'][start:start + n] = b[:, 0]
                        chr_res['b_g_se'][start:start + n] = b_se[:, 0]
                        chr_res['pval_i'][start:start + n] = tstat[:, 1]
                        chr_res['b_i'][start:start + n] = b[:, 1]
                        chr_res['b_i_se'][start:start + n] = b_se[:, 1]
                        chr_res['pval_gi'][start:start + n] = tstat[:, 2]
                        chr_res['b_gi'][start:start + n] = b[:, 2]
                        chr_res['b_gi_se'][start:start + n] = b_se[:, 2]
                start += n  # update pointer

        else:  # groups
            for k, (phenotypes, genotypes, genotype_range, phenotype_ids, group_id) in enumerate(
                    igc.generate_data(chrom=chrom, verbose=verbose), k + 1):
                # copy genotypes to GPU
                genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
                genotypes_t = genotypes_t[:, genotype_ix_t]
                impute_mean(genotypes_t)

                variant_ids = variant_df.index[genotype_range[0]:genotype_range[-1] + 1]
                # assuming that the TSS for all grouped phenotypes is the same
                tss_distance = np.int32(variant_df['pos'].values[genotype_range[0]:genotype_range[-1] + 1]
                                        - igc.phenotype_tss[phenotype_ids[0]])

                if interaction_s is not None:
                    genotypes_t, mask_t = filter_maf_interaction(
                        genotypes_t, interaction_mask_t=interaction_mask_t,
                        maf_threshold_interaction=maf_threshold_interaction)
                    mask = mask_t.cpu().numpy()
                    variant_ids = variant_ids[mask]
                    tss_distance = tss_distance[mask]
                n = len(variant_ids)

                if genotypes_t.shape[0] > 0:
                    # process first phenotype in group
                    phenotype_id = phenotype_ids[0]
                    phenotype_t = torch.tensor(phenotypes[0], dtype=torch.float).to(device)

                    if interaction_s is None:
                        res = calculate_cis_nominal(genotypes_t, phenotype_t, residualizer)
                        tstat, slope, slope_se, maf, ma_samples, ma_count = [i.cpu().numpy() for i in res]
                    else:
                        res = calculate_interaction_nominal(genotypes_t, phenotype_t.unsqueeze(0),
                                                            interaction_t, residualizer,
                                                            return_sparse=False)
                        tstat, b, b_se, maf, ma_samples, ma_count = [i.cpu().numpy() for i in res]
                    px = [phenotype_id] * n

                    # iterate over remaining phenotypes in group
                    for phenotype, phenotype_id in zip(phenotypes[1:], phenotype_ids[1:]):
                        phenotype_t = torch.tensor(phenotype, dtype=torch.float).to(device)
                        if interaction_s is None:
                            res = calculate_cis_nominal(genotypes_t, phenotype_t, residualizer)
                            tstat0, slope0, slope_se0, maf, ma_samples, ma_count = [i.cpu().numpy() for i in res]
                        else:
                            res = calculate_interaction_nominal(genotypes_t, phenotype_t.unsqueeze(0),
                                                                interaction_t, residualizer,
                                                                return_sparse=False)
                            tstat0, b0, b_se0, maf, ma_samples, ma_count = [i.cpu().numpy() for i in res]

                        # find associations that are stronger for current phenotype
                        if interaction_s is None:
                            ix = np.where(np.abs(tstat0) > np.abs(tstat))[0]
                        else:
                            ix = np.where(np.abs(tstat0[:, 2]) > np.abs(tstat[:, 2]))[0]

                        # update relevant positions
                        for j in ix:
                            px[j] = phenotype_id
                        if interaction_s is None:
                            tstat[ix] = tstat0[ix]
                            slope[ix] = slope0[ix]
                            slope_se[ix] = slope_se0[ix]
                        else:
                            tstat[ix] = tstat0[ix]
                            b[ix] = b0[ix]
                            b_se[ix] = b_se0[ix]

                    chr_res['phenotype_id'].extend(px)
                    chr_res['variant_id'].extend(variant_ids)
                    chr_res['tss_distance'][start:start + n] = tss_distance
                    chr_res['maf'][start:start + n] = maf
                    chr_res['ma_samples'][start:start + n] = ma_samples
                    chr_res['ma_count'][start:start + n] = ma_count
                    if interaction_s is None:
                        chr_res['pval_nominal'][start:start + n] = tstat
                        chr_res['slope'][start:start + n] = slope
                        chr_res['slope_se'][start:start + n] = slope_se
                    else:
                        chr_res['pval_g'][start:start + n] = tstat[:, 0]
                        chr_res['b_g'][start:start + n] = b[:, 0]
                        chr_res['b_g_se'][start:start + n] = b_se[:, 0]
                        chr_res['pval_i'][start:start + n] = tstat[:, 1]
                        chr_res['b_i'][start:start + n] = b[:, 1]
                        chr_res['b_i_se'][start:start + n] = b_se[:, 1]
                        chr_res['pval_gi'][start:start + n] = tstat[:, 2]
                        chr_res['b_gi'][start:start + n] = b[:, 2]
                        chr_res['b_gi_se'][start:start + n] = b_se[:, 2]

                    # top association for the group
                    if interaction_s is not None:
                        ix = np.nanargmax(np.abs(tstat[:, 2]))
                        top_s = pd.Series([chr_res['phenotype_id'][start:start + n][ix], variant_ids[ix],
                                           tss_distance[ix], maf[ix], ma_samples[ix], ma_count[ix],
                                           tstat[ix, 0], b[ix, 0], b_se[ix, 0],
                                           tstat[ix, 1], b[ix, 1], b_se[ix, 1],
                                           tstat[ix, 2], b[ix, 2], b_se[ix, 2]],
                                          index=chr_res.keys())
                        top_s['num_phenotypes'] = len(phenotype_ids)
                        if run_eigenmt:  # compute eigenMT correction
                            top_s['tests_emt'] = eigenmt.compute_tests(genotypes_t, var_thresh=0.99,
                                                                       variant_window=200)
                        best_assoc.append(top_s)

                start += n  # update pointer

        logger.write(' time elapsed: {:.2f} min'.format((time.time() - start_time) / 60))

        # convert to dataframe, compute p-values and write current chromosome
        if start < len(chr_res['maf']):
            for x in chr_res:
                chr_res[x] = chr_res[x][:start]

        if write_stats:
            chr_res_df = pd.DataFrame(chr_res)
            if interaction_s is None:
                m = chr_res_df['pval_nominal'].notnull()
                chr_res_df.loc[m, 'pval_nominal'] = 2 * stats.t.cdf(-chr_res_df.loc[m, 'pval_nominal'].abs(), dof)
            else:
                m = chr_res_df['pval_gi'].notnull()
                chr_res_df.loc[m, 'pval_g'] = 2 * stats.t.cdf(-chr_res_df.loc[m, 'pval_g'].abs(), dof)
                chr_res_df.loc[m, 'pval_i'] = 2 * stats.t.cdf(-chr_res_df.loc[m, 'pval_i'].abs(), dof)
                chr_res_df.loc[m, 'pval_gi'] = 2 * stats.t.cdf(-chr_res_df.loc[m, 'pval_gi'].abs(), dof)
            print(' * writing output')
            chr_res_df.to_parquet(os.path.join(output_dir, '{}.cis_qtl_pairs.{}.parquet'.format(prefix, chrom)))

    if interaction_s is not None:
        best_assoc = pd.concat(best_assoc, axis=1, sort=False).T.set_index('phenotype_id').infer_objects()
        m = best_assoc['pval_g'].notnull()
        best_assoc.loc[m, 'pval_g'] = 2 * stats.t.cdf(-best_assoc.loc[m, 'pval_g'].abs(), dof)
        best_assoc.loc[m, 'pval_i'] = 2 * stats.t.cdf(-best_assoc.loc[m, 'pval_i'].abs(), dof)
        best_assoc.loc[m, 'pval_gi'] = 2 * stats.t.cdf(-best_assoc.loc[m, 'pval_gi'].abs(), dof)

        if run_eigenmt:
            if group_s is None:
                best_assoc['pval_emt'] = np.minimum(best_assoc['tests_emt'] * best_assoc['pval_gi'], 1)
            else:
                best_assoc['pval_emt'] = np.minimum(best_assoc['num_phenotypes'] * best_assoc['tests_emt']
                                                    * best_assoc['pval_gi'], 1)
            best_assoc['pval_adj_bh'] = eigenmt.padjust_bh(best_assoc['pval_emt'])

        if write_top:
            best_assoc.to_csv(os.path.join(output_dir, '{}.cis_qtl_top_assoc.txt.gz'.format(prefix)),
                              sep='\t', float_format='%.6g')
        else:
            return best_assoc
    logger.write('done.')
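
# Example call for the function above (a minimal sketch, not part of the module; the
# DataFrames are assumed to be loaded already, with phenotype_df, covariates_df, and any
# interaction_s sharing the same sample order, and 'results'/'study1' are placeholder names):
#
#   map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df,
#               prefix='study1', window=1000000, output_dir='results')
#
# This writes results/study1.cis_qtl_pairs.<chr>.parquet per chromosome; when interaction_s
# is supplied, the per-phenotype top associations are also written to
# results/study1.cis_qtl_top_assoc.txt.gz (or returned as a DataFrame if write_top=False).
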
def map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df,
                covariates_df, prefix, interaction_s=None,
                maf_threshold_interaction=0.05, group_s=None, window=1000000,
                run_eigenmt=False, output_dir='.', logger=None, verbose=True):
    """
    cis-QTL mapping: nominal associations for all variant-phenotype pairs

    Association results for each chromosome are written to parquet files
    in the format <output_dir>/<prefix>.cis_qtl_pairs.<chr>.parquet
    """
    assert np.all(phenotype_df.columns == covariates_df.index)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if logger is None:
        logger = SimpleLogger()
    if group_s is not None:
        group_dict = group_s.to_dict()

    logger.write('cis-QTL mapping: nominal associations for all variant-phenotype pairs')
    logger.write(' * {} samples'.format(phenotype_df.shape[1]))
    logger.write(' * {} phenotypes'.format(phenotype_df.shape[0]))
    logger.write(' * {} covariates'.format(covariates_df.shape[1]))
    logger.write(' * {} variants'.format(variant_df.shape[0]))
    if interaction_s is not None:
        assert np.all(interaction_s.index == covariates_df.index)
        logger.write(' * including interaction term')
        if maf_threshold_interaction > 0:
            logger.write(' * using {:.2f} MAF threshold'.format(maf_threshold_interaction))

    covariates_t = torch.tensor(covariates_df.values, dtype=torch.float32).to(device)
    residualizer = Residualizer(covariates_t)
    del covariates_t

    genotype_ix = np.array([genotype_df.columns.tolist().index(i) for i in phenotype_df.columns])
    genotype_ix_t = torch.from_numpy(genotype_ix).to(device)

    if interaction_s is None:
        dof = phenotype_df.shape[1] - 2 - covariates_df.shape[1]
    else:
        dof = phenotype_df.shape[1] - 4 - covariates_df.shape[1]
        interaction_t = torch.tensor(interaction_s.values.reshape(1, -1), dtype=torch.float32).to(device)
        if maf_threshold_interaction > 0:
            interaction_mask_t = torch.BoolTensor(interaction_s >= interaction_s.median()).to(device)
        else:
            interaction_mask_t = None

    igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, window=window)

    # iterate over chromosomes
    best_assoc = []
    start_time = time.time()
    prev_phenotype_id = None
    k = 0
    logger.write(' * Computing associations')
    for chrom in igc.chrs:
        logger.write(' Mapping chromosome {}'.format(chrom))
        chr_res_df = []
        for k, (phenotype, genotypes, genotype_range, phenotype_id) in enumerate(
                igc.generate_data(chrom=chrom, verbose=verbose), k + 1):
            # copy genotypes to GPU
            phenotype_t = torch.tensor(phenotype, dtype=torch.float).to(device)
            genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
            genotypes_t = genotypes_t[:, genotype_ix_t]
            impute_mean(genotypes_t)

            if interaction_s is None:
                res = calculate_cis_nominal(genotypes_t, phenotype_t, residualizer)
                tstat, slope, slope_se, maf, ma_samples, ma_count = [i.cpu().numpy() for i in res]
                variant_ids = variant_df.index[genotype_range[0]:genotype_range[-1] + 1]
                tss_distance = np.int32(variant_df['pos'].values[genotype_range[0]:genotype_range[-1] + 1]
                                        - igc.phenotype_tss[phenotype_id])
                res_df = pd.DataFrame(OrderedDict([
                    ('phenotype_id', [phenotype_id] * len(variant_ids)),
                    ('variant_id', variant_ids),
                    ('tss_distance', tss_distance),
                    ('maf', maf),
                    ('ma_samples', ma_samples),
                    ('ma_count', ma_count),
                    ('pval_nominal', tstat),  #### replace with pval (currently on CPU, below)
                    ('slope', slope),
                    ('slope_se', slope_se),
                ]))
            else:
                genotypes_t, mask_t = filter_maf_interaction(
                    genotypes_t, interaction_mask_t=interaction_mask_t,
                    maf_threshold_interaction=maf_threshold_interaction)
                if genotypes_t.shape[0] > 0:
                    res = calculate_interaction_nominal(genotypes_t, phenotype_t.unsqueeze(0),
                                                        interaction_t, residualizer,
                                                        return_sparse=False)
                    if run_eigenmt:  # compute eigenMT correction
                        m_eff = eigenmt.compute_tests(genotypes_t, var_thresh=0.99, variant_window=200)

                    tstat, b, b_se, maf, ma_samples, ma_count = [i.cpu().numpy() for i in res]
                    mask = mask_t.cpu().numpy()

                    r = igc.cis_ranges[phenotype_id]
                    variant_ids = variant_df.index[r[0]:r[-1] + 1]
                    tss_distance = np.int32(variant_df['pos'].values[r[0]:r[-1] + 1]
                                            - igc.phenotype_tss[phenotype_id])
                    variant_ids = variant_ids[mask]
                    tss_distance = tss_distance[mask]
                    nv = len(variant_ids)
                    res_df = pd.DataFrame(OrderedDict([
                        ('phenotype_id', [phenotype_id] * nv),
                        ('variant_id', variant_ids),
                        ('tss_distance', tss_distance),
                        ('maf', maf),
                        ('ma_samples', ma_samples),
                        ('ma_count', ma_count),
                        ('pval_g', tstat[:, 0]),
                        ('b_g', b[:, 0]),
                        ('b_g_se', b_se[:, 0]),
                        ('pval_i', tstat[:, 1]),
                        ('b_i', b[:, 1]),
                        ('b_i_se', b_se[:, 1]),
                        ('pval_gi', tstat[:, 2]),
                        ('b_gi', b[:, 2]),
                        ('b_gi_se', b_se[:, 2]),
                    ]))

                    top_s = res_df.loc[res_df['pval_gi'].abs().idxmax()].copy()
                    if run_eigenmt:
                        top_s['tests_emt'] = m_eff
                    best_assoc.append(top_s)  # top variant only (pval_gi is t-statistic here, hence max)
                else:  # all genotypes in window were filtered out
                    res_df = None

            if group_s is not None and group_dict[phenotype_id] == group_dict.get(prev_phenotype_id):
                # store the strongest association within each group
                if interaction_s is None:
                    ix = res_df['pval_nominal'] > chr_res_df[-1]['pval_nominal']  # compare t-statistics
                else:
                    ix = res_df['pval_gi'] > chr_res_df[-1]['pval_gi']
                chr_res_df[-1].loc[ix] = res_df.loc[ix]
            else:
                chr_res_df.append(res_df)
            prev_phenotype_id = phenotype_id

        logger.write(' time elapsed: {:.2f} min'.format((time.time() - start_time) / 60))

        # compute p-values and write current chromosome
        chr_res_df = pd.concat(chr_res_df, copy=False)
        if interaction_s is None:
            m = chr_res_df['pval_nominal'].notnull()
            chr_res_df.loc[m, 'pval_nominal'] = 2 * stats.t.cdf(-chr_res_df.loc[m, 'pval_nominal'].abs(), dof)
        else:
            m = chr_res_df['pval_gi'].notnull()
            chr_res_df.loc[m, 'pval_g'] = 2 * stats.t.cdf(-chr_res_df.loc[m, 'pval_g'].abs(), dof)
            chr_res_df.loc[m, 'pval_i'] = 2 * stats.t.cdf(-chr_res_df.loc[m, 'pval_i'].abs(), dof)
            chr_res_df.loc[m, 'pval_gi'] = 2 * stats.t.cdf(-chr_res_df.loc[m, 'pval_gi'].abs(), dof)
        print(' * writing output')
        chr_res_df.to_parquet(os.path.join(output_dir, '{}.cis_qtl_pairs.{}.parquet'.format(prefix, chrom)))

    if interaction_s is not None:
        best_assoc = pd.concat(best_assoc, axis=1, sort=False).T.set_index('phenotype_id').infer_objects()
        m = best_assoc['pval_g'].notnull()
        best_assoc.loc[m, 'pval_g'] = 2 * stats.t.cdf(-best_assoc.loc[m, 'pval_g'].abs(), dof)
        best_assoc.loc[m, 'pval_i'] = 2 * stats.t.cdf(-best_assoc.loc[m, 'pval_i'].abs(), dof)
        best_assoc.loc[m, 'pval_gi'] = 2 * stats.t.cdf(-best_assoc.loc[m, 'pval_gi'].abs(), dof)
        if run_eigenmt:
            best_assoc['pval_emt'] = np.minimum(best_assoc['tests_emt'] * best_assoc['pval_gi'], 1)
            best_assoc['pval_adj_bh'] = eigenmt.padjust_bh(best_assoc['pval_emt'])
        best_assoc.to_csv(os.path.join(output_dir, '{}.cis_qtl_top_assoc.txt.gz'.format(prefix)),
                          sep='\t', float_format='%.6g')
    logger.write('done.')
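
# Both variants above return t-statistics from the GPU kernels and convert them to two-sided
# p-values on the CPU with 2 * stats.t.cdf(-|t|, dof). A self-contained illustration of that
# conversion (the numbers are made up):
#
#   from scipy import stats
#   tstat, dof = 3.1, 120
#   pval = 2 * stats.t.cdf(-abs(tstat), dof)  # ~2.4e-3; equivalent to 2 * stats.t.sf(abs(tstat), dof)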