def map_tissue_eqtls(
        tissue,
        genes,
        genotype_df,
        variant_df,
        eqtls,
        covariates_dir,
        expression_dir,
        pval_threshold,
        maf_threshold,
        eqtl_project
):
    """Run a trans-eQTL scan for one tissue and append the hits to ``eqtls``.

    Phenotypes and covariates are obtained from ``fetch_phenotypes``; the
    association scan itself is delegated to ``trans.map_trans``. The result
    is tagged with the tissue name, and rows missing either ``variant_id``
    or ``phenotype_id`` are removed before it is appended.

    Args:
        tissue: Tissue label; forwarded to ``fetch_phenotypes`` and written
            to the ``tissue`` column of the result.
        genes: Forwarded to ``fetch_phenotypes``.
        genotype_df: Genotype table passed to ``trans.map_trans``.
        variant_df: Accepted for interface compatibility; not used here.
        eqtls: Collector with an ``append`` method (presumably a list);
            mutated in place.
        covariates_dir: Forwarded to ``fetch_phenotypes``.
        expression_dir: Forwarded to ``fetch_phenotypes``.
        pval_threshold: Converted with ``float`` and passed to
            ``trans.map_trans``.
        maf_threshold: Converted with ``float`` and passed to
            ``trans.map_trans``.
        eqtl_project: Forwarded to ``fetch_phenotypes``.

    Returns:
        None. The filtered result is delivered through ``eqtls``.
    """
    phenotype_df, covariates_df = fetch_phenotypes(
        tissue, genes, covariates_dir, expression_dir, eqtl_project)

    scan_options = dict(
        return_sparse=True,
        pval_threshold=float(pval_threshold),
        maf_threshold=float(maf_threshold),
        batch_size=20000,
        verbose=False,
    )
    hits = trans.map_trans(
        genotype_df, phenotype_df, covariates_df, **scan_options)
    hits['tissue'] = tissue

    # Keep only rows where both identifiers are present.
    has_variant = hits['variant_id'].notnull()
    has_phenotype = hits['phenotype_id'].notnull()
    eqtls.append(hits[has_variant & has_phenotype])
# Input locations for this chromosome. `path`, `outpath` and `chr` are
# expected to be defined before this point.
phenotype_bed_file = (
    path
    + "phenotypes/INTERVAL_RNAseq_phase1-2_filteredSamplesGenes_TMMNormalised_FPKM_Counts_foranalysis_chr"
    + chr
    + ".bed.gz"
)
covariates_file = (
    path + "covariates/INTERVAL_RNAseq_phase1-2_fullcovariates_foranalysis.txt"
)
plink_prefix_path = (
    path
    + "genotypes/INTERVAL_RNAseq_Phase1-2_imputed_b38_biallelic_MAF0.005_chr"
    + chr
)

# Load the phenotype BED file (two tables are returned).
phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(
    phenotype_bed_file)

# Load the covariates, keep only the columns named in the phenotype table,
# and transpose the result.
covariates_df = pd.read_csv(covariates_file, sep='\t', index_col=0)[
    phenotype_df.columns
].transpose()

# Load the PLINK genotypes and build the genotype and variant tables,
# both indexed by the 'snp' column of the .bim data.
pr = genotypeio.PlinkReader(plink_prefix_path)
genotype_df = pd.DataFrame(
    data=pr.load_genotypes(),
    index=pr.bim['snp'],
    columns=pr.fam['iid'],
)
variant_df = pr.bim[['snp', 'chrom', 'pos']].set_index('snp')

# Trans-eQTL scan; the result is written without its index.
trans_df = trans.map_trans(
    genotype_df,
    phenotype_df,
    covariates_df,
    return_sparse=True,
    maf_threshold=0.005,
)
trans_df.to_csv(
    outpath + "tensorqtl_trans_MAF0.005_chr" + chr + ".csv",
    index=False,
)
# Genotypes: read the CSV, discard its second column, and index the rows by
# the 'ID' column.
chr9_geno_df = pd.read_csv('../../data/tensorqtldata/chr9.csv')
chr9_geno_df = (
    chr9_geno_df
    .drop(columns=chr9_geno_df.columns[[1]])
    .set_index('ID')
)
#chr9_geno_df = torch.from_numpy(chr9_geno_df.values)

# Phenotypes and covariates (covariates are transposed after loading).
phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(expression_bed)
covariates_df = pd.read_csv(covariates_file, sep='\t', index_col=0).transpose()

import timeit
import time

# Wall-clock timing of a single map_trans call.
start_time = timeit.default_timer()
trans_df = trans.map_trans(
    chr9_geno_df,
    phenotype_df,
    covariates_df,
    batch_size=10000,
    return_sparse=True,
    pval_threshold=1e-5,
    maf_threshold=0.05,
)
timetaken = timeit.default_timer() - start_time

msg = "{func} took {time} seconds to complete."
print(msg.format(time=timetaken, func=trans.map_trans.__name__))

# trans_df.to_csv('tensorqtl-scan-covar.csv')

# Recorded result: map_trans took 4.8016955852508545 seconds to complete.
# use GPU Nvidia V100
# subset data set, getting a smaller end result for the purpose of comparing
# TensorQTL vs LiteQTL.
# Genome-wide trans scan, one autosome (1-22) per iteration. Each
# chromosome's result is written to its own CSV file.
MAF_filter = 0.005
for i in range(1, 23):
    print(i)
    plink_prefix_path = (
        rootpath
        + f'genotypes/INTERVAL_RNAseq_Phase1-2_imputed_b38_biallelic_MAF0.005_chr{i}'
    )
    print(plink_prefix_path)

    # Load this chromosome's genotypes and variant table.
    gw_pr = genotypeio.PlinkReader(plink_prefix_path)
    gw_genotype_df = pd.DataFrame(
        data=gw_pr.load_genotypes(),
        index=gw_pr.bim['snp'],
        columns=gw_pr.fam['iid'],
    )
    gw_variant_df = gw_pr.bim[['snp', 'chrom', 'pos']].set_index('snp')

    # The batch size equals the number of rows in the variant table.
    gw_trans_df = trans.map_trans(
        gw_genotype_df,
        phenotype_df,
        covariates_df,
        return_sparse=True,
        return_r2=True,
        maf_threshold=MAF_filter,
        batch_size=len(gw_variant_df),
    )
    gw_trans_df.to_csv(
        outdir
        + f"/tensorqtl_trans_MAF{MAF_filter}"
        + f"_all_age_sex_rin_batch_readDepth_PC10_PEER20_COVID19_CHR{i}.csv"
    )

# chrX trans: load the chromosome X genotypes and variant table.
plink_prefix_path_x = "/rds/project/jmmh2/rds-jmmh2-projects/interval_rna_seq/covid19/genotypes/INTERVAL_chrX_merged_cleaned_RNAseq_phase1-2_b38_rsids_deduplicated_MAF0.005"
x_pr = genotypeio.PlinkReader(plink_prefix_path_x)
x_genotype_df = pd.DataFrame(
    data=x_pr.load_genotypes(),
    index=x_pr.bim['snp'],
    columns=x_pr.fam['iid'],
)
x_variant_df = x_pr.bim[['snp', 'chrom', 'pos']].set_index('snp')
# Cis mapping. The result is written with its index labelled "Phenotype".
cis_df = cis.map_cis(gmpr_genotype_df, gmpr_variant_df, phenotype_df,
                     phenotype_pos_df, covariates_df)
# The return value is discarded, so this relies on calculate_qvalues
# annotating cis_df in place.
tensorqtl.calculate_qvalues(cis_df, qvalue_lambda=0.85)
cis_df.to_csv(outpath + "tensorqtl_cis_cisPerGene_chr" + chr + ".csv",
              index=True, index_label="Phenotype")

# Cis nominal mapping
# covariates_df is passed by keyword on purpose: tensorQTL releases differ in
# whether the fifth positional parameter of map_nominal is the covariates
# table or the output prefix. Passing covariates_df positionally together
# with prefix=... raises TypeError on releases where prefix comes first;
# the keyword form is accepted by both layouts.
cisnom_df = cis.map_nominal(
    gmpr_genotype_df,
    gmpr_variant_df,
    phenotype_df,
    phenotype_pos_df,
    covariates_df=covariates_df,
    prefix=outpath + "tensorqtl_cis_cisNominal_chr" + chr,
)
# NOTE(review): chromosome 6 is hard-coded in both file names below, while
# the prefix above is built from `chr` -- these only line up when chr == "6".
# Confirm that is always the case for this script.
cisnom_df2 = pd.read_parquet(
    outpath + "tensorqtl_cis_cisNominal_chr6.cis_qtl_pairs.6.parquet")
cisnom_df2.to_csv(
    outpath + "tensorqtl_cis_cisNominal_chr6.cis_qtl_pairs.6.csv",
    index=False)

# Call trans-eQTLs (uses gw_phenotype_df rather than phenotype_df)
trans_min_df = trans.map_trans(gmpr_genotype_df, gw_phenotype_df,
                               covariates_df, return_sparse=True)
trans_min_df.to_csv(outpath + "tensorqtl_trans.csv", index=False)

# Conditional cis-analysis (may time out!)
#indep_df = cis.map_independent(gmpr_genotype_df, gmpr_variant_df, cis_df, phenotype_df, phenotype_pos_df, covariates_df, nperm=10000)
#indep_df.to_csv(outpath + "tensorqtl_cis_cisIndependent_chr" + chr + ".csv", index=True, index_label = "Phenotype")
# Tail of a call whose opening lies before this excerpt. The string looks
# like a CSV header row for the timing output.
f'time_stamp,device,data_transfer_time,compute_time,result_reorg_time,pval_time,elapsed_total\n' )

# device = torch.device("cpu")
# trans_df, cpucalctime = trans.map_trans(small_geno_df, phenotype_df, batch_size=20000,
# return_sparse=True, pval_threshold=1e-5, maf_threshold=0.05, device=device, timing_file=timing_file)

# Repeat the benchmark ten times. Each repetition runs map_trans once on the
# CPU and once on CUDA with identical arguments.
# NOTE(review): nesting below was reconstructed from whitespace-collapsed
# text -- confirm against the original file. With a single thread count in
# the list, the CPU and CUDA runs execute once per repetition either way.
for i in range(0, 10):
    ##################################### Full Matrix Case ########################################
    # return_sparse=False with pval_threshold=1 and maf_threshold=0.00;
    # presumably so that no result is filtered out -- TODO confirm.
    for numthreads in [20]:
        torch.set_num_threads(numthreads)
        device = torch.device("cpu")
        # This map_trans accepts device/timing_file and returns a pair, of
        # which the second element is kept as the CPU timing.
        trans_df, cpucalctime = trans.map_trans(small_geno_df, phenotype_df, batch_size=20000, return_sparse=False, pval_threshold=1, maf_threshold=0.00, device=device, timing_file=timing_file)
    device = torch.device("cuda")
    # Same call on the GPU; trans_df is overwritten by this second run.
    (trans_df, gpucalctime) = trans.map_trans(small_geno_df, phenotype_df, batch_size=20000, return_sparse=False, pval_threshold=1, maf_threshold=0.00, device=device, timing_file=timing_file)
    # Number of columns of the genotype table (presumably samples -- verify).
    n = small_geno_df.shape[1]
# Tail of a call whose opening lies before this excerpt; its output
# directory is covdir.
group_s=None, run_eigenmt=True, output_dir=covdir)

# Convert the per-chromosome parquet results for chromosomes 8, 9 and 21 to
# CSV files with the same base name, written without the index.
for i in [8, 9, 21]:
    df = pd.read_parquet(
        covdir + "tensorqtl_cis_MAF0.005_cisGxE_covid19.cis_qtl_pairs." + str(i) + ".parquet")
    df.to_csv(covdir + "tensorqtl_cis_MAF0.005_cisGxE_covid19.cis_qtl_pairs." + str(i) + ".csv", index=False)

# trans
# Trans scan using pheno_df_noACE2 and covariates_peer_df; the result is
# written under outdir. The file name is appended directly to outdir, so
# outdir presumably ends with a path separator -- verify.
trans_peer_df = trans.map_trans(genotype_df, pheno_df_noACE2, covariates_peer_df, return_sparse=True, maf_threshold=0.005)
trans_peer_df.to_csv(
    outdir + "tensorqtl_trans_MAF0.005_all_age_sex_rin_batch_readDepth_PC10_PEER20_COVID19.csv"
)

#################################################################
# chrX
# These imports sit mid-file in the original; they are left in place.
import pandas as pd
import tensorqtl
from tensorqtl import genotypeio, cis, trans

# Function to re-add RSids as these are no longer in the vcf file