def read_genotype(plink_prefix):
    """Load all genotypes of a PLINK fileset into a DataFrame.

    Args:
        plink_prefix: Prefix handed to ``genotypeio.PlinkReader``.

    Returns:
        A ``pandas.DataFrame`` built from ``load_genotypes()`` whose columns
        are taken from the ``iid`` column of the reader's ``fam`` table.
    """
    reader = genotypeio.PlinkReader(plink_prefix)
    # The SNP-ID index (index=reader.bim['snp']) is intentionally not applied
    # here; only the sample IDs are used as column labels.
    return pd.DataFrame(reader.load_genotypes(), columns=reader.fam['iid'])
def fetch_genotypes(snps, geno, plink_prefix, C):
    """Extract selected SNPs with PLINK and load them via ``PlinkReader``.

    The SNP IDs listed in ``{geno}.bim`` are intersected with ``snps``, PLINK
    is run to write a reduced fileset at ``plink_prefix``, and that fileset is
    read back with ``genotypeio.PlinkReader``.

    Args:
        snps: Iterable of SNP IDs to keep.
        geno: Prefix of the source PLINK fileset; ``{geno}.bim`` is read
            directly and the prefix is passed to ``--bfile``.
        plink_prefix: Output prefix passed to ``--out`` and then to the reader.
        C: Configuration object; only its ``plink`` attribute (the PLINK
            command) is used.

    Returns:
        Tuple ``(genotype_df, variant_df)``: the result of
        ``pr.load_genotypes()`` and the reader's ``bim`` table indexed by
        ``snp`` with the ``chrom`` and ``pos`` columns.

    Raises:
        SystemExit: If the PLINK command exits with a non-zero status.
    """
    print(' * Loading genotypes')

    # Only the second .bim column (the variant ID) is needed.
    snp_ref = pd.read_csv(f'{geno}.bim', sep='\t', engine='c',
                          memory_map=True, compression=None,
                          usecols=[1], names=['snp'], header=None)
    requested = set(snps)
    present = snp_ref.loc[snp_ref['snp'].isin(requested), 'snp']
    snp_list = present.sort_values().drop_duplicates().tolist()

    # NOTE(review): the command is assembled by string interpolation and run
    # through the shell; geno, plink_prefix and the SNP IDs are not quoted.
    snp_arg = ', '.join(snp_list)
    cmd = ' '.join([
        f'{C.plink}',
        f'--bfile {geno}',
        f'--snps {snp_arg}',
        f'--out {plink_prefix}',
        '--make-bed',
        '--silent',
    ])

    # check=True would raise CalledProcessError before the return code could
    # be inspected, making the friendly exit message below unreachable.
    filter_snps = subprocess.run(cmd, shell=True)
    if filter_snps.returncode != 0:
        sys.exit('Could not fetch SNPs.')

    pr = genotypeio.PlinkReader(plink_prefix, verbose=False)
    genotype_df = pr.load_genotypes()
    variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]
    return genotype_df, variant_df
# Input locations for this chromosome, all rooted at `path`.
phenotype_bed_file = "".join([
    path,
    "phenotypes/INTERVAL_RNAseq_phase1-2_filteredSamplesGenes_TMMNormalised_FPKM_Counts_foranalysis_chr",
    chr,
    ".bed.gz",
])
covariates_file = "".join([
    path,
    "covariates/INTERVAL_RNAseq_phase1-2_fullcovariates_foranalysis.txt",
])
plink_prefix_path = "".join([
    path,
    "genotypes/INTERVAL_RNAseq_Phase1-2_imputed_b38_biallelic_MAF0.005_chr",
    chr,
])

# Phenotype matrix and phenotype positions from the BED file.
phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(phenotype_bed_file)

# Covariates: keep only the IDs that appear as phenotype columns, then
# transpose so those IDs become the row labels.
covariates_df = (
    pd.read_csv(covariates_file, sep='\t', index_col=0)
    .loc[:, phenotype_df.columns]
    .T
)

# Genotypes labelled by SNP ID (rows) and individual ID (columns), plus the
# matching chrom/pos table.
pr = genotypeio.PlinkReader(plink_prefix_path)
genotype_df = pd.DataFrame(
    pr.load_genotypes(),
    index=pr.bim['snp'],
    columns=pr.fam['iid'],
)
variant_df = pr.bim.set_index('snp').loc[:, ['chrom', 'pos']]

# Trans-eQTL mapping; results are written as one CSV per chromosome.
trans_df = trans.map_trans(
    genotype_df,
    phenotype_df,
    covariates_df,
    return_sparse=True,
    maf_threshold=0.005,
)
trans_df.to_csv(
    "".join([outpath, "tensorqtl_trans_MAF0.005_chr", chr, ".csv"]),
    index=False,
)
# Load phenotypes and covariates:
phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(pheno_file)
covariates_df = pd.read_csv(cov_file, sep='\t', index_col=0).T  # samples x covariates
assert np.all(phenotype_df.columns == covariates_df.index)

# Load interaction data.
# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the column axis after reading gives the same Series.
interaction_s = pd.read_csv(interaction_file, sep='\t', index_col=0,
                            header=None).squeeze('columns')

# Select individuals that are in the interaction dataset
phenotype_df = phenotype_df.iloc[:, phenotype_df.columns.isin(interaction_s.index)]
covariates_df = covariates_df[covariates_df.index.isin(interaction_s.index)]
assert np.all(phenotype_df.columns == covariates_df.index)
assert covariates_df.index.isin(interaction_s.index).all()
# Reorder the interaction values to follow the covariate rows.
interaction_s = interaction_s.loc[covariates_df.index].astype(np.float32)

# Load genotypes (for VCFs with hard GT calls only, specify type as np.int8 to save memory)
pr = genotypeio.PlinkReader(geno_path, select_samples=phenotype_df.columns, dtype=np.int8)

# The variant table does not depend on the chromosome, so build it once
# instead of on every loop iteration.
variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]

# Load genotypes for each chromosome separately
top_df = []
for chrom in pr.chrs:
    g, pos_s = pr.get_region(chrom)
    genotype_df = pd.DataFrame(g, index=pos_s.index, columns=pr.fam['iid'])[phenotype_df.columns]

    # Phenotypes located on the current chromosome.
    on_chrom = phenotype_pos_df['chr'] == chrom

    # Map cis_nominal with interaction term and eigenMT correction
    chr_df = cis.map_nominal(genotype_df, variant_df[variant_df['chrom'] == chrom],
                             phenotype_df[on_chrom], phenotype_pos_df[on_chrom],
                             covariates_df, prefix,
                             interaction_s=interaction_s, maf_threshold_interaction=0.1,
                             window=1000000, output_dir=output_dir,
                             write_top=False, run_eigenmt=True)
    top_df.append(chr_df)

# Combine the per-chromosome results and write them as one gzipped TSV.
top_df = pd.concat(top_df)
top_df.to_csv(os.path.join(output_dir, '{}.cis_qtl_top_assoc.txt.gz'.format(prefix)),
              sep='\t', float_format='%.6g')

# if __name__ == '__main__':
excluded_chr_list = None else: all_chrs_list.remove(chr_id) excluded_chr_list = all_chrs_list logger.write('[{}] Running TensorQTL: {}-QTL mapping'.format( datetime.now().strftime("%b %d %H:%M:%S"), args.mode.split('_')[0])) # logger = SimpleLogger() # load phenotypes and covariates phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(expression_bed) covariates_df = pd.read_csv(covariates_file, sep='\t', index_col=0).T pr = genotypeio.PlinkReader(plink_prefix_path, exclude_chrs=excluded_chr_list) genotype_df = pr.load_genotypes() variant_df = pr.bim.set_index('snp')[['chrom', 'pos']] if mode == 'cis': # cis-QTL: empirical p-values for phenotypes if excluded_chr_list: cis_df = cis.map_cis( genotype_df, variant_df, phenotype_df.loc[phenotype_pos_df['chr'] == chr_id], phenotype_pos_df.loc[phenotype_pos_df['chr'] == chr_id], covariates_df=covariates_df, seed=args.seed) else: cis_df = cis.map_cis(genotype_df,
# Expression phenotypes and their positions.
phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(expression_bed)
#phenotype_gene_df, phenotype_gene_pos_df = tensorqtl.read_phenotype_bed(expression_gene_bed)

# Covariates: transposed after loading so that samples are the rows, then
# cast to float64.
covariates_df = (
    pd.read_csv(covariates_file, sep='\t', index_col=0)
    .T
    .astype('float64')
)

# NOTE(review): despite its name, `old` flags samples with SDC_AGE_CALC < 46.
old = covariates_df['SDC_AGE_CALC'].lt(46)
covariates_df_old = covariates_df.loc[old]

# IRSnoAge interaction term for all samples and for the age-filtered subset.
interaction_s = pd.Series(covariates_df['IRSnoAge'],
                          index=covariates_df.index.values)
interaction_s_old = pd.Series(covariates_df_old['IRSnoAge'],
                              index=covariates_df_old.index.values)

# PLINK reader for genotypes, restricted to the age-filtered samples.
pr = genotypeio.PlinkReader(plink_prefix_path,
                            select_samples=covariates_df_old.index)
genotype_df = pr.load_genotypes()
variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]

# Chromosome labels are compared as strings below, so cast them to str.
phenotype_pos_df["chr"] = phenotype_pos_df["chr"].astype(str)

##########################
# Nominal eqtls - canonical
##########################
##cis.map_nominal(genotype_df, variant_df,
#                phenotype_df.loc[phenotype_pos_df['chr'].isin(['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22'])],
#                phenotype_pos_df.loc[phenotype_pos_df['chr'].isin(['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22'])],
#                prefix, covariates_df=covariates_df)
##########################
outdir = covdir

# Phenotypes and PEER covariates.
phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(phenotype_bed_file)
covariates_peer_df = pd.read_csv(covariates_peer, sep='\t', index_col=0).T  # samples x covariates

# Interaction term. `read_csv(squeeze=True)` was deprecated in pandas 1.4 and
# removed in 2.0; the keyword is dropped because the transpose followed by
# .squeeze() below already collapses a single-column or single-row table to
# the same Series.
interaction_s = pd.read_csv(
    "/rds/project/jmmh2/rds-jmmh2-projects/interval_rna_seq/covid19/INTERVAL_RNAseq_COVID19_neutPCT_GxE.txt",
    sep="\t", index_col=0).T
interaction_s = interaction_s.squeeze()

# Genotypes labelled by SNP ID (rows) and individual ID (columns).
plink_prefix_path = "/rds/user/jm2294/rds-jmmh2-projects/interval_rna_seq/analysis/03_tensorqtl/genotypes/INTERVAL_b38_autosomes_RNAseqPhase1_biallelic_all_MAF0.005"
pr = genotypeio.PlinkReader(plink_prefix_path)
# NOTE(review): other tensorQTL scripts call `load_genotypes()` here; confirm
# that `get_all_genotypes()` exists in the installed tensorQTL version.
genotype_df = pd.DataFrame(pr.get_all_genotypes(),
                           index=pr.bim['snp'],
                           columns=pr.fam['iid'])
variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]

# cis
# Cis gene-level mapping with two genes removed from the phenotype tables
# (presumably ACE2 and TMPRSS2 - TODO confirm the second ID).
excluded_genes = ["ENSG00000130234", "ENSG00000184012"]
pheno_df_noACE2 = phenotype_df.drop(excluded_genes)
phenopos_df_noACE2 = phenotype_pos_df.drop(excluded_genes)

cis_df = cis.map_cis(genotype_df, variant_df,
                     pheno_df_noACE2, phenopos_df_noACE2,
                     covariates_peer_df)