def create_window_ldsc(args):
    '''
    Create SNP annotation corresponding to x kb window around genes in gene sets.
    Estimate LD scores using these annotations.

    Attributes read from args: gene_sets, gset_start, gset_end, gene_coords,
    bfile, make_kb_window, keep, split_output, out.

    Output files (prefix args.out): *.l2.M_5_50, *.annot.gz and
    *.l2.ldscore.gz. With args.split_output one triple is written per gene
    set (the gene set name is inserted after the prefix); otherwise a single
    triple holding all gene sets is written.
    '''
    gsets = read_gene_sets(args.gene_sets, args.gset_start, args.gset_end)
    gene_coords = pd.read_csv(args.gene_coords, sep='\t')
    # Columns are used positionally below: 0 = chromosome, 1 = position the
    # window is centred on, 2 = gene name.
    coord_gene_names = gene_coords.iloc[:, 2]
    # Build the lookup once instead of re-creating a list per membership test.
    known_genes = set(coord_gene_names.tolist())

    # read SNPs
    geno_fname = args.bfile
    bim = pd.read_csv(geno_fname + '.bim', sep='\t', header=None)  # load bim
    # .copy() so that the annotation columns added below go into an
    # independent frame rather than a slice of the raw table.
    bim = bim.loc[:, 0:3].copy()
    bim.columns = ['CHR', 'SNP', 'CM', 'BP']

    half_window = 1000 * args.make_kb_window

    # create gene window annot file
    for gset, genes in gsets.items():
        toadd_annot = np.zeros(len(bim), dtype=int)
        for gene in genes:
            if gene not in known_genes:
                continue
            coord = gene_coords[coord_gene_names == gene]
            try:
                gene_chr = int(coord.iloc[0, 0])
            except (ValueError, TypeError):
                # chromosome label is not an integer (or is missing): skip gene
                continue
            center = int(coord.iloc[0, 1])
            start = center - half_window
            end = center + half_window
            in_window = ((bim['CHR'] == gene_chr).values
                         & (bim['BP'] > start).values
                         & (bim['BP'] < end).values)
            toadd_annot[in_window] = 1
        bim[gset] = toadd_annot

    # estimate LD scores
    array_indivs = ps.PlinkFAMFile(geno_fname + '.fam')
    array_snps = ps.PlinkBIMFile(geno_fname + '.bim')
    geno_array = ld.PlinkBEDFile(geno_fname + '.bed', array_indivs.n, array_snps)
    block_left = ld.getBlockLefts(geno_array.df[:, 2], 1e6)

    # compute M: per-annotation count of SNPs whose geno_array.freq exceeds 0.05
    M_5_50 = np.sum(bim.iloc[np.array(geno_array.freq) > 0.05, 4:].values, axis=0)

    # estimate ld scores
    res = geno_array.ldScoreVarBlocks(block_left, c=50, annot=bim.iloc[:, 4:].values)
    keep_snps = pd.read_csv(args.keep, header=None)
    keep_snps_indices = array_snps.df['SNP'].isin(keep_snps[0]).values
    res = res[keep_snps_indices, :]

    expscore = pd.concat([
        bim.iloc[keep_snps_indices, [0, 1, 3]].reset_index(drop=True),
        pd.DataFrame(res).reset_index(drop=True)
    ], axis=1)
    expscore.columns = geno_array.colnames[:3] + bim.columns[4:].tolist()

    # output files
    if args.split_output:
        for i in range(len(gsets)):
            gset_name = bim.columns[4 + i]
            np.savetxt('{}.{}.l2.M_5_50'.format(args.out, gset_name),
                       M_5_50[i].reshape((1, 1)), fmt='%d')
            # list(range(...)) keeps the concatenation valid on Python 3,
            # where range objects cannot be added to lists.
            bim.iloc[:, list(range(4)) + [4 + i]].to_csv(
                '{}.{}.annot.gz'.format(args.out, gset_name),
                sep='\t', index=False, compression='gzip')
            expscore.iloc[:, list(range(3)) + [3 + i]].to_csv(
                '{}.{}.l2.ldscore.gz'.format(args.out, gset_name),
                sep='\t', index=False, float_format='%.5f', compression='gzip')
    else:
        np.savetxt('{}.l2.M_5_50'.format(args.out),
                   M_5_50.reshape((1, len(M_5_50))), fmt='%d')
        bim.to_csv('{}.annot.gz'.format(args.out), sep='\t', index=False,
                   compression='gzip')
        expscore.to_csv('{}.l2.ldscore.gz'.format(args.out), sep='\t',
                        index=False, float_format='%.5f', compression='gzip')
def create_gset_expscore(args):
    '''
    Create gene set expression scores

    Attributes read from args: input_prefix, chr, gene_sets, gset_start,
    gset_end, keep, bfile, genes (optional), num_background_bins,
    num_gene_bins, split_output, out.

    Input files: <input_prefix>.<c>.hsq for every chromosome c in 1..N_CHR
    and <input_prefix>.<chr>.lasso for the requested chromosome.

    Output files (prefix args.out): .G, .ave_h2cis, .gannot and .expscore for
    chromosome args.chr. With args.split_output one set of files is written
    for the background bins ("Base") and one per gene set; otherwise a single
    set covering all annotations is written.
    '''
    input_prefix = '{}.{}'.format(args.input_prefix, args.chr)
    gsets = read_gene_sets(args.gene_sets, args.gset_start, args.gset_end)
    n_bg_bins = args.num_background_bins
    n_gene_bins = args.num_gene_bins

    print('Reading eQTL weights')
    # read in all chromosome, since partitioning should be by gene across genome
    # (makes a difference for small gene sets)
    # pd.concat over a list replaces repeated DataFrame.append, which copies
    # the accumulated frame every iteration and no longer exists in pandas 2.
    h2cis = pd.concat([
        pd.read_csv('{}.{}.hsq'.format(args.input_prefix, i), sep='\t')
        for i in range(1, N_CHR + 1)
    ])
    h2cis.dropna(inplace=True)
    lasso = pd.read_csv(input_prefix + '.lasso', sep='\t')

    keep_snps = pd.read_csv(args.keep, header=None)
    geno_fname = args.bfile
    bim = pd.read_csv(geno_fname + '.bim', sep='\t', header=None)  # load bim
    bim = bim.loc[(bim[0] == args.chr).values & bim[1].isin(keep_snps[0]).values, 0:3]
    bim.columns = ['CHR', 'SNP', 'CM', 'BP']

    # keep genes with positive h2cis and converged LASSO
    snp_indices = dict(zip(bim['SNP'].tolist(), range(len(bim))))  # SNP indices for fast merging
    filtered_h2cis = h2cis[h2cis['h2cis'] > 0]  # filter out genes w/h2cis < 0
    filtered_h2cis = filtered_h2cis[~np.isnan(filtered_h2cis['h2cis'])]
    if args.genes:
        keep_genes = read_file_line(args.genes)
        filtered_h2cis = filtered_h2cis[filtered_h2cis['Gene'].isin(keep_genes)]

    # retain genes across all chromosome for binning
    filtered_gene_indices = dict(zip(filtered_h2cis['Gene'].tolist(),
                                     range(len(filtered_h2cis))))

    # get gset names
    gset_names = ['Cis_herit_bin_{}'.format(x) for x in range(1, n_bg_bins + 1)]
    for k in gsets:
        gset_names.extend(['{}_Cis_herit_bin_{}'.format(k, x)
                           for x in range(1, n_gene_bins + 1)])

    # create dict indicating gene membership in each gene set
    ave_h2cis = []  # compute average cis-heritability of genes in bin
    gene_gset_dict = defaultdict(list)

    # background gene set
    gene_bins = pd.qcut(filtered_h2cis['h2cis'], n_bg_bins,
                        labels=range(n_bg_bins)).astype(int).tolist()
    # columns of temp_combined_herit: 0 = gene, 1 = chromosome, 2 = h2cis, 3 = bin
    temp_combined_herit = pd.DataFrame(
        np.c_[filtered_h2cis[['Gene', 'Chrom', 'h2cis']], gene_bins])
    temp_combined_herit[1] = temp_combined_herit[1].astype(int)
    temp_combined_herit[2] = temp_combined_herit[2].astype(float)
    temp_combined_herit[3] = temp_combined_herit[3].astype(int)
    temp_combined_herit = temp_combined_herit[temp_combined_herit[1] == args.chr]
    temp_h2cis = temp_combined_herit[[2, 3]].groupby([3]).mean()
    ave_h2cis.extend(temp_h2cis[2].values)
    for gene, gene_bin in zip(filtered_h2cis['Gene'], gene_bins):
        gene_gset_dict[gene].append('Cis_herit_bin_{}'.format(gene_bin + 1))

    # remaining gene sets
    for k, v in gsets.items():
        # dict membership instead of rebuilding the gene list for every element
        temp_genes = [x for x in v if x in filtered_gene_indices]
        temp_herit = filtered_h2cis.iloc[
            [filtered_gene_indices[x] for x in temp_genes], [0, 1, 2]]
        gene_bins = pd.qcut(temp_herit['h2cis'], n_gene_bins,
                            labels=range(n_gene_bins)).astype(int).tolist()

        # bin first, then subset chr
        temp_combined_herit = pd.DataFrame(np.c_[temp_herit, gene_bins])
        temp_combined_herit[1] = temp_combined_herit[1].astype(int)
        temp_combined_herit[2] = temp_combined_herit[2].astype(float)
        temp_combined_herit[3] = temp_combined_herit[3].astype(int)
        temp_combined_herit = temp_combined_herit[temp_combined_herit[1] == args.chr]  # subset chr

        # sometimes for small gene sets, bins will contain no genes for individual
        # chromosomes; pad each missing bin with a zero-heritability placeholder
        # row so that the per-bin mean has exactly num_gene_bins entries
        present_bins = set(temp_combined_herit[3].tolist())
        missing_bins = [b for b in range(n_gene_bins) if b not in present_bins]
        if missing_bins:
            placeholders = pd.DataFrame([['GENE', 0, 0, b] for b in missing_bins])
            padded_herit = pd.concat([temp_combined_herit, placeholders])
        else:
            padded_herit = temp_combined_herit
        temp_h2cis = padded_herit[[2, 3]].groupby([3]).mean()
        ave_h2cis.extend(temp_h2cis[2].values)
        for gene, gene_bin in zip(temp_combined_herit[0].values,
                                  temp_combined_herit[3].values):
            gene_gset_dict[gene].append('{}_Cis_herit_bin_{}'.format(k, gene_bin + 1))

    gset_indices = dict(zip(gset_names, range(len(gset_names))))

    # finally retain just genes on input chr
    filtered_h2cis = filtered_h2cis[filtered_h2cis['Gene'].isin(lasso['GENE'])]

    g_annot = []
    glist = []
    eqtl_annot = np.zeros((len(bim), len(gset_names)))

    # create eQTL annot (for expscore) and gene annot
    print('Combining eQTL weights')
    # positional access: column 0 = gene name, column 2 = h2cis
    for gene, temp_h2cis in zip(filtered_h2cis.iloc[:, 0], filtered_h2cis.iloc[:, 2]):
        temp_lasso = lasso[lasso['GENE'] == gene]
        if len(temp_lasso) == 0:
            continue
        if gene not in gene_gset_dict:
            g_annot.append(np.zeros(len(gset_names)))
        else:
            snp_idx = [snp_indices[x] for x in temp_lasso['SNP'].tolist()]
            temp_lasso_weights = temp_lasso['EFFECT'].values
            emp_herit = np.sum(np.square(temp_lasso_weights))
            if emp_herit <= 0:  # scale eQTL weights to h2cis
                bias = 0
            else:
                bias = np.sqrt(temp_h2cis / emp_herit)
            # new array rather than in-place *=, so the frame's own buffer is
            # never written to (it can be read-only or integer-typed)
            temp_lasso_weights = temp_lasso_weights * bias
            squared_weights = np.square(temp_lasso_weights)
            temp_gset_indices = [gset_indices[x] for x in gene_gset_dict[gene]]
            for gset in temp_gset_indices:
                eqtl_annot[snp_idx, gset] += squared_weights
            g_annot_toadd = np.zeros(len(gset_names))
            g_annot_toadd[temp_gset_indices] = 1
            g_annot.append(g_annot_toadd)
        # one name per g_annot row, whichever branch produced the row
        glist.append(gene)

    g_annot = np.array(g_annot).astype(int)
    g_annot_final = pd.DataFrame(np.c_[glist, g_annot])
    g_annot_final.columns = ['Gene'] + gset_names

    print('Computing expression scores')
    # load genotypes
    array_indivs = ps.PlinkFAMFile(geno_fname + '.fam')
    array_snps = ps.PlinkBIMFile(geno_fname + '.bim')
    keep_snps_indices = np.where(
        (array_snps.df['CHR'] == args.chr).values
        & array_snps.df['SNP'].isin(keep_snps[0]).values)[0]

    with Suppressor():
        geno_array = ld.PlinkBEDFile(geno_fname + '.bed', array_indivs.n, array_snps,
                                     keep_snps=keep_snps_indices)

    block_left = ld.getBlockLefts(geno_array.df[:, 2], 1e6)

    # estimate expression scores
    res = geno_array.ldScoreVarBlocks(block_left, c=50, annot=eqtl_annot)
    expscore = pd.concat([
        pd.DataFrame(geno_array.df[:, :3]),
        pd.DataFrame(res)], axis=1)
    expscore.columns = geno_array.colnames[:3] + gset_names

    # output files
    G = np.sum(g_annot, axis=0)
    if args.split_output:
        np.savetxt('{}.Base.{}.G'.format(args.out, args.chr),
                   G[:n_bg_bins].reshape((1, n_bg_bins)), fmt='%d')
        np.savetxt('{}.Base.{}.ave_h2cis'.format(args.out, args.chr),
                   np.array(ave_h2cis[:n_bg_bins]).reshape((1, n_bg_bins)),
                   fmt="%.5f")
        g_annot_final.iloc[:, :n_bg_bins + 1].to_csv(
            '{}.Base.{}.gannot'.format(args.out, args.chr), sep='\t', index=False)
        expscore.iloc[:, :n_bg_bins + 3].to_csv(
            '{}.Base.{}.expscore'.format(args.out, args.chr), sep='\t',
            index=False, float_format='%.5f')

        # i is the offset of each gene set's first bin within gset_names
        for i in range(n_bg_bins, n_bg_bins + len(gsets) * n_gene_bins, n_gene_bins):
            gset_name = re.sub('_Cis_herit_bin_1', '', g_annot_final.columns[1 + i])
            np.savetxt('{}.{}.{}.G'.format(args.out, gset_name, args.chr),
                       G[i:i + n_gene_bins].reshape((1, n_gene_bins)), fmt='%d')
            np.savetxt('{}.{}.{}.ave_h2cis'.format(args.out, gset_name, args.chr),
                       np.array(ave_h2cis[i:i + n_gene_bins]).reshape((1, n_gene_bins)),
                       fmt="%.5f")
            # list(range(...)) keeps the concatenations valid on Python 3
            g_annot_final.iloc[:, [0] + list(range(i + 1, i + 1 + n_gene_bins))].to_csv(
                '{}.{}.{}.gannot'.format(args.out, gset_name, args.chr),
                sep='\t', index=False)
            expscore.iloc[:, list(range(0, 3)) + list(range(i + 3, i + 3 + n_gene_bins))].to_csv(
                '{}.{}.{}.expscore'.format(args.out, gset_name, args.chr),
                sep='\t', index=False, float_format='%.5f')
    else:
        np.savetxt('{}.{}.G'.format(args.out, args.chr), G.reshape((1, len(G))), fmt='%d')
        np.savetxt('{}.{}.ave_h2cis'.format(args.out, args.chr),
                   np.array(ave_h2cis).reshape((1, len(ave_h2cis))), fmt="%.5f")
        g_annot_final.to_csv('{}.{}.gannot'.format(args.out, args.chr), sep='\t', index=False)
        expscore.to_csv('{}.{}.expscore'.format(args.out, args.chr), sep='\t',
                        index=False, float_format='%.5f')

    print('Done!')
def meta_analyze(args):
    '''
    Meta-analyze weights

    Attributes read from args: input_prefixes, chr, genes (optional), out,
    keep, bfile, num_bins.

    Input files: for every prefix listed in args.input_prefixes,
    <prefix>.<chr>.lasso and <prefix>.<chr>.hsq.

    Output files: <out>.hsq (per-gene average h2cis), <out>.<chr>.gannot.gz,
    <out>.<chr>.G, <out>.<chr>.ave_h2cis and <out>.<chr>.expscore.gz.
    '''
    input_prefixes = read_file_line(args.input_prefixes)
    input_prefixes_name = ['{}.{}'.format(x, args.chr) for x in input_prefixes]
    genes = get_gene_list(input_prefixes_name)
    if args.genes:
        keep_genes = read_file_line(args.genes)
        genes = [x for x in genes if x in keep_genes]

    # gene indices for fast merging
    gene_indices = dict(zip(genes, range(len(genes))))
    num = np.zeros(len(genes))
    count = np.zeros(len(genes))
    lasso_frames = []

    # meta-analyze REML h2cis estimates by taking simple average
    # inverse-variance weighing has issues, since REML SE is downwardly biased for small h2 estimates
    for prefix in input_prefixes:
        cond = os.path.basename(prefix)
        lasso = pd.read_csv('{}.{}.lasso'.format(prefix, args.chr), sep='\t')
        lasso['COND'] = cond
        lasso_frames.append(lasso)
        reml = pd.read_csv('{}.{}.hsq'.format(prefix, args.chr), sep='\t')
        reml.dropna(inplace=True)
        reml = reml[reml['Gene'].isin(genes)]
        gene_idx = [gene_indices[x] for x in reml['Gene'].tolist()]
        num[gene_idx] += reml['h2cis'].values
        count[gene_idx] += 1
    # single concat instead of repeated DataFrame.append (removed in pandas 2)
    all_lasso = pd.concat(lasso_frames) if lasso_frames else pd.DataFrame()

    count[count == 0] = np.nan  # genes never observed get NaN, not a division error
    meta_h2cis = num / count
    meta_h2cis_out = pd.DataFrame({
        'Gene': genes,
        'Chrom': args.chr,
        'h2cis': meta_h2cis
    })
    meta_h2cis_out.to_csv(args.out + '.hsq', sep='\t', index=False)

    keep_snps = pd.read_csv(args.keep, header=None)
    geno_fname = args.bfile
    bim = pd.read_csv(geno_fname + '.bim', sep='\t', header=None)  # load bim
    bim = bim.loc[(bim[0] == args.chr).values & bim[1].isin(keep_snps[0]).values, 0:3]
    bim.columns = ['CHR', 'SNP', 'CM', 'BP']

    # create gene membership dict
    snp_indices = dict(zip(bim['SNP'].tolist(), range(len(bim))))  # SNP indices for fast merging
    filtered_meta_h2cis = meta_h2cis_out[meta_h2cis_out['h2cis'] > 0]  # filter out genes w/h2cis < 0
    # .copy() so the 'Bin' column below is added to an independent frame
    filtered_meta_h2cis = filtered_meta_h2cis[
        ~np.isnan(filtered_meta_h2cis['h2cis'])].copy()

    # gene membership in bins
    gene_bins = pd.qcut(filtered_meta_h2cis['h2cis'], args.num_bins,
                        labels=range(args.num_bins)).astype(int)
    filtered_meta_h2cis['Bin'] = gene_bins
    # Columns are addressed by name: the positional layout of a DataFrame built
    # from a dict depends on the pandas/Python version, so position 1 is not
    # reliably the gene column.
    g_annot_meta = dict(zip(filtered_meta_h2cis['Gene'], gene_bins.tolist()))

    g_annot = []
    g_annot_names = []
    eqtl_annot = np.zeros((len(bim), args.num_bins))

    # create eQTL annot (for expscore) and gene annot
    print('Combining eQTL weights')
    for gene, temp_h2cis in zip(filtered_meta_h2cis['Gene'],
                                filtered_meta_h2cis['h2cis']):
        gene_bin = g_annot_meta[gene]
        temp_lasso = all_lasso[all_lasso['GENE'] == gene]
        for temp_cond in pd.unique(temp_lasso['COND']):  # for each condition
            temp_temp_lasso = temp_lasso[temp_lasso['COND'] == temp_cond]
            snp_idx = [snp_indices[x] for x in temp_temp_lasso['SNP'].tolist()]
            temp_lasso_weights = temp_temp_lasso['EFFECT'].values
            emp_herit = np.sum(np.square(temp_lasso_weights))
            if emp_herit <= 0:  # scale eQTL weights to meta-tissue h2cis
                bias = 0
            else:
                bias = np.sqrt(temp_h2cis / emp_herit)
            # new array rather than in-place *=, so the frame's own buffer is
            # never written to (it can be read-only or integer-typed)
            temp_lasso_weights = temp_lasso_weights * bias
            eqtl_annot[snp_idx, gene_bin] += np.square(temp_lasso_weights)
            # one gene-annotation row per (gene, condition) pair
            g_annot_toadd = np.zeros(args.num_bins)
            g_annot_toadd[gene_bin] = 1
            g_annot.append(g_annot_toadd)
            g_annot_names.append(gene + '_' + temp_cond)

    g_annot = np.array(g_annot).astype(int)
    g_annot_final = pd.DataFrame(np.c_[g_annot_names, g_annot])
    g_bin_names = [
        'Cis_herit_bin_{}'.format(x) for x in range(1, args.num_bins + 1)
    ]
    g_annot_final.columns = ['Gene'] + g_bin_names
    g_annot_final.to_csv('{}.{}.gannot.gz'.format(args.out, args.chr),
                         sep='\t', index=False, compression='gzip')

    G = np.sum(g_annot, axis=0)
    # restrict the mean to h2cis; averaging the string 'Gene' column is an
    # error on recent pandas
    ave_cis_herit = filtered_meta_h2cis.groupby('Bin')['h2cis'].mean().values

    np.savetxt('{}.{}.G'.format(args.out, args.chr), G.reshape((1, len(G))), fmt='%d')
    np.savetxt('{}.{}.ave_h2cis'.format(args.out, args.chr),
               ave_cis_herit.reshape((1, len(ave_cis_herit))), fmt="%.5f")

    print('Computing expression scores')
    # load genotypes
    array_indivs = ps.PlinkFAMFile(geno_fname + '.fam')
    array_snps = ps.PlinkBIMFile(geno_fname + '.bim')
    keep_snps_indices = np.where(
        (array_snps.df['CHR'] == args.chr).values
        & array_snps.df['SNP'].isin(keep_snps[0]).values)[0]

    with Suppressor():
        geno_array = ld.PlinkBEDFile(geno_fname + '.bed', array_indivs.n, array_snps,
                                     keep_snps=keep_snps_indices)

    block_left = ld.getBlockLefts(geno_array.df[:, 2], 1e6)

    # estimate expression scores
    res = geno_array.ldScoreVarBlocks(block_left, c=50, annot=eqtl_annot)
    expscore = pd.concat(
        [pd.DataFrame(geno_array.df[:, :3]), pd.DataFrame(res)], axis=1)
    expscore.columns = geno_array.colnames[:3] + g_bin_names

    # output files
    expscore.to_csv('{}.{}.expscore.gz'.format(args.out, args.chr), sep='\t',
                    index=False, compression='gzip', float_format='%.5f')

    print('Done!')
def create_gset_expscore_meta_batch(args):
    '''
    Create gene set expression scores meta-analyzed over several tissues/conditions.
    Analyze gene sets in batches.

    Attributes read from args: input_prefix_meta, gene_sets, gset_start,
    gset_end, chr, keep, bfile, genes (optional), num_background_bins,
    num_gene_bins, batch_size, split_output, out.

    Input files: for every prefix listed in args.input_prefix_meta,
    <prefix>.<chr>.lasso and <prefix>.<c>.hsq for every chromosome c in
    1..N_CHR.

    Output files (prefix args.out): with args.split_output, .G, .ave_h2cis,
    .gannot and .expscore are written for the background bins ("Base") and for
    each gene set. Otherwise every batch is written to a temporary
    .gannot.batchN / .expscore.batchN file; these are pasted column-wise into
    a single .gannot / .expscore and removed, and one combined .G and
    .ave_h2cis are written.
    '''
    input_prefixes = read_file_line(args.input_prefix_meta)
    genes = get_gene_list(input_prefixes)
    gsets = read_gene_sets(args.gene_sets, args.gset_start, args.gset_end)
    n_bg_bins = args.num_background_bins
    n_gene_bins = args.num_gene_bins

    # gene indices for fast merging
    gene_indices = dict(zip(genes['Gene'].tolist(), range(len(genes))))
    num = np.zeros(len(genes))
    n_obs = np.zeros(len(genes))
    lasso_frames = []

    print('Reading eQTL weights')
    # meta-analyze REML h2cis estimates by taking simple average
    # inverse-variance weighing has issues, since REML SE is downwardly biased for small h2 estimates
    for prefix in input_prefixes:
        cond = os.path.basename(prefix)
        lasso = pd.read_csv('{}.{}.lasso'.format(prefix, args.chr), sep='\t')
        lasso['COND'] = cond
        lasso_frames.append(lasso)
        # pd.concat over a list replaces repeated DataFrame.append, which no
        # longer exists in pandas 2
        all_reml = pd.concat([
            pd.read_csv('{}.{}.hsq'.format(prefix, i), sep='\t').dropna()
            for i in range(1, N_CHR + 1)
        ])
        gene_idx = [gene_indices[x] for x in all_reml['Gene'].tolist()]
        num[gene_idx] += all_reml['h2cis'].values
        n_obs[gene_idx] += 1
    all_lasso = pd.concat(lasso_frames) if lasso_frames else pd.DataFrame()

    n_obs[n_obs == 0] = np.nan  # genes never observed get NaN, not a division error
    meta_h2cis = num / n_obs
    meta_h2cis_out = pd.DataFrame({'Gene': genes['Gene'],
                                   'Chrom': genes['Chrom'],
                                   'h2cis': meta_h2cis},
                                  columns=['Gene', 'Chrom', 'h2cis'])

    keep_snps = pd.read_csv(args.keep, header=None)
    geno_fname = args.bfile
    bim = pd.read_csv(geno_fname + '.bim', sep='\t', header=None)  # load bim
    bim = bim.loc[(bim[0] == args.chr).values & bim[1].isin(keep_snps[0]).values, 0:3]
    bim.columns = ['CHR', 'SNP', 'CM', 'BP']

    # keep genes with positive h2cis and converged LASSO
    snp_indices = dict(zip(bim['SNP'].tolist(), range(len(bim))))  # SNP indices for fast merging
    filtered_h2cis = meta_h2cis_out[meta_h2cis_out['h2cis'] > 0]  # filter out genes w/h2cis < 0
    filtered_h2cis = filtered_h2cis[~np.isnan(filtered_h2cis['h2cis'])]
    if args.genes:
        keep_genes = read_file_line(args.genes)
        filtered_h2cis = filtered_h2cis[filtered_h2cis['Gene'].isin(keep_genes)]

    # retain genes across all chromosome for binning
    filtered_gene_indices = dict(zip(filtered_h2cis['Gene'].tolist(),
                                     range(len(filtered_h2cis))))

    # finally retain just genes on input chr
    chr_filtered_h2cis = filtered_h2cis[filtered_h2cis['Gene'].isin(all_lasso['GENE'])]

    # load genotypes
    array_indivs = ps.PlinkFAMFile(geno_fname + '.fam')
    array_snps = ps.PlinkBIMFile(geno_fname + '.bim')
    keep_snps_indices = np.where(
        (array_snps.df['CHR'] == args.chr).values
        & array_snps.df['SNP'].isin(keep_snps[0]).values)[0]

    with Suppressor():
        geno_array = ld.PlinkBEDFile(geno_fname + '.bed', array_indivs.n, array_snps,
                                     keep_snps=keep_snps_indices)

    block_left = ld.getBlockLefts(geno_array.df[:, 2], 1e6)

    print('Computing expression scores for background gene sets')
    # analyze background gset
    gset_names = ['Cis_herit_bin_{}'.format(x) for x in range(1, n_bg_bins + 1)]
    gset_indices = dict(zip(gset_names, range(len(gset_names))))

    # create dict indicating gene membership in each gene set
    all_ave_h2cis = []  # compute average cis-heritability of genes in bin
    all_G = []
    gene_gset_dict = defaultdict(list)

    # background gene set
    gene_bins = pd.qcut(filtered_h2cis['h2cis'], n_bg_bins,
                        labels=range(n_bg_bins)).astype(int).tolist()
    # columns of temp_combined_herit: 0 = gene, 1 = chromosome, 2 = h2cis, 3 = bin
    temp_combined_herit = pd.DataFrame(
        np.c_[filtered_h2cis[['Gene', 'Chrom', 'h2cis']], gene_bins])
    temp_combined_herit[1] = temp_combined_herit[1].astype(int)
    temp_combined_herit[2] = temp_combined_herit[2].astype(float)
    temp_combined_herit[3] = temp_combined_herit[3].astype(int)
    temp_combined_herit = temp_combined_herit[temp_combined_herit[1] == args.chr]
    temp_h2cis = temp_combined_herit[[2, 3]].groupby([3]).mean()
    all_ave_h2cis.extend(temp_h2cis[2].values)
    for gene, gene_bin in zip(filtered_h2cis['Gene'], gene_bins):
        gene_gset_dict[gene].append('Cis_herit_bin_{}'.format(gene_bin + 1))

    # compute expression scores
    G, g_annot_final, expscore = batch_expscore(
        gene_gset_dict, gset_names, chr_filtered_h2cis, all_lasso, bim,
        snp_indices, gset_indices, geno_array, block_left)
    all_G.extend(G)

    if args.split_output:
        np.savetxt('{}.Base.{}.G'.format(args.out, args.chr),
                   G.reshape((1, len(G))), fmt='%d')
        np.savetxt('{}.Base.{}.ave_h2cis'.format(args.out, args.chr),
                   np.array(all_ave_h2cis).reshape((1, len(all_ave_h2cis))),
                   fmt="%.5f")
        g_annot_final.to_csv('{}.Base.{}.gannot'.format(args.out, args.chr),
                             sep='\t', index=False)
        expscore.to_csv('{}.Base.{}.expscore'.format(args.out, args.chr),
                        sep='\t', index=False, float_format='%.5f')
    else:
        g_annot_final.to_csv('{}.{}.gannot.batch0'.format(args.out, args.chr),
                             sep='\t', index=False)
        expscore.to_csv('{}.{}.expscore.batch0'.format(args.out, args.chr),
                        sep='\t', index=False, float_format='%.5f')

    # remaining gene sets
    # materialise once: dict views cannot be sliced on Python 3
    gset_items = list(gsets.items())
    batch_num = 0
    for batch_start in range(0, len(gset_items), args.batch_size):
        batch_num += 1
        print('Computing expression scores for gene sets {} to {} (out of {} total)'.format(
            batch_start + 1, min(batch_start + args.batch_size, len(gsets)), len(gsets)))
        temp_gsets = OrderedDict(gset_items[batch_start:batch_start + args.batch_size])

        # Per-batch accumulators. ave_h2cis must start empty for every batch:
        # it is appended to all_ave_h2cis after the batch and sliced from
        # offset 0 for the split outputs, so carrying earlier batches over
        # would duplicate and misalign values.
        ave_h2cis = []
        rest_gset_names = []
        rest_gene_gset_dict = defaultdict(list)

        for k in temp_gsets:
            rest_gset_names.extend(['{}_Cis_herit_bin_{}'.format(k, x)
                                    for x in range(1, n_gene_bins + 1)])
        gset_indices = dict(zip(rest_gset_names, range(len(rest_gset_names))))

        for k, v in temp_gsets.items():
            # dict membership instead of rebuilding the gene list for every element
            temp_genes = [x for x in v if x in filtered_gene_indices]
            temp_herit = filtered_h2cis.iloc[
                [filtered_gene_indices[x] for x in temp_genes], [0, 1, 2]]
            gene_bins = pd.qcut(temp_herit['h2cis'], n_gene_bins,
                                labels=range(n_gene_bins)).astype(int).tolist()

            # bin first, then subset chr
            temp_combined_herit = pd.DataFrame(np.c_[temp_herit, gene_bins])
            temp_combined_herit[1] = temp_combined_herit[1].astype(int)
            temp_combined_herit[2] = temp_combined_herit[2].astype(float)
            temp_combined_herit[3] = temp_combined_herit[3].astype(int)
            temp_combined_herit = temp_combined_herit[temp_combined_herit[1] == args.chr]  # subset chr

            # sometimes for small gene sets, bins will contain no genes for
            # individual chromosomes; pad each missing bin with a
            # zero-heritability placeholder row so that the per-bin mean has
            # exactly num_gene_bins entries
            present_bins = set(temp_combined_herit[3].tolist())
            missing_bins = [b for b in range(n_gene_bins) if b not in present_bins]
            if missing_bins:
                placeholders = pd.DataFrame([['GENE', 0, 0, b] for b in missing_bins])
                padded_herit = pd.concat([temp_combined_herit, placeholders])
            else:
                padded_herit = temp_combined_herit
            temp_h2cis = padded_herit[[2, 3]].groupby([3]).mean()
            ave_h2cis.extend(temp_h2cis[2].values)
            for gene, gene_bin in zip(temp_combined_herit[0].values,
                                      temp_combined_herit[3].values):
                rest_gene_gset_dict[gene].append(
                    '{}_Cis_herit_bin_{}'.format(k, gene_bin + 1))

        G, g_annot_final, expscore = batch_expscore(
            rest_gene_gset_dict, rest_gset_names, chr_filtered_h2cis, all_lasso,
            bim, snp_indices, gset_indices, geno_array, block_left)
        all_G.extend(G)
        all_ave_h2cis.extend(ave_h2cis)

        if args.split_output:
            # j is the offset of each gene set's first bin within this batch
            for j in range(0, len(temp_gsets) * n_gene_bins, n_gene_bins):
                gset_name = re.sub('_Cis_herit_bin_1', '', g_annot_final.columns[1 + j])
                np.savetxt('{}.{}.{}.G'.format(args.out, gset_name, args.chr),
                           G[j:j + n_gene_bins].reshape((1, n_gene_bins)), fmt='%d')
                np.savetxt('{}.{}.{}.ave_h2cis'.format(args.out, gset_name, args.chr),
                           np.array(ave_h2cis[j:j + n_gene_bins]).reshape((1, n_gene_bins)),
                           fmt="%.5f")
                # list(range(...)) keeps the concatenations valid on Python 3
                g_annot_final.iloc[:, [0] + list(range(j + 1, j + 1 + n_gene_bins))].to_csv(
                    '{}.{}.{}.gannot'.format(args.out, gset_name, args.chr),
                    sep='\t', index=False)
                expscore.iloc[:, list(range(0, 3)) + list(range(j + 3, j + 3 + n_gene_bins))].to_csv(
                    '{}.{}.{}.expscore'.format(args.out, gset_name, args.chr),
                    sep='\t', index=False, float_format='%.5f')
        else:
            temp_g_annot_name = '{}.{}.gannot.batch{}'.format(args.out, args.chr, batch_num)
            temp_expscore_name = '{}.{}.expscore.batch{}'.format(args.out, args.chr, batch_num)
            # drop the identifier columns: batch0 already carries them and the
            # batch files are pasted side by side below
            g_annot_final = g_annot_final.iloc[:, 1:]
            expscore = expscore.iloc[:, 3:]
            g_annot_final.to_csv(temp_g_annot_name, sep='\t', index=False)
            expscore.to_csv(temp_expscore_name, sep='\t', index=False, float_format='%.5f')

    if not args.split_output:
        gannot_batches = ['{}.{}.gannot.batch{}'.format(args.out, args.chr, x)
                          for x in range(batch_num + 1)]
        expscore_batches = ['{}.{}.expscore.batch{}'.format(args.out, args.chr, x)
                            for x in range(batch_num + 1)]
        # Argument lists (no shell) so that paths containing spaces or shell
        # metacharacters are passed to paste verbatim.
        with open('{}.{}.gannot'.format(args.out, args.chr), 'w') as merged:
            subprocess.call(['paste'] + gannot_batches, stdout=merged)
        with open('{}.{}.expscore'.format(args.out, args.chr), 'w') as merged:
            subprocess.call(['paste'] + expscore_batches, stdout=merged)
        for batch_file in gannot_batches + expscore_batches:
            os.remove(batch_file)

        np.savetxt('{}.{}.G'.format(args.out, args.chr),
                   np.array(all_G).reshape((1, len(all_G))), fmt='%d')
        np.savetxt('{}.{}.ave_h2cis'.format(args.out, args.chr),
                   np.array(all_ave_h2cis).reshape((1, len(all_ave_h2cis))),
                   fmt="%.5f")

    print('Done!')