Пример #1
0
def create_window_ldsc(args):
    '''
    Create SNP annotation corresponding to x kb window around genes in gene sets. Estimate LD scores using these annotations.

    For each gene set, a SNP is annotated 1 when, for at least one gene of the set found in
    the gene coordinate file, the SNP is on that gene's chromosome and its BP is strictly
    within 1000 * args.make_kb_window of the gene's coordinate; otherwise 0.

    Reads from args: gene_sets, gset_start, gset_end, gene_coords, bfile, make_kb_window,
    keep, split_output, out.

    Writes '<out>.l2.M_5_50', '<out>.annot.gz' and '<out>.l2.ldscore.gz', or one such
    triple per gene set ('<out>.<gene set>....') when args.split_output is set.
    Returns None.
    '''
    gsets = read_gene_sets(args.gene_sets, args.gset_start, args.gset_end)
    gene_coords = pd.read_csv(args.gene_coords, sep='\t')

    # read SNPs
    geno_fname = args.bfile
    bim = pd.read_csv(geno_fname + '.bim', sep='\t', header=None)  # load bim
    bim = bim.loc[:, 0:3].copy()  # own copy: annotation columns are added below
    bim.columns = ['CHR', 'SNP', 'CM', 'BP']

    # The gene coordinate file is addressed by position:
    # column 0 = chromosome, column 1 = coordinate, column 2 = gene name.
    coord_genes = gene_coords.iloc[:, 2]
    known_genes = set(coord_genes.tolist())  # built once for O(1) membership tests
    half_window = 1000 * args.make_kb_window
    snp_chr = bim['CHR']
    snp_bp = bim['BP']

    # create gene window annot file
    for gset, genes in gsets.items():
        toadd_annot = np.zeros(len(bim), dtype=int)
        for gene in genes:
            if gene not in known_genes:
                continue
            coord = gene_coords[coord_genes == gene]  # first matching row is used
            try:
                gene_chr = int(coord.iloc[0, 0])
            except (ValueError, TypeError):
                # chromosome label is not an integer: skip this gene
                continue
            center = int(coord.iloc[0, 1])
            start = center - half_window
            end = center + half_window
            in_window = ((snp_chr == gene_chr).values
                         & (snp_bp > start).values
                         & (snp_bp < end).values)
            toadd_annot[in_window] = 1
        bim[gset] = toadd_annot

    # estimate LD scores
    array_indivs = ps.PlinkFAMFile(geno_fname + '.fam')
    array_snps = ps.PlinkBIMFile(geno_fname + '.bim')

    geno_array = ld.PlinkBEDFile(geno_fname + '.bed', array_indivs.n,
                                 array_snps)
    block_left = ld.getBlockLefts(geno_array.df[:, 2], 1e6)

    # compute M: per-annotation sum over SNPs whose geno_array.freq exceeds 0.05
    M_5_50 = np.sum(bim.iloc[np.array(geno_array.freq) > 0.05, 4:].values,
                    axis=0)

    # estimate ld scores
    res = geno_array.ldScoreVarBlocks(block_left,
                                      c=50,
                                      annot=bim.iloc[:, 4:].values)
    # only SNPs listed in the keep file are written to the LD score output
    keep_snps = pd.read_csv(args.keep, header=None)
    keep_snps_indices = array_snps.df['SNP'].isin(keep_snps[0]).values
    res = res[keep_snps_indices, :]

    expscore = pd.concat([
        bim.iloc[keep_snps_indices, [0, 1, 3]].reset_index(drop=True),
        pd.DataFrame(res).reset_index(drop=True)
    ],
                         axis=1)
    expscore.columns = geno_array.colnames[:3] + bim.columns[4:].tolist()

    # output files
    if args.split_output:
        for i in range(len(gsets)):
            gset_name = bim.columns[4 + i]
            np.savetxt('{}.{}.l2.M_5_50'.format(args.out, gset_name),
                       M_5_50[i].reshape((1, 1)),
                       fmt='%d')
            # list(range(...)) keeps the column selection valid on Python 3,
            # where range objects cannot be concatenated with lists
            bim.iloc[:, list(range(4)) + [4 + i]].to_csv(
                '{}.{}.annot.gz'.format(args.out, gset_name),
                sep='\t',
                index=False,
                compression='gzip')
            expscore.iloc[:, list(range(3)) + [3 + i]].to_csv(
                '{}.{}.l2.ldscore.gz'.format(args.out, gset_name),
                sep='\t',
                index=False,
                float_format='%.5f',
                compression='gzip')

    else:
        np.savetxt('{}.l2.M_5_50'.format(args.out),
                   M_5_50.reshape((1, len(M_5_50))),
                   fmt='%d')
        bim.to_csv('{}.annot.gz'.format(args.out),
                   sep='\t',
                   index=False,
                   compression='gzip')
        expscore.to_csv('{}.l2.ldscore.gz'.format(args.out),
                        sep='\t',
                        index=False,
                        float_format='%.5f',
                        compression='gzip')
Пример #2
0
def create_gset_expscore(args):
    '''
    Create gene set expression scores

    Genes with positive h2cis are split into args.num_background_bins quantile bins
    (background) and, within each gene set, into args.num_gene_bins quantile bins. Binning
    uses genes from all chromosomes; everything written out is restricted to args.chr.
    For each gene, the '.lasso' EFFECT weights are rescaled so their squares sum to the
    gene's h2cis, and the squared weights are added to the SNP annotation of every bin the
    gene belongs to. Expression scores come from geno_array.ldScoreVarBlocks.

    Reads from args: input_prefix, chr, gene_sets, gset_start, gset_end, keep, bfile, genes,
    num_background_bins, num_gene_bins, split_output, out.

    Writes '.G', '.ave_h2cis', '.gannot' and '.expscore' files: one set named
    '<out>.<chr>', or with args.split_output one set for 'Base' and one per gene set.
    Returns None.
    '''
    input_prefix = '{}.{}'.format(args.input_prefix, args.chr)
    gsets = read_gene_sets(args.gene_sets, args.gset_start, args.gset_end)

    print('Reading eQTL weights')
    # read in all chromosome, since partitioning should be by gene across genome (makes a difference for small gene sets)
    # (single concat instead of repeated DataFrame.append, which recent pandas no longer provides)
    h2cis = pd.concat([
        pd.read_csv('{}.{}.hsq'.format(args.input_prefix, i), sep='\t')
        for i in range(1, N_CHR + 1)
    ])
    h2cis = h2cis.dropna()

    lasso = pd.read_csv(input_prefix + '.lasso', sep='\t')

    keep_snps = pd.read_csv(args.keep, header=None)
    geno_fname = args.bfile
    bim = pd.read_csv(geno_fname + '.bim', sep='\t', header=None)  # load bim
    bim = bim.loc[(bim[0] == args.chr).values & bim[1].isin(keep_snps[0]).values, 0:3]
    bim.columns = ['CHR', 'SNP', 'CM', 'BP']

    # keep genes with positive h2cis and converged LASSO
    snp_indices = dict(zip(bim['SNP'].tolist(), range(len(bim))))  # SNP indices for fast merging
    filtered_h2cis = h2cis[h2cis['h2cis'] > 0]  # filter out genes w/h2cis < 0
    filtered_h2cis = filtered_h2cis[~np.isnan(filtered_h2cis['h2cis'])]
    if args.genes:
        keep_genes = read_file_line(args.genes)
        filtered_h2cis = filtered_h2cis[filtered_h2cis['Gene'].isin(keep_genes)]
    # retain genes across all chromosome for binning
    filtered_gene_indices = dict(zip(filtered_h2cis['Gene'].tolist(), range(len(filtered_h2cis))))

    background_bin_ids = list(range(args.num_background_bins))
    gene_bin_ids = list(range(args.num_gene_bins))

    # get gset names
    gset_names = ['Cis_herit_bin_{}'.format(x + 1) for x in background_bin_ids]
    for k in gsets:
        gset_names.extend(['{}_Cis_herit_bin_{}'.format(k, x + 1) for x in gene_bin_ids])

    # create dict indicating gene membership in each gene set
    ave_h2cis = []  # compute average cis-heritability of genes in bin
    gene_gset_dict = defaultdict(list)
    # background gene set
    gene_bins = pd.qcut(filtered_h2cis['h2cis'], args.num_background_bins,
                        labels=background_bin_ids).astype(int).tolist()
    # positional columns of the combined frame: 0 = Gene, 1 = Chrom, 2 = h2cis, 3 = bin
    temp_combined_herit = pd.DataFrame(np.c_[filtered_h2cis[['Gene', 'Chrom', 'h2cis']], gene_bins])
    temp_combined_herit[1] = temp_combined_herit[1].astype(int)
    temp_combined_herit[2] = temp_combined_herit[2].astype(float)
    temp_combined_herit[3] = temp_combined_herit[3].astype(int)
    temp_combined_herit = temp_combined_herit[temp_combined_herit[1] == args.chr]
    # Report one average per bin, using 0 for bins with no gene on this chromosome,
    # so ave_h2cis always lines up with gset_names (same convention as the gene-set
    # bins below).
    temp_h2cis = (temp_combined_herit.groupby(3)[2].mean()
                  .reindex(background_bin_ids, fill_value=0).values)
    ave_h2cis.extend(temp_h2cis)
    for gene, gene_bin in zip(filtered_h2cis['Gene'], gene_bins):
        gene_gset_dict[gene].append('Cis_herit_bin_{}'.format(gene_bin + 1))

    # remaining gene sets
    for k, v in gsets.items():
        temp_genes = [x for x in v if x in filtered_gene_indices]
        temp_herit = filtered_h2cis.iloc[[filtered_gene_indices[x] for x in temp_genes], [0, 1, 2]]
        gene_bins = pd.qcut(temp_herit['h2cis'], args.num_gene_bins,
                            labels=gene_bin_ids).astype(int).tolist()  # bin first, then subset chr
        temp_combined_herit = pd.DataFrame(np.c_[temp_herit, gene_bins])
        temp_combined_herit[1] = temp_combined_herit[1].astype(int)
        temp_combined_herit[2] = temp_combined_herit[2].astype(float)
        temp_combined_herit[3] = temp_combined_herit[3].astype(int)
        temp_combined_herit = temp_combined_herit[temp_combined_herit[1] == args.chr]  # subset chr

        # sometimes for small gene sets, bins will contain no genes for individual chromosomes;
        # those bins get an average of 0
        temp_h2cis = (temp_combined_herit.groupby(3)[2].mean()
                      .reindex(gene_bin_ids, fill_value=0).values)
        ave_h2cis.extend(temp_h2cis)
        for gene, gene_bin in zip(temp_combined_herit[0].values, temp_combined_herit[3].values):
            gene_gset_dict[gene].append('{}_Cis_herit_bin_{}'.format(k, gene_bin + 1))
    gset_indices = dict(zip(gset_names, range(len(gset_names))))
    filtered_h2cis = filtered_h2cis[filtered_h2cis['Gene'].isin(lasso['GENE'])]  # finally retain just genes on input chr

    g_annot = []
    glist = []
    eqtl_annot = np.zeros((len(bim), len(gset_names)))

    # create eQTL annot (for expscore) and gene annot
    print('Combining eQTL weights')
    for i in range(len(filtered_h2cis)):
        # the .hsq frame is addressed by position: column 0 = gene, column 2 = h2cis
        gene = filtered_h2cis.iloc[i, 0]
        temp_h2cis = filtered_h2cis.iloc[i, 2]
        temp_lasso = lasso[lasso['GENE'] == gene]
        if len(temp_lasso) == 0:
            continue
        if gene not in gene_gset_dict:
            g_annot.append(np.zeros(len(gset_names)))
        else:
            snp_idx = [snp_indices[x] for x in temp_lasso['SNP'].tolist()]
            temp_lasso_weights = temp_lasso['EFFECT'].values
            emp_herit = np.sum(np.square(temp_lasso_weights))
            if emp_herit <= 0:  # scale eQTL weights to h2cis
                bias = 0
            else:
                bias = np.sqrt(temp_h2cis / emp_herit)
            # new array instead of in-place *=, so the frame's data is never touched
            squared_weights = np.square(temp_lasso_weights * bias)
            temp_gset_indices = [gset_indices[x] for x in gene_gset_dict[gene]]
            for gset in temp_gset_indices:
                eqtl_annot[snp_idx, gset] += squared_weights
            g_annot_toadd = np.zeros(len(gset_names))
            g_annot_toadd[temp_gset_indices] = 1
            g_annot.append(g_annot_toadd)
        glist.append(gene)

    g_annot = np.array(g_annot).astype(int)
    g_annot_final = pd.DataFrame(np.c_[glist, g_annot])
    g_annot_final.columns = ['Gene'] + gset_names

    print('Computing expression scores')
    # load genotypes
    array_indivs = ps.PlinkFAMFile(geno_fname + '.fam')
    array_snps = ps.PlinkBIMFile(geno_fname + '.bim')
    keep_snps_indices = np.where((array_snps.df['CHR'] == args.chr).values & array_snps.df['SNP'].isin(keep_snps[0]).values)[0]

    with Suppressor():
        geno_array = ld.PlinkBEDFile(geno_fname + '.bed', array_indivs.n, array_snps,
                                     keep_snps=keep_snps_indices)

    block_left = ld.getBlockLefts(geno_array.df[:, 2], 1e6)

    # estimate expression scores
    res = geno_array.ldScoreVarBlocks(block_left, c=50, annot=eqtl_annot)
    expscore = pd.concat([
        pd.DataFrame(geno_array.df[:, :3]),
        pd.DataFrame(res)], axis=1)
    expscore.columns = geno_array.colnames[:3] + gset_names

    # output files
    G = np.sum(g_annot, axis=0)  # number of genes per bin

    if args.split_output:
        np.savetxt('{}.Base.{}.G'.format(args.out, args.chr),
                   G[:args.num_background_bins].reshape((1, args.num_background_bins)), fmt='%d')
        np.savetxt('{}.Base.{}.ave_h2cis'.format(args.out, args.chr),
                   np.array(ave_h2cis[:args.num_background_bins]).reshape((1, args.num_background_bins)),
                   fmt="%.5f")
        g_annot_final.iloc[:, :args.num_background_bins + 1].to_csv('{}.Base.{}.gannot'.format(args.out, args.chr),
                                                                    sep='\t', index=False)
        expscore.iloc[:, :args.num_background_bins + 3].to_csv('{}.Base.{}.expscore'.format(args.out, args.chr),
                                                               sep='\t', index=False,
                                                               float_format='%.5f')
        # i is the offset of each gene set's first bin within gset_names
        for i in range(args.num_background_bins, args.num_background_bins + len(gsets) * args.num_gene_bins,
                       args.num_gene_bins):
            gset_name = re.sub('_Cis_herit_bin_1', '', g_annot_final.columns[1 + i])
            np.savetxt('{}.{}.{}.G'.format(args.out, gset_name, args.chr),
                       G[i:i + args.num_gene_bins].reshape((1, args.num_gene_bins)), fmt='%d')
            np.savetxt('{}.{}.{}.ave_h2cis'.format(args.out, gset_name, args.chr),
                       np.array(ave_h2cis[i:i + args.num_gene_bins]).reshape((1, args.num_gene_bins)),
                       fmt="%.5f")
            # list(range(...)) keeps the column selection valid on Python 3
            g_annot_final.iloc[:, [0] + list(range(i + 1, i + 1 + args.num_gene_bins))].to_csv(
                '{}.{}.{}.gannot'.format(args.out, gset_name, args.chr),
                sep='\t', index=False)
            expscore.iloc[:, list(range(0, 3)) + list(range(i + 3, i + 3 + args.num_gene_bins))].to_csv(
                '{}.{}.{}.expscore'.format(args.out, gset_name, args.chr),
                sep='\t', index=False,
                float_format='%.5f')

    else:
        np.savetxt('{}.{}.G'.format(args.out, args.chr), G.reshape((1, len(G))), fmt='%d')
        np.savetxt('{}.{}.ave_h2cis'.format(args.out, args.chr), np.array(ave_h2cis).reshape((1, len(ave_h2cis))),
                   fmt="%.5f")
        g_annot_final.to_csv('{}.{}.gannot'.format(args.out, args.chr), sep='\t', index=False)
        expscore.to_csv('{}.{}.expscore'.format(args.out, args.chr), sep='\t', index=False,
                        float_format='%.5f')
    print('Done!')
Пример #3
0
def meta_analyze(args):
    '''
    Meta-analyze weights

    For chromosome args.chr, averages the per-condition REML h2cis estimates of each gene
    (simple mean over the conditions that report the gene), bins genes with a positive
    meta h2cis into args.num_bins quantile bins, rescales each condition's '.lasso' EFFECT
    weights so their squares sum to the meta h2cis, and accumulates the squared weights
    into per-bin SNP annotations used to compute expression scores.

    Reads from args: input_prefixes, chr, genes, keep, bfile, num_bins, out.

    Writes '<out>.hsq', '<out>.<chr>.gannot.gz', '<out>.<chr>.G', '<out>.<chr>.ave_h2cis'
    and '<out>.<chr>.expscore.gz'. Returns None.
    '''
    input_prefixes = read_file_line(args.input_prefixes)
    input_prefixes_name = ['{}.{}'.format(x, args.chr) for x in input_prefixes]
    genes = get_gene_list(input_prefixes_name)
    if args.genes:
        # set for O(1) membership; assumes read_file_line yields hashable entries (strings)
        keep_genes = set(read_file_line(args.genes))
        genes = [x for x in genes if x in keep_genes]
    # gene indices for fast merging
    gene_indices = dict(zip(genes, range(len(genes))))
    num = np.zeros(len(genes))
    count = np.zeros(len(genes))
    lasso_frames = []

    # meta-analyze REML h2cis estimates by taking simple average
    # inverse-variance weighing has issues, since REML SE is downwardly biased for small h2 estimates
    for prefix in input_prefixes:
        cond = os.path.basename(prefix)
        lasso = pd.read_csv('{}.{}.lasso'.format(prefix, args.chr), sep='\t')
        lasso['COND'] = cond
        lasso_frames.append(lasso)
        reml = pd.read_csv('{}.{}.hsq'.format(prefix, args.chr), sep='\t')
        reml = reml.dropna()
        reml = reml[reml['Gene'].isin(genes)]
        gene_idx = [gene_indices[x] for x in reml['Gene'].tolist()]
        num[gene_idx] += reml['h2cis'].values
        count[gene_idx] += 1
    # single concat instead of repeated DataFrame.append (unavailable in recent pandas)
    all_lasso = pd.concat(lasso_frames) if lasso_frames else pd.DataFrame()

    count[count == 0] = np.nan  # genes seen in no condition get a NaN average
    meta_h2cis = num / count
    meta_h2cis_out = pd.DataFrame({
        'Gene': genes,
        'Chrom': args.chr,
        'h2cis': meta_h2cis
    })
    meta_h2cis_out.to_csv(args.out + '.hsq', sep='\t', index=False)

    keep_snps = pd.read_csv(args.keep, header=None)
    geno_fname = args.bfile
    bim = pd.read_csv(geno_fname + '.bim', sep='\t', header=None)  # load bim
    bim = bim.loc[(bim[0] == args.chr).values
                  & bim[1].isin(keep_snps[0]).values, 0:3]
    bim.columns = ['CHR', 'SNP', 'CM', 'BP']

    # create gene membership dict
    snp_indices = dict(zip(bim['SNP'].tolist(),
                           range(len(bim))))  # SNP indices for fast merging
    filtered_meta_h2cis = meta_h2cis_out[meta_h2cis_out['h2cis'] >
                                         0]  # filter out genes w/h2cis < 0
    # explicit copy so adding the 'Bin' column below does not write into a slice
    filtered_meta_h2cis = filtered_meta_h2cis[
        ~np.isnan(filtered_meta_h2cis['h2cis'])].copy()
    gene_bins = pd.qcut(filtered_meta_h2cis['h2cis'],
                        args.num_bins,
                        labels=list(range(args.num_bins))).astype(int)
    filtered_meta_h2cis['Bin'] = gene_bins
    # Columns are addressed by name: the positional order of a DataFrame built from a
    # dict is not guaranteed to be the same across pandas/Python versions.
    g_annot_meta = dict(zip(filtered_meta_h2cis['Gene'], gene_bins))  # gene membership in bins

    g_annot = []
    g_annot_names = []
    eqtl_annot = np.zeros((len(bim), args.num_bins))

    # create eQTL annot (for expscore) and gene annot
    print('Combining eQTL weights')
    for gene, temp_h2cis in zip(filtered_meta_h2cis['Gene'],
                                filtered_meta_h2cis['h2cis']):
        gene_bin = g_annot_meta[gene]
        temp_lasso = all_lasso[all_lasso['GENE'] == gene]
        for temp_cond in pd.unique(temp_lasso['COND']):  # for each condition
            temp_temp_lasso = temp_lasso[temp_lasso['COND'] == temp_cond]
            snp_idx = [snp_indices[x] for x in temp_temp_lasso['SNP'].tolist()]
            temp_lasso_weights = temp_temp_lasso['EFFECT'].values
            emp_herit = np.sum(np.square(temp_lasso_weights))
            if emp_herit <= 0:  # scale eQTL weights to meta-tissue h2cis
                bias = 0
            else:
                bias = np.sqrt(temp_h2cis / emp_herit)
            # new array instead of in-place *=, so the frame's data is never touched
            eqtl_annot[snp_idx, gene_bin] += np.square(temp_lasso_weights * bias)
            g_annot_toadd = np.zeros(args.num_bins)
            g_annot_toadd[gene_bin] = 1
            g_annot.append(g_annot_toadd)
            g_annot_names.append(gene + '_' + temp_cond)

    g_annot = np.array(g_annot).astype(int)
    g_annot_final = pd.DataFrame(np.c_[g_annot_names, g_annot])
    g_bin_names = [
        'Cis_herit_bin_{}'.format(x) for x in range(1, args.num_bins + 1)
    ]
    g_annot_final.columns = ['Gene'] + g_bin_names
    g_annot_final.to_csv('{}.{}.gannot.gz'.format(args.out, args.chr),
                         sep='\t',
                         index=False,
                         compression='gzip')

    G = np.sum(g_annot, axis=0)  # number of gene/condition rows per bin
    # average only the numeric h2cis column; the frame also holds the gene names
    ave_cis_herit = filtered_meta_h2cis.groupby('Bin')['h2cis'].mean().values

    np.savetxt('{}.{}.G'.format(args.out, args.chr),
               G.reshape((1, len(G))),
               fmt='%d')
    np.savetxt('{}.{}.ave_h2cis'.format(args.out, args.chr),
               ave_cis_herit.reshape((1, len(ave_cis_herit))),
               fmt="%.5f")

    print('Computing expression scores')
    # load genotypes
    array_indivs = ps.PlinkFAMFile(geno_fname + '.fam')
    array_snps = ps.PlinkBIMFile(geno_fname + '.bim')
    keep_snps_indices = np.where(
        (array_snps.df['CHR'] == args.chr).values
        & array_snps.df['SNP'].isin(keep_snps[0]).values)[0]

    with Suppressor():
        geno_array = ld.PlinkBEDFile(geno_fname + '.bed',
                                     array_indivs.n,
                                     array_snps,
                                     keep_snps=keep_snps_indices)

    block_left = ld.getBlockLefts(geno_array.df[:, 2], 1e6)

    # estimate expression scores
    res = geno_array.ldScoreVarBlocks(block_left, c=50, annot=eqtl_annot)
    expscore = pd.concat(
        [pd.DataFrame(geno_array.df[:, :3]),
         pd.DataFrame(res)], axis=1)
    expscore.columns = geno_array.colnames[:3] + g_bin_names

    # output files
    expscore.to_csv('{}.{}.expscore.gz'.format(args.out, args.chr),
                    sep='\t',
                    index=False,
                    compression='gzip',
                    float_format='%.5f')
    print('Done!')
Пример #4
0
def create_gset_expscore_meta_batch(args):
    '''
    Create gene set expression scores meta-analyzed over several tissues/conditions. Analyze gene sets in batches.

    Per-gene h2cis is the simple mean over the conditions listed in args.input_prefix_meta,
    using the '.hsq' files of all chromosomes. Genes with positive meta h2cis are split into
    args.num_background_bins quantile bins (background) and, within each gene set, into
    args.num_gene_bins quantile bins; results are restricted to args.chr. Gene sets are
    handed to batch_expscore in groups of args.batch_size.

    Reads from args: input_prefix_meta, gene_sets, gset_start, gset_end, chr, keep, bfile,
    genes, num_background_bins, num_gene_bins, batch_size, split_output, out.

    With args.split_output, writes '.G', '.ave_h2cis', '.gannot' and '.expscore' files for
    'Base' and for every gene set. Otherwise writes per-batch '.gannot.batchN' /
    '.expscore.batchN' files, merges them column-wise with `paste` into
    '<out>.<chr>.gannot' / '<out>.<chr>.expscore', deletes the batch files, and writes
    '<out>.<chr>.G' and '<out>.<chr>.ave_h2cis'. Returns None.
    '''
    input_prefixes = read_file_line(args.input_prefix_meta)
    genes = get_gene_list(input_prefixes)
    gsets = read_gene_sets(args.gene_sets, args.gset_start, args.gset_end)

    # gene indices for fast merging
    gene_indices = dict(zip(genes['Gene'].tolist(), range(len(genes))))
    num = np.zeros(len(genes))
    count = np.zeros(len(genes))
    lasso_frames = []

    print('Reading eQTL weights')
    # meta-analyze REML h2cis estimates by taking simple average
    # inverse-variance weighing has issues, since REML SE is downwardly biased for small h2 estimates
    for prefix in input_prefixes:
        cond = os.path.basename(prefix)
        lasso = pd.read_csv('{}.{}.lasso'.format(prefix, args.chr), sep='\t')
        lasso['COND'] = cond
        lasso_frames.append(lasso)
        # single concat instead of repeated DataFrame.append (unavailable in recent pandas)
        all_reml = pd.concat([
            pd.read_csv('{}.{}.hsq'.format(prefix, chrom), sep='\t').dropna()
            for chrom in range(1, N_CHR + 1)
        ])
        gene_idx = [gene_indices[x] for x in all_reml['Gene'].tolist()]
        num[gene_idx] += all_reml['h2cis'].values
        count[gene_idx] += 1
    all_lasso = pd.concat(lasso_frames) if lasso_frames else pd.DataFrame()

    count[count == 0] = np.nan  # genes seen in no condition get a NaN average
    meta_h2cis = num / count
    meta_h2cis_out = pd.DataFrame({'Gene': genes['Gene'],
                                   'Chrom': genes['Chrom'],
                                   'h2cis': meta_h2cis}, columns=['Gene', 'Chrom', 'h2cis'])

    keep_snps = pd.read_csv(args.keep, header=None)
    geno_fname = args.bfile
    bim = pd.read_csv(geno_fname + '.bim', sep='\t', header=None)  # load bim
    bim = bim.loc[(bim[0] == args.chr).values & bim[1].isin(keep_snps[0]).values, 0:3]
    bim.columns = ['CHR', 'SNP', 'CM', 'BP']

    # keep genes with positive h2cis and converged LASSO
    snp_indices = dict(zip(bim['SNP'].tolist(), range(len(bim))))  # SNP indices for fast merging
    filtered_h2cis = meta_h2cis_out[meta_h2cis_out['h2cis'] > 0]  # filter out genes w/h2cis < 0
    filtered_h2cis = filtered_h2cis[~np.isnan(filtered_h2cis['h2cis'])]
    if args.genes:
        keep_genes = read_file_line(args.genes)
        filtered_h2cis = filtered_h2cis[filtered_h2cis['Gene'].isin(keep_genes)]
    # retain genes across all chromosome for binning
    filtered_gene_indices = dict(zip(filtered_h2cis['Gene'].tolist(), range(len(filtered_h2cis))))
    chr_filtered_h2cis = filtered_h2cis[filtered_h2cis['Gene'].isin(all_lasso['GENE'])]  # finally retain just genes on input chr

    # load genotypes
    array_indivs = ps.PlinkFAMFile(geno_fname + '.fam')
    array_snps = ps.PlinkBIMFile(geno_fname + '.bim')
    keep_snps_indices = \
        np.where((array_snps.df['CHR'] == args.chr).values & array_snps.df['SNP'].isin(keep_snps[0]).values)[0]

    with Suppressor():
        geno_array = ld.PlinkBEDFile(geno_fname + '.bed', array_indivs.n, array_snps,
                                     keep_snps=keep_snps_indices)

    block_left = ld.getBlockLefts(geno_array.df[:, 2], 1e6)

    background_bin_ids = list(range(args.num_background_bins))
    gene_bin_ids = list(range(args.num_gene_bins))

    print('Computing expression scores for background gene sets')
    # analyze background gset
    gset_names = ['Cis_herit_bin_{}'.format(x + 1) for x in background_bin_ids]
    gset_indices = dict(zip(gset_names, range(len(gset_names))))

    # create dict indicating gene membership in each gene set
    all_ave_h2cis = []  # compute average cis-heritability of genes in bin
    all_G = []
    gene_gset_dict = defaultdict(list)
    # background gene set
    gene_bins = pd.qcut(filtered_h2cis['h2cis'], args.num_background_bins,
                        labels=background_bin_ids).astype(int).tolist()
    # positional columns of the combined frame: 0 = Gene, 1 = Chrom, 2 = h2cis, 3 = bin
    temp_combined_herit = pd.DataFrame(np.c_[filtered_h2cis[['Gene', 'Chrom', 'h2cis']], gene_bins])
    temp_combined_herit[1] = temp_combined_herit[1].astype(int)
    temp_combined_herit[2] = temp_combined_herit[2].astype(float)
    temp_combined_herit[3] = temp_combined_herit[3].astype(int)
    temp_combined_herit = temp_combined_herit[temp_combined_herit[1] == args.chr]
    # Report one average per bin, using 0 for bins with no gene on this chromosome, so
    # the averages always line up with gset_names (same convention as the gene-set bins).
    temp_h2cis = (temp_combined_herit.groupby(3)[2].mean()
                  .reindex(background_bin_ids, fill_value=0).values)
    all_ave_h2cis.extend(temp_h2cis)
    for gene, gene_bin in zip(filtered_h2cis['Gene'], gene_bins):
        gene_gset_dict[gene].append('Cis_herit_bin_{}'.format(gene_bin + 1))

    # compute expression scores
    G, g_annot_final, expscore = batch_expscore(gene_gset_dict, gset_names, chr_filtered_h2cis, all_lasso, bim, snp_indices, gset_indices,
                                                geno_array, block_left)
    all_G.extend(G)

    if args.split_output:
        np.savetxt('{}.Base.{}.G'.format(args.out, args.chr), G.reshape((1, len(G))), fmt='%d')
        np.savetxt('{}.Base.{}.ave_h2cis'.format(args.out, args.chr), np.array(all_ave_h2cis).reshape((1, len(all_ave_h2cis))),
                   fmt="%.5f")
        g_annot_final.to_csv('{}.Base.{}.gannot'.format(args.out, args.chr), sep='\t', index=False)
        expscore.to_csv('{}.Base.{}.expscore'.format(args.out, args.chr), sep='\t', index=False,
                        float_format='%.5f')
    else:
        g_annot_final.to_csv('{}.{}.gannot.batch0'.format(args.out, args.chr), sep='\t', index=False)
        expscore.to_csv('{}.{}.expscore.batch0'.format(args.out, args.chr), sep='\t', index=False, float_format='%.5f')

    # remaining gene sets
    n_batches = 0
    gset_items = list(gsets.items())  # materialized so it can be sliced on Python 3 as well
    for batch_start in range(0, len(gsets), args.batch_size):
        n_batches += 1
        print('Computing expression scores for gene sets {} to {} (out of {} total)'.format(batch_start+1, min(batch_start+args.batch_size, len(gsets)), len(gsets)))
        temp_gsets = OrderedDict(gset_items[batch_start:(batch_start+args.batch_size)])
        rest_gset_names = []
        rest_gene_gset_dict = defaultdict(list)
        # Averages for THIS batch only. Keeping one list across batches would re-add
        # earlier batches to all_ave_h2cis and make the per-gene-set slices below read
        # the first batch's values.
        ave_h2cis = []

        for k in temp_gsets:
            rest_gset_names.extend(['{}_Cis_herit_bin_{}'.format(k, x + 1) for x in gene_bin_ids])

        gset_indices = dict(zip(rest_gset_names, range(len(rest_gset_names))))

        for k, v in temp_gsets.items():
            temp_genes = [x for x in v if x in filtered_gene_indices]
            temp_herit = filtered_h2cis.iloc[[filtered_gene_indices[x] for x in temp_genes], [0, 1, 2]]
            gene_bins = pd.qcut(temp_herit['h2cis'], args.num_gene_bins,
                                labels=gene_bin_ids).astype(int).tolist()  # bin first, then subset chr
            temp_combined_herit = pd.DataFrame(np.c_[temp_herit, gene_bins])
            temp_combined_herit[1] = temp_combined_herit[1].astype(int)
            temp_combined_herit[2] = temp_combined_herit[2].astype(float)
            temp_combined_herit[3] = temp_combined_herit[3].astype(int)
            temp_combined_herit = temp_combined_herit[temp_combined_herit[1] == args.chr]  # subset chr

            # sometimes for small gene sets, bins will contain no genes for individual chromosomes;
            # those bins get an average of 0
            temp_h2cis = (temp_combined_herit.groupby(3)[2].mean()
                          .reindex(gene_bin_ids, fill_value=0).values)
            ave_h2cis.extend(temp_h2cis)
            for gene, gene_bin in zip(temp_combined_herit[0].values, temp_combined_herit[3].values):
                rest_gene_gset_dict[gene].append('{}_Cis_herit_bin_{}'.format(k, gene_bin + 1))

        G, g_annot_final, expscore = batch_expscore(rest_gene_gset_dict, rest_gset_names, chr_filtered_h2cis, all_lasso, bim,
                                                    snp_indices, gset_indices, geno_array, block_left)
        all_G.extend(G)
        all_ave_h2cis.extend(ave_h2cis)
        if args.split_output:
            # col is the offset of each gene set's first bin within this batch
            for col in range(0, len(temp_gsets) * args.num_gene_bins, args.num_gene_bins):
                gset_name = re.sub('_Cis_herit_bin_1', '', g_annot_final.columns[1 + col])
                np.savetxt('{}.{}.{}.G'.format(args.out, gset_name, args.chr),
                           G[col:col + args.num_gene_bins].reshape((1, args.num_gene_bins)), fmt='%d')
                np.savetxt('{}.{}.{}.ave_h2cis'.format(args.out, gset_name, args.chr),
                           np.array(ave_h2cis[col:col + args.num_gene_bins]).reshape((1, args.num_gene_bins)),
                           fmt="%.5f")
                # list(range(...)) keeps the column selection valid on Python 3
                g_annot_final.iloc[:, [0] + list(range(col + 1, col + 1 + args.num_gene_bins))].to_csv(
                    '{}.{}.{}.gannot'.format(args.out, gset_name, args.chr),
                    sep='\t', index=False)
                expscore.iloc[:, list(range(0, 3)) + list(range(col + 3, col + 3 + args.num_gene_bins))].to_csv(
                    '{}.{}.{}.expscore'.format(args.out, gset_name, args.chr),
                    sep='\t', index=False,
                    float_format='%.5f')
        else:
            temp_g_annot_name = '{}.{}.gannot.batch{}'.format(args.out, args.chr, n_batches)
            temp_expscore_name = '{}.{}.expscore.batch{}'.format(args.out, args.chr, n_batches)

            # drop the leading identifier columns; batch0 already carries them
            g_annot_final = g_annot_final.iloc[:, 1:]
            expscore = expscore.iloc[:, 3:]
            g_annot_final.to_csv(temp_g_annot_name, sep='\t', index=False)
            expscore.to_csv(temp_expscore_name, sep='\t', index=False, float_format='%.5f')

    if not args.split_output:
        # Merge batch files column-wise with `paste`, then delete them. Arguments are
        # passed as a list (no shell) so paths with spaces or shell metacharacters work.
        for suffix in ('gannot', 'expscore'):
            batch_files = ['{}.{}.{}.batch{}'.format(args.out, args.chr, suffix, x)
                           for x in range(n_batches + 1)]
            with open('{}.{}.{}'.format(args.out, args.chr, suffix), 'w') as merged:
                subprocess.call(['paste'] + batch_files, stdout=merged)
            for fname in batch_files:
                os.remove(fname)

        np.savetxt('{}.{}.G'.format(args.out, args.chr), np.array(all_G).reshape((1, len(all_G))), fmt='%d')
        np.savetxt('{}.{}.ave_h2cis'.format(args.out, args.chr), np.array(all_ave_h2cis).reshape((1, len(all_ave_h2cis))),
                   fmt="%.5f")

    print('Done!')