def compute_statistic(self, alphahat, R, RA, N, Nref, memoize=False):
    """Return ``scaling * (betahat' RA betahat - bias)``.

    ``betahat`` is ``Radjreginv.dot(alphahat)``, where ``Radjreginv`` is the
    inverse of the ridge-regularized (and optionally adjusted) ``R``.

    Parameters
    ----------
    alphahat : object accepted by ``Radjreginv.dot`` (assumed to be the
        marginal effect estimates -- TODO confirm against caller).
    R, RA : matrix-like objects providing ``add_ridge``, ``dot``, ``inv``,
        ``trace`` and ``zero_outside_irs`` (as used below).
    N : divisor of the first bias term (``tr / N``).
    Nref : passed to ``R.adjusted_before_inversion`` when
        ``params.Radjust == 'before'`` or to ``inv(Nadjust_after=...)`` when
        ``params.Radjust == 'after'``.
    memoize : when true and ``self.bias`` already exists, the cached
        ``self.Radjreginv``, ``self.scaling`` and ``self.bias`` are reused
        instead of being recomputed.

    Side effects: sets ``self.Radjreginv``, ``self.scaling`` and
    ``self.bias`` whenever they are (re)computed, and prints progress.
    """
    radjust = self.params.Radjust
    # Only the 'after' mode forwards Nref to the inversion; only the 'before'
    # mode adjusts R ahead of it.  Any other value leaves R untouched.
    Nadjust_after = Nref if radjust == 'after' else None
    Radj = R.adjusted_before_inversion(Nref) if radjust == 'before' else R

    if self.params.RAreg:
        print('regularizing RA')
        RA = RA.add_ridge(self.params.Lambda, renormalize=True)
        A = SnpSubset(self.refpanel,
                      bedtool=GenomicSubset(self.params.region).bedtool)
        # The ridge is added everywhere, so re-zero everything outside A.
        RA.zero_outside_irs(A.irs)

    if not memoize or not hasattr(self, 'bias'):
        print('adding lambda')
        Radjreg = Radj.add_ridge(self.params.Lambda, renormalize=True)
        print('computing inverse')
        self.Radjreginv = Radjreg.inv(Nadjust_after=Nadjust_after)
        print('done.computing bias...')
        A = SnpSubset(self.refpanel,
                      bedtool=GenomicSubset(self.params.region).bedtool)
        # NOTE(review): the window was only consumed by a retired bias formula
        # that scaled the second term by (refpanel.M - len(W.irs)) / refpanel.M.
        # The call is preserved in case window() has side effects -- confirm
        # and drop.
        self.window(A)
        if not self.params.avgunbiased:
            tr = self.Radjreginv.dot(RA).trace()
            self.scaling = 1
        else:
            tr = RA.dot(self.Radjreginv).dot(R).dot(self.Radjreginv).trace()
            Q = R.dot(self.Radjreginv).dot(RA).dot(self.Radjreginv).dot(R)
            Q.zero_outside_irs(A.irs)
            self.scaling = A.num_snps() / Q.trace()
        self.bias = tr / N + \
            self.params.sigma2g * tr / self.params.pop_size
        print('\nbias =', self.bias)
        print('scaling =', self.scaling)

    betahat = self.Radjreginv.dot(alphahat)
    return self.scaling * (betahat.dot(RA.dot(betahat)) - self.bias)
def preprocess(self):
    """Write per-chromosome annotation files and submit LD-score jobs.

    When ``params.baseline`` is set and baseline preprocessing has not been
    flagged as in progress, the baseline model is created first.  Then, for
    every chromosome of the reference panel, an annotation file for
    ``params.region`` is written, and afterwards one ``ldsc.py --l2`` job per
    chromosome is submitted through ``bsub``.
    """
    needs_baseline = (self.params.baseline
                      and not self.baseline_preprocessing_in_progress())
    if needs_baseline:
        print('baseline model not found. creating...')
        self.declare_baseline_preprocessing_in_progress()
        self.create_baseline_model()

    print('submitting ld score jobs for annotation of interest')
    region_subset = GenomicSubset(self.params.region)

    # Pass 1: annotation files (all written before any job is submitted).
    for chrom in self.refpanel.chromosomes():
        chrom_data = Dataset(self.params.refpanel, chrnum=chrom)
        chrom_bedtool = region_subset.restricted_to_chrom_bedtool(chrom)
        chrom_snps = SnpSubset(chrom_data, chrom_bedtool)
        SnpSubset.print_subsets(
            self.annotation_filename(chrom),
            [chrom_snps],
            [self.params.region],
            add_other=True)

    # Pass 2: LD-score jobs.
    for chrom in self.refpanel.chromosomes():
        chrom_data = Dataset(self.params.refpanel, chrnum=chrom)
        command = ['python', '-u', paths.foreign + 'ldsc/ldsc.py', '--l2']
        command += ['--ld-wind-cm', str(self.params.ld_window / 1000.)]
        command += ['--bfile', chrom_data.genotypes_bedfile.filename]
        command += ['--annot', self.annotation_filename(chrom)]
        command += ['--out', self.annotation_l2_filestem(chrom)]
        print(' '.join(command))
        log_path = self.annotation_l2_filestem(chrom) + '.bsub_out'
        job = self.preprocessing_foldername() + ',chr=' + str(chrom)
        bsub.submit(command, log_path, jobname=job)
def create_baseline_model(self):
    """Write baseline annotation files and submit their LD-score jobs.

    One ``GenomicSubset`` is built per entry of
    ``LDSC.baseline_model_regions``.  For each reference-panel chromosome the
    subsets are restricted to that chromosome and written to
    ``baseline_filename(chrnum)``; then one ``ldsc.py --l2`` job per
    chromosome is submitted through ``bsub``.
    """
    region_subsets = []
    for region in LDSC.baseline_model_regions:
        region_subsets.append(GenomicSubset(region))

    # Pass 1: annotation files.
    for chrom in self.refpanel.chromosomes():
        print('creating baseline annot file for chr', chrom)
        chrom_data = Dataset(self.params.refpanel, chrnum=chrom)
        chrom_snps = []
        for subset in region_subsets:
            chrom_snps.append(
                SnpSubset(chrom_data,
                          subset.restricted_to_chrom_bedtool(chrom)))
        SnpSubset.print_subsets(self.baseline_filename(chrom), chrom_snps,
                                LDSC.baseline_model_regions)

    # Pass 2: LD-score jobs.
    for chrom in self.refpanel.chromosomes():
        chrom_data = Dataset(self.params.refpanel, chrnum=chrom)
        command = ['python', '-u', paths.foreign + 'ldsc/ldsc.py', '--l2']
        command += ['--ld-wind-cm', str(self.params.ld_window / 1000.)]
        command += ['--bfile', chrom_data.genotypes_bedfile.filename]
        command += ['--annot', self.baseline_filename(chrom)]
        command += ['--out', self.baseline_l2_filestem(chrom)]
        print(' '.join(command))
        log_path = self.baseline_l2_filestem(chrom) + '.bsub_out'
        bsub.submit(command, log_path, jobname='baseline,chr=' + str(chrom))
def preprocess(self, use_filesystem=True):
    """Compute (or load) R and its inverse, then derive RA, scalings, bias
    and variance matrices.

    Parameters
    ----------
    use_filesystem : when true, results are pickled (protocol 2) to the
        ``*_file`` locations and previously started covariance /
        inverse-covariance computations are loaded from disk instead of being
        recomputed.  When false, everything is recomputed and nothing is
        written.

    Side effects: sets ``self.R``, ``self.Rri``, ``self.A``, ``self.RA``,
    ``self.Z``, ``self.Q``, ``self.scalings``, ``self.ZR``, ``self.QZ`` and
    ``self.QZR``; prints progress with elapsed times.
    """
    def dump(obj, open_file):
        # Close the handle explicitly so the pickle is flushed to disk.
        with open_file(mode='wb') as handle:
            pickle.dump(obj, handle, 2)

    def load(open_file):
        # NOTE(review): pickle is only safe on files this pipeline wrote.
        with open_file() as handle:
            return pickle.load(handle)

    if not self.covariance_preprocessing_in_progress() or not use_filesystem:
        print('creating covariance matrix...')
        if use_filesystem:
            self.declare_covariance_preprocessing_in_progress()
        self.R = self.compute_covariance()
        if use_filesystem:
            dump(self.R, self.R_file)
    else:
        print('loading covariance matrix')
        self.R = load(self.R_file)

    if (not self.invcovariance_preprocessing_in_progress()
            or not use_filesystem):
        print('creating inverse covariance matrix')
        if use_filesystem:
            self.declare_invcovariance_preprocessing_in_progress()
        self.Rri = self.compute_invcovariance()
        if use_filesystem:
            dump(self.Rri, self.Rri_file)
    else:
        print('loading inverse covariance matrix')
        self.Rri = load(self.Rri_file)

    # Elapsed times below are measured from here, i.e. they exclude the
    # covariance / inverse-covariance steps above.
    t0 = time.time()
    print(time.time() - t0, ': creating and saving RA')
    self.A = SnpSubset(self.refpanel,
                       GenomicSubset(self.params.region).bedtool)
    self.RA = self.R.copy()
    self.RA.zero_outside_irs(self.A.irs)
    if use_filesystem:
        dump(self.RA, self.RA_file)

    print(time.time() - t0, ': computing and saving scaling')
    self.Z = self.Rri.dot(self.RA.dot(self.Rri))
    self.Q = self.R.dot(self.Z).dot(self.R)
    QA = self.Q.copy()
    QA.zero_outside_irs(self.A.irs)
    # Per range: (number of A's SNPs in the range) / trace of Q restricted
    # to A on that range.
    self.scalings = {
        r: len(self.A.irs & IntRangeSet(r)) / np.trace(QA.ranges_to_arrays[r])
        for r in QA.ranges()
    }
    print(time.time() - t0, ': scalings are', self.scalings)
    if use_filesystem:
        self.set_scalings(self.scalings)

    print(time.time() - t0, ': computing and saving bias matrix')
    self.ZR = self.RA.dot(self.Rri).dot(self.R).dot(self.Rri)
    if use_filesystem:
        dump(self.ZR, self.biasmatrix_file)

    print(time.time() - t0, ': variance matrices')
    self.QZ = self.Q.dot(self.Z)
    self.QZR = self.QZ.dot(self.R)
    if use_filesystem:
        self.save_variance_matrices(self.Q, self.Z, self.QZ, self.QZR)
    print(time.time() - t0, ': done')
def init(self):
    """Load the preprocessed matrices from disk onto ``self``.

    Sets ``self.Rri``, ``self.R``, ``self.RA`` and ``self.ZR`` from their
    pickle files, rebuilds ``self.A`` from ``params.region``, and fetches
    ``self.Q``, ``self.Z``, ``self.QZ`` and ``self.QZR`` through
    ``get_variance_matrices()``.
    """
    def load(open_file):
        # Close each handle as soon as its pickle has been read.
        # NOTE(review): pickle is only safe on files this pipeline wrote.
        with open_file() as handle:
            return pickle.load(handle)

    self.Rri = load(self.Rri_file)
    self.R = load(self.R_file)
    self.RA = load(self.RA_file)
    self.A = SnpSubset(self.refpanel,
                       GenomicSubset(self.params.region).bedtool)
    self.ZR = load(self.biasmatrix_file)
    self.Q, self.Z, self.QZ, self.QZR = self.get_variance_matrices()
def chunks_containing_region(self):
    """Return the chunk index of every block that contains SNPs of the region.

    Blocks come from partitioning the reference panel at the breakpoints in
    ``params.breakpointsfile`` (with ``remove_mhc=True``).  Each containing
    block index ``i`` is mapped to ``int(i / chunk_size(block ranges))``.
    Also sets ``self.A`` to the region's ``SnpSubset``.
    """
    breakpoint_bed = BedTool(paths.reference + self.params.breakpointsfile)
    partition = SnpPartition(self.refpanel, breakpoint_bed, remove_mhc=True)
    region_bedtool = GenomicSubset(self.params.region).bedtool
    self.A = SnpSubset(self.refpanel, region_bedtool)

    chunk_indices = []
    for block_index in partition.indices_containing(self.A.irs):
        size = self.chunk_size(partition.ranges())
        chunk_indices.append(int(block_index / size))
    return chunk_indices
def preprocess(self):
    """Compute the LD matrix over the region's SNPs and pickle it as RA.

    The matrix is built by ``BlockDiag.ld_matrix`` over the ranges of the
    ``SnpSubset`` for ``params.region`` with bandwidth
    ``params.ld_bandwidth / 1000.`` and written with pickle protocol 2 to
    ``RA_file``.
    """
    # Select the non-interactive backend before any plotting code can run.
    matplotlib.use('Agg')
    gs = GenomicSubset(self.params.region)
    ss = SnpSubset(self.refpanel, bedtool=gs.bedtool)
    RA = BlockDiag.ld_matrix(self.refpanel, ss.irs.ranges(),
                             self.params.ld_bandwidth / 1000.)
    # Plotting RA (to RA_plotfilename()) is currently disabled.  If it is
    # re-enabled, guard it with `except Exception` so a plotting failure
    # cannot prevent the matrix from being saved.
    with self.RA_file(mode='wb') as outfile:
        pickle.dump(RA, outfile, 2)
def preprocess(self):
    """Compute R over the expanded region and RA (R zeroed outside A).

    ``A`` is the ``SnpSubset`` for ``params.region``; ``W`` is ``A`` expanded
    by ``params.ld_window / 1000.``.  ``R`` is the LD matrix over ``W``'s
    ranges with a band of 300 SNPs.  Both matrices are pickled (protocol 2)
    to ``R_file`` and ``RA_file`` respectively.
    """
    # Select the non-interactive backend before any plotting code can run.
    matplotlib.use('Agg')
    gs = GenomicSubset(self.params.region)
    A = SnpSubset(self.refpanel, bedtool=gs.bedtool)
    W = A.expanded_by(self.params.ld_window / 1000.)
    R = BlockDiag.ld_matrix(self.refpanel, W.irs.ranges(), 300,
                            band_units='SNPs')
    with self.R_file(mode='wb') as outfile:
        pickle.dump(R, outfile, 2)

    # Other call sites use zero_outside_irs() as an in-place mutator and
    # ignore its return value, so the result may be None.  R is already on
    # disk, so it is safe to reuse it as RA when the call mutated in place.
    zeroed = R.zero_outside_irs(A.irs)
    RA = R if zeroed is None else zeroed
    with self.RA_file(mode='wb') as outfile:
        pickle.dump(RA, outfile, 2)
def preprocess(self):
    """Compute R over the window around the region and RA (R zeroed outside A).

    ``A`` is the ``SnpSubset`` for ``params.region`` and ``W = self.window(A)``.
    ``R`` is the LD matrix over ``W``'s ranges with a bandwidth of 1000000
    (intended as effectively unbounded).  Both matrices are pickled
    (protocol 2) to ``R_file`` and ``RA_file`` respectively.
    """
    # Select the non-interactive backend before any plotting code can run.
    matplotlib.use('Agg')
    gs = GenomicSubset(self.params.region)
    A = SnpSubset(self.refpanel, bedtool=gs.bedtool)
    W = self.window(A)
    R = BlockDiag.ld_matrix(self.refpanel, W.irs.ranges(),
                            1000000)  # bandwidth=infty
    with self.R_file(mode='wb') as outfile:
        pickle.dump(R, outfile, 2)
    # Plotting R (to R_plotfilename()) is currently disabled.  If it is
    # re-enabled, guard it with `except Exception` so a plotting failure
    # cannot prevent RA from being saved.

    # Other call sites use zero_outside_irs() as an in-place mutator and
    # ignore its return value, so the result may be None.  R is already on
    # disk, so it is safe to reuse it as RA when the call mutated in place.
    zeroed = R.zero_outside_irs(A.irs)
    RA = R if zeroed is None else zeroed
    with self.RA_file(mode='wb') as outfile:
        pickle.dump(RA, outfile, 2)
def create_annot(args):
    """Write one ``<name>.<chr>.annot.gz`` file per autosome next to a bed file.

    Parameters
    ----------
    args : object with ``bedfile`` (path to the bed file; a trailing ``.bed``
        is stripped to form the annotation name) and ``refpanel`` (prefix to
        which ``'.' + chrnum`` is appended to name each per-chromosome
        dataset).

    Chromosomes are processed from 22 down to 1.
    """
    import os

    directory, filename = os.path.split(args.bedfile)
    # os.path.join(x, '') appends exactly one separator, and yields '' for a
    # bare filename so output lands in the working directory rather than in
    # the filesystem root (which naive '/'.join(...) + '/' produced).
    path = os.path.join(directory, '')
    name = filename[:-4] if filename.endswith('.bed') else filename

    gs = GenomicSubset(name, path=path)
    for chrnum in range(22, 0, -1):
        print('creating annot file for chr', chrnum)
        d = Dataset(args.refpanel + '.' + str(chrnum))
        sss = [SnpSubset(d, gs.restricted_to_chrom_bedtool(chrnum))]
        SnpSubset.print_subsets(
            '{}{}.{}.annot.gz'.format(path, name, chrnum), sss, [name])
def run(self, beta_num, sim):
    """Run the Hsq regression on every sumstats file of one simulated beta.

    For each array yielded by ``sim.sumstats_files(beta_num)`` the values are
    squared and multiplied by the dataset's ``N``, then passed to
    ``ldsc.ldscore.regressions.Hsq`` (with ``intercept=1`` when
    ``params.constrain_intercept`` is set).  Returns a two-column array:
    column 0 is ``coef . overlaps``, column 1 is
    ``overlaps . coef_cov . overlaps``, one row per sumstats file.
    """
    print('loading data set and region info')
    dataset = Dataset(sim.dataset)
    region = GenomicSubset(self.params.region)
    # Constructed as in the original flow; the result is not used further.
    SnpSubset(dataset, bedtool=region.bedtool)

    print('loading ld score info')
    ref_ldscores, w_ld, M_annot = self.ld_score_info()
    N = np.ones((dataset.M, 1)) * dataset.N
    shape_report = ('ref_ldscores shape:{}\nw_ld shape:{}\nN shape:{}\n'
                    'M_annot shape:{}')
    print(shape_report.format(ref_ldscores.shape, w_ld.shape, N.shape,
                              M_annot.shape))

    overlaps = self.overlap_vector()
    print('num snps overlapping with each category:', overlaps)

    estimates, estimate_vars = [], []
    for sumstats in sim.sumstats_files(beta_num):
        scaled_sq = dataset.N * sumstats**2
        # Pin the intercept only when requested; otherwise use Hsq's default.
        extra = {'intercept': 1} if self.params.constrain_intercept else {}
        fit = ldsc.ldscore.regressions.Hsq(
            scaled_sq.reshape((dataset.M, 1)), ref_ldscores, w_ld, N,
            M_annot, **extra)
        estimates.append(fit.coef.dot(overlaps))
        estimate_vars.append(overlaps.dot(fit.coef_cov).dot(overlaps))
        print('intercept:', fit.intercept)
        print(len(estimates), estimates[-1], estimate_vars[-1])

    estimate_col = np.array([estimates]).T
    variance_col = np.array([estimate_vars]).T
    return np.concatenate([estimate_col, variance_col], axis=1)
# Load the region A, the LD blocks overlapping it, the reference-panel .bim
# table and the summary statistics, then merge sumstats onto the panel one
# overlapping block at a time.
A = GenomicSubset(args.region)
print('loading ld blocks')
blocks = BedTool(paths.reference + args.ldblocks)
print('finding ld blocks that overlap with A')
# wa=True keeps the original block records rather than the intersections.
relevant_blocks = blocks.intersect(A.bedtool, wa=True).saveas()
print('found', len(relevant_blocks), 'blocks that overlap with A')

print('reading refpanel bim')
refpanel_bim = pd.read_csv(refpanel.genotypes_bedfile.filename + '.bim',
                           sep='\t',
                           names=['CHR', 'SNP', 'cM', 'BP', 'A1', 'A2'])
refpanel_bim['INDEX'] = np.arange(len(refpanel_bim))
# Indicator of membership in A: start at 0 so that only A's SNPs end up 1
# (initialising to 1 made the column constant).  `.loc` replaces the removed
# `.ix`; the frame keeps its default integer index, so labels == positions.
refpanel_bim['A'] = 0
refpanel_bim.loc[SnpSubset(refpanel, A.bedtool).irs, 'A'] = 1

print('reading sumstats')
sumstats = pd.read_csv(args.sumstats_path + '.gz',
                       header=0,
                       sep='\t',
                       compression='gzip')

for block in relevant_blocks:
    window_ref = SnpSubset(refpanel, BedTool([block]))
    refpanel_bim_w = refpanel_bim.loc[window_ref.irs]
    print('merging')
    # Left merge: keep every reference-panel SNP of the block, with missing
    # values where no summary statistic exists for it.
    refpanel_with_sumstats = refpanel_bim_w.merge(sumstats,
                                                  how='left',
                                                  on=['SNP'])
if __name__ == '__main__':
    # Ad-hoc command-line check of BlockDiag on a small GERA-based dataset.
    # Usage: --M <number of SNPs> [-check_dense]
    from primitives import Dataset, GenomicSubset, SnpSubset
    import copy
    from time import time
    import argparse
    # Fixed seed so the random draws below are reproducible.
    np.random.seed(0)
    parser = argparse.ArgumentParser()
    parser.add_argument('--M',
                        type=int,
                        required=True,
                        help='the number of SNPs to use')
    parser.add_argument('-check_dense', action='store_true', default=False)
    args = parser.parse_args()

    d = Dataset('GERA', forced_M=args.M)
    indivs = d.random_indivs(200)
    # SNPs of the genomic subset named '50', and the same set expanded by 0.01.
    tiny_gs = GenomicSubset('50')
    tiny_ss = SnpSubset(d, bedtool=tiny_gs.bedtool)
    tiny_buffered_ss = tiny_ss.expanded_by(0.01)

    t0 = time()
    R = BlockDiag.ld_matrix(d,
                            tiny_buffered_ss.irs.ranges(),
                            0.01,
                            indivs=indivs)  # 1 cM bandwidth
    R = R.add_ridge(0.05, renormalize=True)
    # NOTE(review): the last printed value is an exact == comparison between
    # the trace and the SNP count, although the message only expects them to
    # be "close" -- it will likely print False; confirm whether a tolerance
    # was intended.
    print('trace of renormalized R should be close to M (with noise due to sample vs pop LD',
          R.trace(), tiny_buffered_ss.num_snps(),
          R.trace() == tiny_buffered_ss.num_snps())
    print('computing R took', time() - t0)
    print('shape of R is:', R.shape())

    # RA: copy of R with everything outside the unbuffered subset zeroed.
    RA = R.copy()
    RA.zero_outside_irs(tiny_ss.irs)
    # Random standard-normal vector of length d.M arranged on R's ranges.
    b = BlockDiag.from_big1darray(np.random.randn(d.M), R.ranges())