def add_covariance_for_range(r):
    print(r)
    range_size = r[1] - r[0]
    cov = np.zeros((range_size, range_size))
    range_genotypes = d.get_standardized_genotypes(r, indivs=indivs)

    def compute_cov_for_snp(m):
        end = d.buffer_around_snp(m, bandwidth, start=r[0], end=r[1],
                                  units=band_units)[1]
        window_start = m - r[0]
        window_end = end - r[0]
        window = range_genotypes[:, window_start:window_end]
        cov_to_snps_in_window = \
            range_genotypes[:, m - r[0]].T.dot(window) / range_genotypes.shape[0]
        cov_to_snps_in_window[0] /= 2  # halve the diagonal since we symmetrize later
        cov[m - r[0], window_start:window_end] = cov_to_snps_in_window

    # a for loop rather than map(), which is lazy in Python 3 and would never
    # actually call compute_cov_for_snp
    for m in it.show_progress(range(r[0], r[1])):
        compute_cov_for_snp(m)

    # symmetrize; the diagonal was halved above, so cov + cov.T restores it
    ranges_to_arrays[r] = cov + cov.T

    # make the allele coding of SNPs consistent with the other dataset
    flip = np.array(IntRangeSet(positions_to_flip) & IntRangeSet((r[0], r[1])),
                    dtype=int) - r[0]  # dtype required so an empty array can be used as an index
    ranges_to_arrays[r][flip] *= -1
    ranges_to_arrays[r][:, flip] *= -1
def get_high_ld_snps(subset, matrix):
    # collect every SNP outside `subset` whose squared correlation with some
    # SNP in `subset` exceeds the threshold
    result = IntRangeSet()
    for i in subset:
        snps = IntRangeSet(np.flatnonzero(matrix[i]**2 > args.R2_threshold))
        snps -= subset
        result += snps
    return result
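# A minimal, self-contained sketch (toy data, not from this codebase) of the
# iterative closure that get_high_ld_snps supports: starting from a seed
# subset, repeatedly absorb SNPs in high LD with it until nothing new appears.
# The threshold constant stands in for args.R2_threshold.
import numpy as np
from pysnptools.util import IntRangeSet

R2_THRESHOLD = 0.5  # assumed stand-in for args.R2_threshold

def high_ld_snps(subset, corr, r2_threshold=R2_THRESHOLD):
    result = IntRangeSet()
    for i in subset:
        snps = IntRangeSet(np.flatnonzero(corr[i]**2 > r2_threshold))
        snps -= subset
        result += snps
    return result

# toy 4-SNP correlation matrix: SNP 0 is in high LD with SNP 1, which is in
# high LD with SNP 2; SNP 3 is independent of the rest
corr = np.array([[1.0, 0.9, 0.1, 0.0],
                 [0.9, 1.0, 0.8, 0.0],
                 [0.1, 0.8, 1.0, 0.0],
                 [0.0, 0.0, 0.0, 1.0]])
S = IntRangeSet(0)
while True:
    new = high_ld_snps(S, corr)
    if new.isempty:
        break
    S += new
print(S)  # SNPs 0, 1, and 2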
def add_covariance_for_range(r):
    print(int(time() - t0), ':', r)
    range_genotypes = d.get_standardized_genotypes(r, indivs=indivs)
    ranges_to_arrays[r] = \
        range_genotypes.T.dot(range_genotypes) / range_genotypes.shape[0]

    # make the allele coding of SNPs consistent with the other dataset
    flip = np.array(IntRangeSet(positions_to_flip) & IntRangeSet((r[0], r[1])),
                    dtype=int) - r[0]  # dtype required so an empty array can be used as an index
    ranges_to_arrays[r][flip] *= -1
    ranges_to_arrays[r][:, flip] *= -1
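# A minimal sketch (toy data, not from this codebase) checking the sign-flip
# step above: negating row i and column i of a covariance matrix is the same
# as recomputing the covariance after negating SNP i's standardized genotypes,
# which is exactly what switching its allele coding does.
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((100, 5))   # 100 individuals, 5 standardized SNPs
cov = X.T.dot(X) / X.shape[0]

flip = np.array([1, 3], dtype=int)  # SNPs whose coding disagrees
cov_flipped = cov.copy()
cov_flipped[flip] *= -1
cov_flipped[:, flip] *= -1

X2 = X.copy()
X2[:, flip] *= -1                   # flip the coding directly instead
assert np.allclose(cov_flipped, X2.T.dot(X2) / X2.shape[0])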
def __zero_block_outside_irs(self, r, other_intrangeset):
    my_intrangeset = IntRangeSet(r)
    intersection_intrangeset = my_intrangeset & other_intrangeset
    if intersection_intrangeset.isempty:
        del self.ranges_to_arrays[r]
    else:
        # boolean mask over the SNPs in this block marking the positions that
        # fall inside other_intrangeset
        mask = np.zeros(len(my_intrangeset), dtype=bool)
        for s in intersection_intrangeset.ranges():
            start = my_intrangeset.index(s[0])
            end = start + s[1] - s[0]
            mask[start:end] = True
        self.ranges_to_arrays[r][~mask] = 0
        self.ranges_to_arrays[r].T[~mask] = 0  # assigning via .T also works for 1-D arrays
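# A minimal sketch (hypothetical values, not from this codebase) of the
# masking above: zero every row and column of a block whose SNP falls outside
# a kept set. The second assignment goes through arr.T so the same two lines
# handle columns of a 2-D block, while a 1-D array, being its own transpose,
# is simply zeroed at the same positions twice.
import numpy as np
from pysnptools.util import IntRangeSet

r = (10, 16)                    # block covers SNPs 10..15
keep = IntRangeSet('11:13,15')  # SNPs to keep within the block

block = np.arange(36, dtype=float).reshape(6, 6)
mask = np.zeros(len(IntRangeSet(r)), dtype=bool)
for s in (IntRangeSet(r) & keep).ranges():
    start = s[0] - r[0]
    mask[start:start + (s[1] - s[0])] = True

block[~mask] = 0    # zero rows outside the kept set
block.T[~mask] = 0  # zero the corresponding columns
print(mask)         # [False  True  True False False  True]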
def plot(self, irs_to_mark, filename=None):
    import matplotlib.pyplot as plt
    rows = int(math.ceil(math.sqrt(len(self.ranges()))))
    cols = max(int(math.ceil(len(self.ranges()) / rows)), 2)  # the max ensures
        # we always get back a 2-D array of axes
    fig, axes = plt.subplots(nrows=rows, ncols=cols)

    for ax, (r, A) in zip(fig.axes, self.ranges_to_arrays.items()):
        width = r[1] - r[0]
        ax.matshow(A, vmin=-1, vmax=1)
        my_intrangeset = IntRangeSet(r)
        intersection = my_intrangeset & irs_to_mark

        def draw_line(xs, ys):
            ax.plot(xs, ys, transform=ax.transAxes, lw=0.2, color='k')
        for s in intersection.ranges():
            draw_line([(s[0] - r[0]) / width, (s[0] - r[0]) / width], [0, 1])
            draw_line([(s[1] - r[0]) / width, (s[1] - r[0]) / width], [0, 1])
            draw_line([0, 1], [(r[1] - s[0]) / width, (r[1] - s[0]) / width])
            draw_line([0, 1], [(r[1] - s[1]) / width, (r[1] - s[1]) / width])
        ax.set_xticks([0, width]); ax.set_yticks([0, width])
        ax.set_xlim(0, width); ax.set_ylim(width, 0)
        ax.set_title(str(r))

    fig.set_size_inches(axes.shape[0] * 3, axes.shape[1] * 4)
    if filename:
        fig.savefig(filename, dpi=400)
    else:
        fig.show()
def preprocess(self, use_filesystem=True):
    if not self.covariance_preprocessing_in_progress() or not use_filesystem:
        print('creating covariance matrix...')
        if use_filesystem:
            self.declare_covariance_preprocessing_in_progress()
        self.R = self.compute_covariance()
        if use_filesystem:
            pickle.dump(self.R, self.R_file(mode='wb'), 2)
    else:
        print('loading covariance matrix')
        self.R = pickle.load(self.R_file())

    if not self.invcovariance_preprocessing_in_progress() or not use_filesystem:
        print('creating inverse covariance matrix')
        if use_filesystem:
            self.declare_invcovariance_preprocessing_in_progress()
        self.Rri = self.compute_invcovariance()
        if use_filesystem:
            pickle.dump(self.Rri, self.Rri_file(mode='wb'), 2)
    else:
        print('loading inverse covariance matrix')
        self.Rri = pickle.load(self.Rri_file())

    t0 = time.time()
    print(time.time() - t0, ': creating and saving RA')
    self.A = SnpSubset(self.refpanel, GenomicSubset(self.params.region).bedtool)
    self.RA = self.R.copy()
    self.RA.zero_outside_irs(self.A.irs)
    if use_filesystem:
        pickle.dump(self.RA, self.RA_file(mode='wb'), 2)

    print(time.time() - t0, ': computing and saving scaling')
    self.Z = self.Rri.dot(self.RA.dot(self.Rri))
    self.Q = self.R.dot(self.Z).dot(self.R)
    QA = self.Q.copy()
    QA.zero_outside_irs(self.A.irs)
    self.scalings = {r: len(self.A.irs & IntRangeSet(r)) /
                        np.trace(QA.ranges_to_arrays[r])
                     for r in QA.ranges()}
    print(time.time() - t0, ': scalings are', self.scalings)
    if use_filesystem:
        self.set_scalings(self.scalings)

    print(time.time() - t0, ': computing and saving bias matrix')
    self.ZR = self.RA.dot(self.Rri).dot(self.R).dot(self.Rri)
    if use_filesystem:
        pickle.dump(self.ZR, self.biasmatrix_file(mode='wb'), 2)

    print(time.time() - t0, ': variance matrices')
    self.QZ = self.Q.dot(self.Z)
    self.QZR = self.QZ.dot(self.R)
    if use_filesystem:
        self.save_variance_matrices(self.Q, self.Z, self.QZ, self.QZR)
    print(time.time() - t0, ': done')
def ranges(self):
    # materialize with list() so the result supports len() and repeated
    # iteration (zip() is a one-shot iterator in Python 3)
    ranges = list(zip(np.concatenate([[0], self.last_snps]), self.last_snps))
    if not self.remove_mhc:
        return ranges
    if self.mhc is None:
        self.mhc = SnpSubset(self.dataset, self.dataset.mhc_bedtool())
    return [r for r in ranges if (IntRangeSet(r) & self.mhc.irs).isempty]
def __init__(self, d, indivs, bandwidth, snpset_irs=None, output=False):
    from time import time
    if snpset_irs is None:
        snpset_irs = IntRangeSet((0, d.M))
    bandwidth = bandwidth + 1
    self.bandwidth = 2 * int(bandwidth / 2) + 1
    self.indivs = indivs
    lil_cov = sps.lil_matrix((d.M, d.M))

    def compute_cov_for_slice(s):
        # trim the buffer off each end of the slice (except where the slice
        # touches the start or end of the genome), then restrict to the SNP set
        indices = IntRangeSet((s[0] if s[0] == 0 else s[0] + int(bandwidth / 2),
                               s[1] if s[1] == d.M else s[1] - int(bandwidth / 2)))
        indices = indices & snpset_irs
        if indices.isempty:  # nothing to analyze in this slice, so move on
            return
        print(s)
        slice_genotypes = d.get_standardized_genotypes(s, indivs=indivs)
        snpset_relative_to_slice = IntRangeSet(
            [(x - s[0], y - s[0]) for x, y in snpset_irs.ranges()])

        def compute_cov_for_snp(m):
            # compute only the upper-triangular half of the LD matrix; the
            # matrix is symmetrized after all slices are processed
            start = m
            end = min(slice_genotypes.shape[1], m + int(bandwidth / 2))
            window_indices = IntRangeSet((start, end)) & snpset_relative_to_slice
            window = slice_genotypes[:, window_indices]
            cov_to_snps_in_window = \
                slice_genotypes[:, m].T.dot(window) / len(indivs)
            cov_to_snps_in_window[0] /= 2  # halve the diagonal since we symmetrize later
            target_indices = IntRangeSet((s[0] + start, s[0] + end)) & snpset_irs
            lil_cov[s[0] + m, target_indices] = cov_to_snps_in_window

        # for loops rather than map(), which is lazy in Python 3 and would
        # never run the computation
        for m in it.show_progress([x - s[0] for x in indices]):
            compute_cov_for_snp(m)

    for s in d.slices(buffer_size=int(bandwidth / 2)):
        compute_cov_for_slice(s)

    t0 = time()
    if output:
        print('starting symmetrization and conversion to csr')
    self.covcsr = lil_cov.tocsr()
    self.covcsr = self.covcsr + self.covcsr.T
    if output:
        print('took time:', time() - t0)
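# A minimal sketch (toy data, not from this codebase) of the symmetrization
# trick used above: only the upper-triangular half of the covariance is
# computed, with each diagonal entry halved, so that C + C.T reproduces the
# full symmetric matrix without double-counting the diagonal.
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((50, 4))   # 50 individuals, 4 standardized SNPs
full = X.T.dot(X) / X.shape[0]     # the matrix we want to recover

upper = np.zeros((4, 4))
for m in range(4):
    row = X[:, m].T.dot(X[:, m:]) / X.shape[0]  # covariances to SNPs m..end
    row[0] /= 2                                 # halve the diagonal entry
    upper[m, m:] = row

assert np.allclose(upper + upper.T, full)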
range_count = int(150000 * scale)
position_count = int(3000000000 * scale)  # 3 billion
region_max_length = int(2000000 * scale)  # 2 million
np.random.seed(seed)
for range_index in range(range_count):
    length = int(np.exp(np.random.random() * np.log(region_max_length)))
    start = randlong(position_count - length)  # does randint really go up to 3 billion?
    stop = start + length
    yield start, stop

from pysnptools.util import IntRangeSet
geneset = IntRangeSet()
for start, stop in region_gen(scale=.1):
    geneset |= (start, stop)
print(geneset)
print(geneset.ranges_len)
print("done")

os.chdir(r"C:\Source\carlk\fastlmm2\tests\datasets\synth")
from pysnptools.snpreader import Bed

# Use "Bed" to access the file "all.bed"
snpreader = Bed("all.bed")
# What is snpreader?
class SnpSubset(object):
    def __init__(self, dataset, bedtool=None, irs=None):
        # use bedtools to create an indicator vector for the SNPs' membership
        # in the subset
        self.dataset = dataset
        if bedtool:
            indicator = dataset.snp_coords().intersect(bedtool, c=True)
            self.irs = IntRangeSet(np.flatnonzero(
                np.array([int(snp.name) for snp in indicator])))
        elif irs:
            self.irs = irs
        else:
            self.irs = IntRangeSet()

    def num_snps(self):
        return len(self.irs)

    def expand_by(self, expansion_in_each_direction, units='Morgans'):
        result = IntRangeSet()
        for r in self.irs.ranges():
            result += self.dataset.buffer_around_slice(
                r, expansion_in_each_direction, units=units)
        self.irs = result

    def expanded_by(self, expansion_in_each_direction, units='Morgans'):
        result = copy.copy(self)
        result.expand_by(expansion_in_each_direction, units=units)
        return result

    # prints subsets in the appropriate format for ldsc;
    # all subsets must have the same dataset
    @classmethod
    def print_subsets(cls, outfilename, snpsubsets, names, add_other=False):
        def snp_info_df(d):
            bfile = d.genotypes_bedfile.filename
            return pd.read_csv(bfile + '.bim', delim_whitespace=True,
                               usecols=[0, 1, 2, 3],
                               names=['CHR', 'SNP', 'CM', 'BP'])

        # check that all snpsubsets have the same dataset
        if len(set(ss.dataset for ss in snpsubsets)) > 1:
            print('error: all subsets must have the same underlying dataset')
            return
        if not outfilename.endswith('.gz'):
            print('outfilename must end with ".gz". I only write zipped files')
            return

        # get SNP info for this dataset
        d = snpsubsets[0].dataset
        df = snp_info_df(d)

        # add the 'OTHER' annotation if necessary
        if add_other:
            union = IntRangeSet()
            for ss in snpsubsets:
                union.update(ss.irs)
            snpsubsets.append(SnpSubset(d, irs=d.all_snps() - union))
            names.append('OTHER')

        # add one indicator column per subset and write the result
        for name, ss in zip(names, snpsubsets):
            df[name] = 0
            # .loc replaces the long-deprecated .ix indexer
            df.loc[list(ss.irs), name] = 1
        df = df[['CHR', 'BP', 'SNP', 'CM'] + names]
        with gzip.open(outfilename, 'wt') as write_file:
            df.to_csv(write_file, index=False, sep='\t')
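# A minimal usage sketch (hypothetical dataset and annotation names, not from
# this codebase) showing how SnpSubset and print_subsets fit together; the
# constructor and expanded_by calls mirror how they are used elsewhere in
# these scripts.
d = Dataset('GERA')
exons = SnpSubset(d, GenomicSubset('exons').bedtool)
promoters = SnpSubset(d, GenomicSubset('promoters').bedtool)
print(exons.num_snps())

# widen one annotation by 0.003 Morgans on each side, as done elsewhere here
exons_wide = exons.expanded_by(0.003)

# write an ldsc-format annotation file with one 0/1 column per subset plus an
# 'OTHER' column covering all remaining SNPs
SnpSubset.print_subsets('annot.gz', [exons_wide, promoters],
                        ['EXONS', 'PROMOTERS'], add_other=True)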
                    help='the number of SNPs to use')
parser.add_argument('-check_dense', action='store_true', default=False)
args = parser.parse_args()

d = Dataset('GERA', forced_M=args.M)
indivs = d.random_indivs(200)

t0 = time()
R = LdMatrix(d, indivs, 200)
R.add_ridge(0.05)
print('computing R took', time() - t0)
print('shape of R is:', R.covcsr.shape)

# tiny = GenomicSubset('tiny')
# tiny_irs = SnpSubset(d, bedtool=tiny.bedtool).irs
tiny_irs = IntRangeSet('300:350')
RA = LdMatrix(d, indivs, 200, snpset_irs=tiny_irs, output=False)
b = np.random.randn(d.M)

# check inverse computation
t0 = time()
Rinvb = R.solve_banded(b)
print('R^{-1}b took', time() - t0)
if args.check_dense:
    Rinvb_dense = np.linalg.solve(R.covcsr.toarray(), b)
    print('R^{-1}b behaves well:', np.allclose(Rinvb, Rinvb_dense))

t0 = time()
TrRinvRA = R.trace_of_inverse_times_matrix(RA)
print('Tr(Rinv*RA) took', time() - t0)
if args.check_dense:
    return result

def interval_from_range(r):
    return Interval('chr' + str(int(d.genotypes_bedfile.pos[r[0]][0])),
                    int(d.genotypes_bedfile.pos[r[0]][2]),
                    int(d.genotypes_bedfile.pos[r[1]][2]) - 1)

d = Dataset(args.dataset)
A = SnpSubset(d, GenomicSubset(args.subset).bedtool)
if args.path_to_R is not None:
    # open in binary mode, as pickle requires in Python 3
    R = pickle.load(open(args.path_to_R, 'rb'))
else:
    R = None

newA = IntRangeSet()
for r in A.expanded_by(0.003).irs.ranges():
    S = IntRangeSet([a - r[0] for a in A.irs & IntRangeSet(r)])
    print(r, 'analyzing', len(S), 'snps')
    if R is None:
        X = d.get_standardized_genotypes(r)
        cov = X.T.dot(X) / d.N
    else:
        cov = R.ranges_to_arrays[r]
    while True:
        new = get_high_ld_snps(S, cov)
        if len(new) == 0:
            break
        else:
            print('\tadding', len(new), 'snps')
            print('\t\tbefore', S)
def all_snps(self):
    return IntRangeSet((0, self.M))
def indices_containing(self, irs):
    # indices of the ranges that overlap the given IntRangeSet
    ranges = self.ranges()
    return [i for i, r in enumerate(ranges)
            if not (IntRangeSet(r) & irs).isempty]
def compute_variance(self, alphahat, point_estimate, N, use_beta=False):
    def term1_coeff(r):
        return 4 * (1 / N + self.c(r)) + 4 / N * (1 / N + self.c(r))

    def term2_coeff():
        return 4 / N + 2 / N**2

    def term3_coeff(r):
        return 2 * (1 / N + self.c(r))**2

    # compute the term that doesn't depend on beta
    variance3 = sum(
        self.scalings[r]**2 * np.trace(self.QZ.ranges_to_arrays[r]) *
        term3_coeff(r)
        for r in self.Q.ranges())

    # now compute the other two terms
    if use_beta:
        beta = self.beta
        # term A: beta^T RZRZR beta = beta^T QZR beta
        variance1 = sum(
            self.scalings[r]**2 *
            beta[r[0]:r[1]].dot(self.QZR.ranges_to_arrays[r].dot(beta[r[0]:r[1]])) *
            term1_coeff(r)
            for r in self.Q.ranges())
        # term B: (beta^T Q beta)^2
        variance2 = sum(
            self.scalings[r]**2 *
            beta[r[0]:r[1]].dot(self.Q.ranges_to_arrays[r].dot(beta[r[0]:r[1]]))**2 *
            term2_coeff()
            for r in self.Q.ranges())
    else:
        # term A
        # (an earlier version computed this directly from alphahat:
        #   alphahatTZR = alphahat.dot(self.ZR)
        #   Zalphahat = self.Z.dot(alphahat)
        #   termAbiases = {r:
        #       np.einsum('ij,ji', self.ZR.ranges_to_arrays[r],
        #                 self.ZR.ranges_to_arrays[r]) * (1/N + self.c(r))
        #       for r in self.Q.ranges()}
        #   variance1 = sum(
        #       (alphahatTZR.ranges_to_arrays[r].dot(
        #           Zalphahat.ranges_to_arrays[r]) - termAbiases[r]) *
        #       term1_coeff(r)
        #       for r in self.Q.ranges()))
        betahat = self.Rri.dot(alphahat)
        point_estimates = {
            r: self.scalings[r] *
               (betahat.ranges_to_arrays[r].dot(self.RA.ranges_to_arrays[r])
                .dot(betahat.ranges_to_arrays[r]) - self.biases[r])
            for r in self.Q.ranges()}
        variance1 = sum(
            self.scalings[r]**2 * point_estimates[r] /
            len(self.A.irs & IntRangeSet(r)) *
            np.trace(self.QZR.ranges_to_arrays[r]) *
            term1_coeff(r)
            for r in self.Q.ranges())
        # term B
        variance2 = term2_coeff() * sum(
            self.scalings[r]**2 * (point_estimates[r] / self.scalings[r])**2
            for r in self.Q.ranges())

    variance = variance1 + variance2 + variance3
    print('\nvariance is {} + {} + {} = {}'.format(
        variance1, variance2, variance3, variance))
    return variance