def add_covariance_for_range(r):
    """Compute the banded covariance matrix for range ``r`` and store it.

    ``r`` is indexed as ``(r[0], r[1])`` (first position, one-past-last
    position) and is also used as the key under which the result is stored
    in the enclosing-scope mapping ``ranges_to_arrays``.

    For every position ``m`` in the range, the covariance between column
    ``m`` and the columns from ``m`` up to the window end reported by
    ``d.buffer_around_snp`` is written into the upper triangle of a dense
    ``(r[1] - r[0]) x (r[1] - r[0])`` array.  The array is then symmetrized
    and the rows/columns listed in ``positions_to_flip`` are negated.

    Relies on names from the enclosing scope: ``d``, ``indivs``,
    ``bandwidth``, ``band_units``, ``positions_to_flip``,
    ``ranges_to_arrays``, ``it``, ``np`` and ``IntRangeSet``.
    """
    print(r)
    range_start, range_end = r[0], r[1]
    range_size = range_end - range_start
    cov = np.zeros((range_size, range_size))
    # Columns are addressed by position relative to range_start; rows are
    # presumably individuals (the divisor below is the row count) -- TODO confirm.
    range_genotypes = d.get_standardized_genotypes(r, indivs=indivs)
    num_rows = range_genotypes.shape[0]

    def compute_cov_for_snp(m):
        # Only the upper-triangular part of row m is filled in here: the
        # window starts at m itself and runs to the end of m's buffer.
        end = d.buffer_around_snp(
            m, bandwidth, start=range_start, end=range_end,
            units=band_units)[1]
        window_start = m - range_start
        window_end = end - range_start
        window = range_genotypes[:, window_start:window_end]
        cov_to_snps_in_window = \
            range_genotypes[:, window_start].T.dot(window) / num_rows
        # The first entry is the diagonal element; halve it because the
        # symmetrization below (cov + cov.T) would otherwise double it.
        cov_to_snps_in_window[0] /= 2
        cov[window_start, window_start:window_end] = cov_to_snps_in_window

    # Explicit loop instead of map(): map() is lazy on Python 3, so calling
    # it purely for side effects would leave `cov` untouched.
    for m in it.show_progress(range(range_start, range_end)):
        compute_cov_for_snp(m)

    # symmetrization
    ranges_to_arrays[r] = cov + cov.T

    # make coding of snps consistent with other dataset
    # (dtype=int is required so that an empty array can be used as an index)
    flip = np.array(
        IntRangeSet(positions_to_flip) & IntRangeSet((range_start, range_end)),
        dtype=int) - range_start
    # Negate rows, then columns; entries where both row and column are
    # flipped are negated twice and keep their sign.
    ranges_to_arrays[r][flip] *= -1
    ranges_to_arrays[r][:, flip] *= -1
def compute_cov_for_slice(s):
    """Fill the upper-triangular covariance entries of ``lil_cov`` for slice ``s``.

    ``s`` is indexed as ``(s[0], s[1])``.  The positions analysed are those
    of the slice that also belong to ``snpset_irs``, after trimming
    ``int(bandwidth / 2)`` positions from each side of the slice that does
    not coincide with the dataset boundary (``s[0] == 0`` or
    ``s[1] == d.M``).  Presumably neighbouring slices overlap by that
    margin -- TODO confirm against the caller.

    Returns ``None``; results are written into the enclosing-scope matrix
    ``lil_cov``.  Also relies on ``d``, ``indivs``, ``bandwidth``,
    ``snpset_irs``, ``it`` and ``IntRangeSet`` from the enclosing scope.
    """
    half_bandwidth = int(bandwidth / 2)
    slice_start, slice_end = s[0], s[1]

    first = slice_start if slice_start == 0 else slice_start + half_bandwidth
    last = slice_end if slice_end == d.M else slice_end - half_bandwidth
    indices = IntRangeSet((first, last)) & snpset_irs
    if indices.isempty:
        # if there are no indices to analyze then we can move on
        return

    print(s)
    slice_genotypes = d.get_standardized_genotypes(s, indivs=indivs)
    num_slice_snps = slice_genotypes.shape[1]
    num_indivs = len(indivs)
    # snpset_irs re-expressed in coordinates relative to the slice start, so
    # it can be intersected with column offsets of slice_genotypes.
    snpset_relative_to_slice = IntRangeSet([
        (x - slice_start, y - slice_start) for x, y in snpset_irs.ranges()])

    def compute_cov_for_snp(m):
        # We only compute the numbers needed for the top triangular half of
        # the LD matrix (the window starts at m itself); the matrix is
        # symmetrized later.
        start = m
        end = min(num_slice_snps, m + half_bandwidth)

        window_indices = IntRangeSet((start, end)) & snpset_relative_to_slice
        window = slice_genotypes[:, window_indices]
        cov_to_snps_in_window = \
            slice_genotypes[:, m].T.dot(window) / num_indivs
        # The first entry is the diagonal element; halve it since we're
        # going to symmetrize later.
        cov_to_snps_in_window[0] /= 2

        target_indices = IntRangeSet(
            (slice_start + start, slice_start + end)) & snpset_irs
        lil_cov[slice_start + m, target_indices] = cov_to_snps_in_window

    # Explicit loop instead of map(): map() is lazy on Python 3, so calling
    # it purely for side effects would never write into lil_cov.
    for m in it.show_progress([x - slice_start for x in indices]):
        compute_cov_for_snp(m)
def main(args):
    """Annotate each seed SNP with a shared-segment length and an IBD flag.

    Reads from ``args``:
      * ``args.seed_snps``  -- headerless CSV with one SNP id per row,
      * ``args.typed_snps`` -- headerless CSV with one SNP id per row,
      * ``args.bfile``      -- path handed to ``Bed`` to load genotypes,
      * ``args.outfile``    -- destination of the tab-separated result.

    For every seed SNP the columns ``ibs_length`` and ``ibd`` are filled
    from ``analyze_snp`` and the table is written to ``args.outfile``.

    Raises:
        ValueError: if a seed SNP does not have exactly two rows of ``X``
            with value 1 (the two-way unpacking in ``analyze_snp`` fails).
    """
    print('reading seeed snps')
    seed_snps = pd.read_csv(
        args.seed_snps, header=None, names=['SNP'], index_col='SNP')
    seed_snps['ibs_length'] = 0
    seed_snps['ibd'] = 0

    print('reading typed snps')
    typed_snps = pd.read_csv(args.typed_snps, header=None, names=['SNP'])

    print('reading genotypes')
    data = Bed(args.bfile)
    X = data.read().val
    typed_snps_indices = np.sort(data.sid_to_index(typed_snps.SNP))
    # Column 2 of col_property is used as a base-pair position and column 1
    # as a genetic position (going by the names snp_bp / cM below) --
    # TODO confirm against the Bed reader's documentation.
    typed_snps_bp = data.col_property[typed_snps_indices, 2]

    print(len(seed_snps), 'snps in list')
    print(data.iid_count, data.sid_count, 'are dimensions of X')

    def analyze_snp(i):
        """Return ``(cM, ibd)`` for the SNP at column index ``i`` of ``X``."""
        # find first typed snp after query snp; fall back to the last typed
        # snp when the query lies beyond all of them
        snp_bp = data.col_property[i, 2]
        later_typed = np.where(typed_snps_bp > snp_bp)[0]
        if len(later_typed) > 0:
            typed_i = later_typed[0]
        else:
            typed_i = len(typed_snps_indices) - 1

        # The two rows whose value at this SNP equals 1; requires exactly
        # two such rows, otherwise the unpacking raises ValueError.
        n1, n2 = np.where(X[:, i] == 1)[0]
        anchor = typed_snps_indices[typed_i]
        # A squared difference of 4 means the two rows differ by 2 at the
        # anchoring typed SNP; nothing is shared there, so report zeros.
        if (X[n1, anchor] - X[n2, anchor]) ** 2 == 4:
            return 0, 0

        # fis.find_boundaries presumably returns the left/right limits (in
        # typed-SNP coordinates) of the compatible stretch around typed_i,
        # with an exclusive right limit -- TODO confirm.
        typed_il, typed_ir = fis.find_boundaries(
            X[n1, typed_snps_indices],
            X[n2, typed_snps_indices],
            typed_i)
        typed_ir -= 1

        # Map typed-SNP coordinates back to column indices of X.
        il = typed_snps_indices[typed_il]
        ir = typed_snps_indices[typed_ir]
        cM = data.col_property[ir, 1] - data.col_property[il, 1]
        # Flag the pair when more than 99% of the entries in X[:, il:ir]
        # are identical between the two rows.
        ibd = (np.mean(X[n1, il:ir] == X[n2, il:ir]) > 0.99)
        return cM, int(ibd)

    # Builtin zip replaces itertools.izip (removed in Python 3) and .loc
    # replaces DataFrame.ix (removed in pandas 1.0); the keys come from the
    # index itself, so label-based .loc selects the same rows.
    for (i, snp) in iter.show_progress(
            zip(data.sid_to_index(seed_snps.index), seed_snps.index),
            total=len(seed_snps)):
        seed_snps.loc[snp, ['ibs_length', 'ibd']] = analyze_snp(i)

    print(seed_snps.iloc[:100])
    seed_snps.to_csv(args.outfile, sep='\t')
def compute_cov_for_slice(s):
    """Fill the upper-triangular covariance entries of ``lil_cov`` for slice ``s``.

    ``s`` is indexed as ``(s[0], s[1])``.  The positions analysed are those
    of the slice that also belong to ``snpset_irs``, after trimming
    ``int(bandwidth / 2)`` positions from each side of the slice that does
    not coincide with the dataset boundary (``s[0] == 0`` or
    ``s[1] == d.M``).  Presumably neighbouring slices overlap by that
    margin -- TODO confirm against the caller.

    Returns ``None``; results are written into the enclosing-scope matrix
    ``lil_cov``.  Also relies on ``d``, ``indivs``, ``bandwidth``,
    ``snpset_irs``, ``it`` and ``IntRangeSet`` from the enclosing scope.
    """
    half_bandwidth = int(bandwidth / 2)
    slice_start, slice_end = s[0], s[1]

    first = slice_start if slice_start == 0 else slice_start + half_bandwidth
    last = slice_end if slice_end == d.M else slice_end - half_bandwidth
    indices = IntRangeSet((first, last)) & snpset_irs
    if indices.isempty:
        # if there are no indices to analyze then we can move on
        return

    print(s)
    slice_genotypes = d.get_standardized_genotypes(s, indivs=indivs)
    num_slice_snps = slice_genotypes.shape[1]
    num_indivs = len(indivs)
    # snpset_irs re-expressed in coordinates relative to the slice start, so
    # it can be intersected with column offsets of slice_genotypes.
    snpset_relative_to_slice = IntRangeSet([
        (x - slice_start, y - slice_start) for x, y in snpset_irs.ranges()])

    def compute_cov_for_snp(m):
        # We only compute the numbers needed for the top triangular half of
        # the LD matrix (the window starts at m itself); the matrix is
        # symmetrized later.
        start = m
        end = min(num_slice_snps, m + half_bandwidth)

        window_indices = IntRangeSet((start, end)) & snpset_relative_to_slice
        window = slice_genotypes[:, window_indices]
        cov_to_snps_in_window = \
            slice_genotypes[:, m].T.dot(window) / num_indivs
        # The first entry is the diagonal element; halve it since we're
        # going to symmetrize later.
        cov_to_snps_in_window[0] /= 2

        target_indices = IntRangeSet(
            (slice_start + start, slice_start + end)) & snpset_irs
        lil_cov[slice_start + m, target_indices] = cov_to_snps_in_window

    # Explicit loop instead of map(): map() is lazy on Python 3, so calling
    # it purely for side effects would never write into lil_cov.
    for m in it.show_progress([x - slice_start for x in indices]):
        compute_cov_for_snp(m)