def __init__(self, h, thid, ra, dec, zqso, plate, mjd, fid):
    """Initialise the object from a spectrum table and rebin it.

    The ``qso`` initialiser is called with the identification arguments,
    then ``loglam``, ``coadd``, ``ivar`` and ``and_mask`` are read from
    ``h``.  Pixels are kept when they lie inside the window
    ``(forest.lmin, forest.lmax)``, inside
    ``(forest.lmin_rest, forest.lmax_rest)`` after subtracting
    ``log10(1 + zqso)``, and have strictly positive inverse variance
    (pixels with a non-zero ``and_mask`` get zero inverse variance).

    Surviving pixels are combined with inverse-variance weights in bins of
    width ``forest.dll`` starting at ``forest.lmin``.  The result is stored
    in ``self.ll``, ``self.fl`` and ``self.iv`` and ``self.T_dla`` is set
    to ``None``.  When no pixel survives the cuts the method returns early
    and none of these attributes is set.
    """
    qso.__init__(self, thid, ra, dec, zqso, plate, mjd, fid)

    log_lambda = sp.array(h["loglam"][:])
    flux = sp.array(h["coadd"][:])
    # Multiplying by the boolean mask zeroes the ivar of flagged pixels.
    ivar = sp.array(h["ivar"][:]) * (sp.array(h["and_mask"][:]) == 0)

    log_lambda_rest = log_lambda - sp.log10(1 + self.zqso)
    keep = (log_lambda > forest.lmin) & (log_lambda < forest.lmax)
    keep = keep & (log_lambda_rest > forest.lmin_rest)
    keep = keep & (log_lambda_rest < forest.lmax_rest)
    keep = keep & (ivar > 0)
    if not keep.any():
        return

    log_lambda = log_lambda[keep]
    flux = flux[keep]
    ivar = ivar[keep]

    # Rebin: nearest-bin index on the forest grid, ivar-weighted means.
    bins = ((log_lambda - forest.lmin) / forest.dll + 0.5).astype(int)
    summed_ivar = sp.bincount(bins, weights=ivar)
    filled = summed_ivar > 0
    summed_ivar = summed_ivar[filled]

    self.T_dla = None
    self.ll = sp.bincount(bins, weights=log_lambda * ivar)[filled] / summed_ivar
    self.fl = sp.bincount(bins, weights=flux * ivar)[filled] / summed_ivar
    self.iv = summed_ivar
def rebin_diff_noise(dll, ll, diff):
    """Rebin ``diff`` by a factor of three and tile it back to full length.

    Parameters
    ----------
    dll : float
        Pixel width of the ``ll`` grid.
    ll : ndarray
        Grid coordinate of every pixel of ``diff``.
    diff : ndarray
        Values to rebin.

    Returns
    -------
    ndarray
        ``diff`` itself when it has fewer than three elements (a warning is
        printed).  Otherwise a new array of ``diff.size`` elements filled
        with the rebinned values; every time the rebinned array has been
        copied once it is shuffled in place with ``sp.random.shuffle``
        before the next copy, so the output depends on the global random
        state.
    """
    rebin_factor = 3
    n_pix = diff.size
    if n_pix < rebin_factor:
        print("Warning: diff.size too small for rebin")
        return diff

    # Bin on the coordinate rather than on the pixel index, so pixels that
    # are far apart in ``ll`` are never merged into the same coarse bin.
    coarse_width = rebin_factor * dll
    coarse_bin = sp.floor((ll - ll.min()) / coarse_width + 0.5).astype(int)

    summed = sp.bincount(coarse_bin, weights=diff)
    counts = sp.bincount(coarse_bin)
    filled = counts > 0
    if len(counts) == 0:
        print("Error: diff size = 0 ", diff)
    # mean * sqrt(N) for every populated coarse bin
    rebinned = summed[filled] / counts[filled] * sp.sqrt(counts[filled])

    out = sp.zeros(n_pix)
    n_reb = len(rebinned)
    n_copies = len(diff) // n_reb + 1
    for start in range(0, n_copies * n_reb, n_reb):
        stop = min(len(diff), start + n_reb)
        out[start:stop] = rebinned[:stop - start]
        sp.random.shuffle(rebinned)
    return out
def cf(data):
    """Accumulate a weighted pair product in ``np * nt`` (rp, rt) bins.

    For every element ``d1`` of ``data`` and every ``d2`` in ``d1.neighs``
    the products ``de * we`` of the two objects are multiplied pixel by
    pixel and histogrammed in the flat bin ``bt + nt * bp``, keeping only
    pairs with ``rp < rp_max`` and ``rt < rt_max``.  ``d1 ^ d2`` supplies
    the angle used to build ``rp`` and ``rt`` from ``r_comov``.

    Returns
    -------
    (we, xi) : tuple of ndarray
        Summed pair weights and the weighted sum divided by those weights
        (bins with zero weight are left at zero).
    """
    n_bins = np * nt
    xi = sp.zeros(n_bins)
    we = sp.zeros(n_bins)

    for d1 in data:
        wd1 = d1.de * d1.we
        for d2 in d1.neighs:
            wd2 = d2.de * d2.we
            half_ang = (d1 ^ d2) / 2
            r2_col = d2.r_comov[:, None]
            rp = abs(d1.r_comov - r2_col) * sp.cos(half_ang)
            rt = (d1.r_comov + r2_col) * sp.sin(half_ang)
            wd12 = wd1 * wd2[:, None]
            w12 = d1.we * d2.we[:, None]

            in_range = (rp < rp_max) & (rt < rt_max)
            rp = rp[in_range]
            rt = rt[in_range]

            bp = (rp / rp_max * np).astype(int)
            bt = (rt / rt_max * nt).astype(int)
            bins = bt + nt * bp

            c = sp.bincount(bins, weights=wd12[in_range])
            xi[:len(c)] += c
            c = sp.bincount(bins, weights=w12[in_range])
            we[:len(c)] += c

    filled = we > 0
    xi[filled] /= we[filled]
    return we, xi
def fast_co(z1, r1, w1, z2, r2, w2, ang):
    """Histogram pair quantities in flat (rp, rt) bins.

    ``rp`` and ``rt`` are built from the two distance arrays and the angle;
    ``rp`` is made non-negative unless ``x_correlation`` is set and
    ``type_corr`` is neither ``'DR'`` nor ``'RD'``.  Only pairs with
    ``rp_min <= rp < rp_max``, ``rt < rt_max`` and a strictly positive
    weight product are kept.

    Returns
    -------
    tuple of ndarray
        ``(cw, crp, crt, cz, cnb)``: per-bin sum of weights, weighted sums
        of ``rp``, ``rt`` and mean ``z``, and the unweighted pair count.
        Each array is as long as the highest populated bin requires.
    """
    half_ang = ang / 2.
    rp = (r1 - r2) * sp.cos(half_ang)
    if not x_correlation or type_corr in ['DR', 'RD']:
        rp = sp.absolute(rp)
    rt = (r1 + r2) * sp.sin(half_ang)
    z = (z1 + z2) / 2.
    w12 = w1 * w2

    keep = (rp >= rp_min) & (rp < rp_max) & (rt < rt_max) & (w12 > 0.)
    rp, rt, z, w12 = rp[keep], rt[keep], z[keep], w12[keep]

    bp = sp.floor((rp - rp_min) / (rp_max - rp_min) * np).astype(int)
    bt = (rt / rt_max * nt).astype(int)
    bins = bt + nt * bp

    return (
        sp.bincount(bins, weights=w12),
        sp.bincount(bins, weights=rp * w12),
        sp.bincount(bins, weights=rt * w12),
        sp.bincount(bins, weights=z * w12),
        sp.bincount(bins),
    )
def __add__(self, d):
    """Coadd ``d`` into ``self`` on the forest grid and return ``self``.

    The ``ll``, ``fl`` and ``iv`` arrays of both objects are concatenated
    and rebinned on the grid ``forest.lmin + k * forest.dll``.  ``fl`` (and
    ``mmef`` when ``self.mmef`` is not ``None``) become inverse-variance
    weighted means per bin; ``iv`` becomes the summed inverse variance.
    Bins with zero summed inverse variance are dropped.

    If either object has no ``ll`` attribute, ``self`` is returned
    unchanged.
    """
    if not hasattr(self, 'll') or not hasattr(d, 'll'):
        return self

    ll = sp.append(self.ll, d.ll)
    fl = sp.append(self.fl, d.fl)
    iv = sp.append(self.iv, d.iv)
    # Always bind ``mmef``: the checks below previously raised
    # UnboundLocalError whenever ``self.mmef`` was None.
    mmef = None
    if self.mmef is not None:
        mmef = sp.append(self.mmef, d.mmef)

    bins = sp.floor((ll - forest.lmin) / forest.dll + 0.5).astype(int)
    nbins = bins.max() + 1
    cll = forest.lmin + sp.arange(nbins) * forest.dll

    civ = sp.bincount(bins, weights=iv, minlength=nbins)
    cfl = sp.bincount(bins, weights=iv * fl, minlength=nbins)
    w = (civ > 0.)

    self.ll = cll[w]
    self.fl = cfl[w] / civ[w]
    self.iv = civ[w]
    if mmef is not None:
        cmmef = sp.bincount(bins, weights=iv * mmef, minlength=nbins)
        # Normalise by the summed ivar exactly like ``fl``; the weighted
        # sum alone is not a mean.
        self.mmef = cmmef[w] / civ[w]
    return self
def stack(data, delta=False):
    """Weighted average of every object of ``data`` on the forest grid.

    Parameters
    ----------
    data : mapping
        Maps keys to iterables of objects exposing ``ll`` plus either
        ``de``/``we`` (``delta`` true) or ``fl``/``iv``/``co``.
    delta : bool, optional
        When true the stored ``de`` and ``we`` are stacked; otherwise
        ``fl / co`` is stacked with weights computed from ``iv``, ``co``,
        ``forest.eta`` and ``forest.var_lss``.

    Returns
    -------
    (ll, st) : tuple of ndarray
        Grid coordinates and the weighted mean per grid bin (zero where no
        weight was accumulated).
    """
    nstack = int((forest.lmax - forest.lmin) / forest.dll) + 1
    ll = forest.lmin + sp.arange(nstack) * forest.dll
    st = sp.zeros(nstack)
    wst = sp.zeros(nstack)

    for p in data:
        for d in data[p]:
            bins = ((d.ll - forest.lmin) / forest.dll + 0.5).astype(int)
            var_lss = forest.var_lss(d.ll)
            eta = forest.eta(d.ll)

            if delta:
                we = d.we
                de = d.de
            else:
                iv = d.iv / eta
                we = iv * d.co**2 / (iv * d.co**2 * var_lss + 1)
                de = d.fl / d.co

            weighted_sum = sp.bincount(bins, weights=de * we)
            st[:len(weighted_sum)] += weighted_sum
            weight_sum = sp.bincount(bins, weights=we)
            wst[:len(weight_sum)] += weight_sum

    filled = wst > 0
    st[filled] /= wst[filled]
    return ll, st
def find_neighbor_throats(self, pores, mode='union', flatten=True):
    r"""
    Returns a list of throats neighboring the given pore(s)

    Parameters
    ----------
    pores : array_like
        Indices of pores whose neighbors are sought
    flatten : boolean, optional
        If flatten is True (default) a 1D array of unique throat ID numbers
        is returned. If flatten is False the returned array contains arrays
        of neighboring throat ID numbers for each input pore, in the order
        they were sent.
    mode : string, optional
        Specifies which neighbors should be returned (only used when
        flatten is True).  The options are:

        * 'union' : All neighbors of the input pores

        * 'intersection' : Only neighbors that appear in the neighbor list
          of more than one input pore

        * 'not_intersection' : Only neighbors that appear in the neighbor
          list of exactly one input pore

        Any other value returns the concatenated neighbor lists without
        removing duplicates.

    Returns
    -------
    neighborTs : 1D array (if flatten is True) or ndarray of arrays (if
        flatten is False).  An empty 1D array is returned when none of the
        input pores has a neighboring throat.

    Examples
    --------
    >>> import OpenPNM
    >>> pn = OpenPNM.Network.TestNet()
    >>> pn.find_neighbor_throats(pores=[0,1])
    array([0, 1, 2, 3, 4, 5])
    >>> pn.find_neighbor_throats(pores=[0,1],flatten=False)
    array([array([0, 1, 2]), array([0, 3, 4, 5])], dtype=object)
    """
    # Index with a 1D array: the former ``rows[[pores]]`` relied on NumPy
    # treating a list-wrapped sequence as a tuple index, which newer NumPy
    # interprets as a 2D fancy index instead.
    pore_inds = sp.array(pores, ndmin=1)

    # Test for existence of incidence matrix; build it on first use.
    try:
        neighborTs = self._incidence_matrix['lil'].rows[pore_inds]
    except Exception:
        temp = self.create_incidence_matrix(sprsfmt='lil')
        self._incidence_matrix['lil'] = temp
        neighborTs = self._incidence_matrix['lil'].rows[pore_inds]

    # All the empty lists must be removed to maintain data type after
    # hstack (numpy bug?)
    non_empty = [sp.asarray(x) for x in neighborTs if x]
    if not non_empty:
        return sp.array([], ndmin=1)

    if flatten:
        neighborTs = sp.hstack(non_empty)
        # Remove references to input pores and duplicates
        if mode == 'not_intersection':
            neighborTs = sp.unique(sp.where(sp.bincount(neighborTs) == 1)[0])
        elif mode == 'union':
            neighborTs = sp.unique(neighborTs)
        elif mode == 'intersection':
            neighborTs = sp.unique(sp.where(sp.bincount(neighborTs) > 1)[0])
    else:
        for i in range(0, sp.size(pore_inds)):
            neighborTs[i] = sp.array(neighborTs[i])
    return sp.array(neighborTs, ndmin=1)
def stack(data, delta=False):
    """Weighted average of every object of ``data`` on the forest grid.

    Keys of ``data`` are visited in sorted order.  With ``delta`` true the
    stored ``de`` and ``we`` of each object are stacked; otherwise
    ``fl / co`` is stacked with weight ``1 / variance(var, eta, var_lss,
    fudge)`` where ``var = 1 / iv / co**2`` and the other terms come from
    the ``forest`` callables evaluated at ``d.ll``.

    Returns
    -------
    (ll, st, wst) : tuple of ndarray
        Grid coordinates, weighted mean per bin (zero where no weight was
        accumulated) and the summed weights.
    """
    nstack = int((forest.lmax - forest.lmin) / forest.dll) + 1
    ll = forest.lmin + sp.arange(nstack) * forest.dll
    st = sp.zeros(nstack)
    wst = sp.zeros(nstack)

    for p in sorted(data.keys()):
        for d in data[p]:
            if not delta:
                de = d.fl / d.co
                var_lss = forest.var_lss(d.ll)
                eta = forest.eta(d.ll)
                fudge = forest.fudge(d.ll)
                var = 1. / d.iv / d.co**2
                we = 1. / variance(var, eta, var_lss, fudge)
            else:
                de = d.de
                we = d.we

            bins = ((d.ll - forest.lmin) / forest.dll + 0.5).astype(int)
            weighted_sum = sp.bincount(bins, weights=de * we)
            st[:len(weighted_sum)] += weighted_sum
            weight_sum = sp.bincount(bins, weights=we)
            wst[:len(weight_sum)] += weight_sum

    filled = wst > 0
    st[filled] /= wst[filled]
    return ll, st, wst
def find_neighbor_throats(self, pnums, flatten=True, mode='union'):
    r"""
    Returns a list of throats neighboring the given pore(s)

    Parameters
    ----------
    pnums : array_like
        Indices of pores whose neighbors are sought
    flatten : boolean, optional
        If flatten is True (default) a 1D array of unique throat ID numbers
        is returned. If flatten is False the returned array contains arrays
        of neighboring throat ID numbers for each input pore, in the order
        they were sent.
    mode : string, optional
        Specifies which neighbors should be returned (only used when
        flatten is True).  The options are:

        * 'union' : All neighbors of the input pores

        * 'intersection' : Only neighbors that appear in the neighbor list
          of more than one input pore

        * 'not_intersection' : Only neighbors that appear in the neighbor
          list of exactly one input pore

        Any other value returns the concatenated neighbor lists without
        removing duplicates.

    Returns
    -------
    neighborTs : 1D array (if flatten is True) or ndarray of arrays (if
        flatten is False).  With flatten True an empty 1D array is returned
        when none of the input pores has a neighboring throat.

    Examples
    --------
    >>> pn = OpenPNM.Network.Cubic(name='doc_test').generate(divisions=[5,5,5],lattice_spacing=[1])
    >>> pn.find_neighbor_throats(pnums=[0,1])
    array([0, 1, 2, 3, 4, 5])
    >>> pn.find_neighbor_throats(pnums=[0,1],flatten=False)
    array([array([0, 1, 2]), array([0, 3, 4, 5])], dtype=object)
    """
    # Index with a 1D array: the former ``rows[[pnums]]`` relied on NumPy
    # treating a list-wrapped sequence as a tuple index, which newer NumPy
    # interprets as a 2D fancy index instead.
    pore_inds = sp.array(pnums, ndmin=1)

    # Test for existence of incidence matrix; build it on first use.
    try:
        neighborTs = self.incidence_matrix['lil']['connections'].rows[pore_inds]
    except Exception:
        self._logger.info('Creating incidence matrix, please wait')
        self.create_incidence_matrix()
        neighborTs = self.incidence_matrix['lil']['connections'].rows[pore_inds]

    if flatten:
        # All the empty lists must be removed to maintain data type after
        # hstack (numpy bug?)
        non_empty = [sp.asarray(x) for x in neighborTs if x]
        if not non_empty:
            # hstack of an empty list raises; no neighbors means no throats.
            return sp.array([], ndmin=1)
        neighborTs = sp.hstack(non_empty)
        # Remove references to input pores and duplicates
        if mode == 'not_intersection':
            neighborTs = sp.unique(sp.where(sp.bincount(neighborTs) == 1)[0])
        elif mode == 'union':
            neighborTs = sp.unique(neighborTs)
        elif mode == 'intersection':
            neighborTs = sp.unique(sp.where(sp.bincount(neighborTs) > 1)[0])
    else:
        for i in range(0, sp.size(pore_inds)):
            neighborTs[i] = sp.array(neighborTs[i])
    return sp.array(neighborTs, ndmin=1)
def worker_quality(predictions, num_classes):
    """Estimate per-worker confusion rates, consensus labels and costs.

    An iterative two-step procedure is run until the largest change of any
    confusion-rate entry drops below 0.001:

    * E step: every object gets the label with the largest sum of the
      workers' current diagonal rates for the class they voted for.
    * M step: each worker's ``(label, prediction)`` counts are divided by
      the number of objects carrying that label.

    Parameters
    ----------
    predictions : array_like of int
        ``predictions[k, i]`` is the class worker ``k`` assigned to object
        ``i``.  A 1D input is treated as a single worker.
    num_classes : int
        Number of classes; predictions must lie in ``[0, num_classes)``.

    Returns
    -------
    error_rates : ndarray, shape (num_workers, num_classes, num_classes)
        ``error_rates[k, l, p]``: fraction of objects with consensus label
        ``l`` that worker ``k`` predicted as ``p``.  Rows of labels that no
        object received are all zero.
    correct_labels : ndarray of int
        Consensus label per object.
    costs : list of float
        One cost value per worker, computed from the worker's rates and the
        class priors of the consensus labels.
    """
    predictions = sp.atleast_2d(predictions)
    num_workers, num_objects = predictions.shape
    workers = sp.arange(num_workers)

    error_rates = sp.zeros((num_workers, num_classes, num_classes))
    diy, diz = sp.diag_indices(num_classes)
    error_rates[:, diy, diz] = 1

    while True:
        # E step
        new_predictions = sp.zeros((num_objects, num_classes))
        for i in range(num_objects):
            individual_predictions = predictions[:, i]
            individual_error_rates = error_rates[workers,
                                                 individual_predictions,
                                                 individual_predictions]
            new_predictions[i, :] = sp.bincount(individual_predictions,
                                                individual_error_rates,
                                                minlength=num_classes)
        correct_labels = sp.argmax(new_predictions, axis=1)
        # minlength keeps one entry per class even when the highest classes
        # received no object.
        count_per_label = sp.bincount(correct_labels, minlength=num_classes)

        # M step
        new_error_rates = sp.zeros((num_workers, num_classes, num_classes))
        for i, label in enumerate(correct_labels):
            new_error_rates[workers, label, predictions[:, i]] += 1
        # Labels without objects have all-zero counts; dividing by 1 keeps
        # them at zero instead of producing 0/0 = NaN, which would make the
        # convergence test below fail forever.
        safe_counts = sp.maximum(count_per_label, 1)
        new_error_rates /= safe_counts[None, :, None]

        diff_error_rates = sp.absolute(new_error_rates - error_rates)
        error_rates = new_error_rates
        if sp.amax(diff_error_rates) < 0.001:
            break

    # calculate the cost of each worker
    class_priors = sp.bincount(correct_labels, minlength=num_classes) / float(num_objects)
    costs = []
    for k in range(num_workers):
        # small offset avoids dividing by zero for never-predicted classes
        worker_class_priors = sp.dot(sp.atleast_2d(class_priors), error_rates[k])[0] + 0.0000001
        cost = 0
        for j in range(num_classes):
            soft_label = error_rates[k, :, j] * class_priors / worker_class_priors[j]
            soft_label_cost = 0.0
            for i in range(num_classes):
                soft_label_cost += sp.sum(soft_label[i] * soft_label)
            # subtract the diagonal entries (those costs = 0)
            soft_label_cost -= sp.sum(soft_label ** 2)
            cost += soft_label_cost * worker_class_priors[j]
        costs.append(cost)
    return error_rates, correct_labels, costs
def exp_diff(file, ll):
    """Half-difference of the even- and odd-exposure coadds on grid ``ll``.

    ``file[0].read_header()['NEXP'] // 2`` exposures are read for each of
    two columns, from entries ``file[4 + iexp + icol * nexp_per_col]``.
    Each exposure is assigned to grid bins with ``searchsorted`` and
    accumulated with inverse-variance weights (pixels with mask bit 25 set
    get zero weight) into the "even" or "odd" sum according to the parity
    of ``iexp``.  The last bin returned by ``bincount`` is always dropped.

    Returns
    -------
    ndarray
        ``0.5 * (even - odd) * alpha`` where ``alpha`` is 1 for an even
        number of exposures per column and
        ``sqrt(4 * n_even * (n_even + 1)) / nexp_per_col`` otherwise.
    """
    nexp_per_col = file[0].read_header()['NEXP'] // 2
    n_pix = ll.size
    # index 0 -> even exposures, index 1 -> odd exposures
    flux_sum = [sp.zeros(n_pix), sp.zeros(n_pix)]
    ivar_sum = [sp.zeros(n_pix), sp.zeros(n_pix)]

    if nexp_per_col < 2:
        print("DBG : not enough exposures for diff")

    for iexp in range(nexp_per_col):
        parity = iexp % 2
        for icol in range(2):
            hdu = file[4 + iexp + icol * nexp_per_col]
            llexp = hdu["loglam"][:]
            flexp = hdu["flux"][:]
            ivexp = hdu["ivar"][:]
            mask = hdu["mask"][:]
            bins = sp.searchsorted(ll, llexp)

            # exclude masks 25 (COMBINEREJ), 23 (BRIGHTSKY)?
            unmasked = (mask & 2**25 == 0)
            civ = sp.bincount(bins, weights=ivexp * unmasked)
            cfl = sp.bincount(bins, weights=ivexp * flexp * unmasked)
            flux_sum[parity][:civ.size - 1] += cfl[:-1]
            ivar_sum[parity][:civ.size - 1] += civ[:-1]

    fltoteven, fltotodd = flux_sum
    ivtoteven, ivtotodd = ivar_sum

    w = ivtotodd > 0
    fltotodd[w] /= ivtotodd[w]
    w = ivtoteven > 0
    fltoteven[w] /= ivtoteven[w]

    alpha = 1
    if nexp_per_col % 2 == 1:
        n_even = (nexp_per_col - 1) // 2
        alpha = sp.sqrt(4. * n_even * (n_even + 1)) / nexp_per_col
    # CHECK THE * alpha (Nathalie)
    return 0.5 * (fltoteven - fltotodd) * alpha
def region_size(im):
    r"""
    Replace each voxel with the size of the region to which it belongs

    Parameters
    ----------
    im : ND-array
        Either a boolean image with ``True`` marking the features of
        interest, in which case ``scipy.ndimage.label`` is applied to find
        the regions, or an image of integer region labels.

    Returns
    -------
    image : ND-array
        An array shaped like ``im`` in which each voxel holds the number of
        voxels carrying the same label.  Voxels labelled 0 are given the
        value 0.

    """
    labels = spim.label(im)[0] if im.dtype == bool else im
    sizes = sp.bincount(labels.flatten())
    # label 0 is treated as background and reported with size 0
    sizes[0] = 0
    return sizes[labels]
def update_kinship(self, removed_snps, full_kinship, full_indivs, full_num_snps, retained_indivs,
                   kinship_type='ibs', snps_data_format='binary', snp_dtype='int8', dtype='single'):
    """Remove the contribution of ``removed_snps`` from an IBS kinship.

    The full kinship is first cut down with ``prepare_k`` and the IBS
    kinship of the removed SNPs alone is computed.  The result is

        (K_cut * full_num_snps - K_removed * n_removed)
        / (full_num_snps - n_removed)

    with ``n_removed = len(removed_snps)``.

    Parameters
    ----------
    removed_snps : sequence
        SNPs to take out; ``sp.array(removed_snps).T`` must have one row
        per retained individual.
    full_kinship, full_indivs, retained_indivs
        Passed unchanged to ``prepare_k``.
    full_num_snps : int
        Number of SNPs the full kinship was computed from.
    kinship_type : str, optional
        Only ``'ibs'`` is supported (checked with an assertion).
    snps_data_format : {'binary', 'diploid_int'}, optional
        Encoding of ``removed_snps``; anything else raises
        ``NotImplementedError``.
    snp_dtype, dtype : optional
        dtypes of the SNP array and of the kinship accumulator.

    Returns
    -------
    The updated kinship matrix.
    """
    assert kinship_type == 'ibs', 'Only IBS kinships can be updated at the moment'

    # Cut full kinship
    cut_kinship = prepare_k(full_kinship, full_indivs, retained_indivs)
    num_lines = cut_kinship.shape[0]
    k_mat = sp.zeros((num_lines, num_lines), dtype=dtype)

    num_snps = len(removed_snps)
    snps_array = sp.array(removed_snps, dtype=snp_dtype)
    snps_array = snps_array.T
    if snps_data_format == 'diploid_int':
        for i in range(num_lines):
            for j in range(i):
                bin_counts = sp.bincount(sp.absolute(snps_array[j] - snps_array[i]))
                if len(bin_counts) > 1:
                    k_mat[i, j] += (bin_counts[0] + 0.5 * bin_counts[1])
                else:
                    k_mat[i, j] += bin_counts[0]
                k_mat[j, i] = k_mat[i, j]
    elif snps_data_format == 'binary':
        sm = sp.mat(snps_array * 2.0 - 1.0)
        k_mat = k_mat + sm * sm.T
    else:
        raise NotImplementedError

    # NOTE(review): normalisation is selected by ``self.data_format`` while
    # accumulation above is selected by ``snps_data_format`` - confirm the
    # two always agree.
    if self.data_format == 'diploid_int':
        k_mat = k_mat / float(num_snps) + sp.eye(num_lines)
    elif self.data_format == 'binary':
        k_mat = k_mat / (2 * float(num_snps)) + 0.5

    # Weight by SNP *counts*: the original used the ``removed_snps`` list
    # itself here, which is not a valid scalar weight.
    updated_k = (cut_kinship * full_num_snps - k_mat * num_snps) / (full_num_snps - num_snps)
    return updated_k
def get_phenotypes(plinkf, debug=False):
    """Collect phenotypes and sample identifiers from ``plinkf``.

    Parameters
    ----------
    plinkf : object
        Must provide ``get_samples()`` returning objects with
        ``phenotype``, ``fid`` and ``iid`` attributes.
    debug : bool, optional
        Print a short summary of what was found.

    Returns
    -------
    dict
        Keys ``'has_phenotype'`` (False only when every sample carries the
        same phenotype value), ``'fids'``, ``'iids'``, ``'phenotypes'`` and
        ``'num_individs'``.
    """
    samples = plinkf.get_samples()
    Y = [sample.phenotype for sample in samples]
    fids = [sample.fid for sample in samples]
    iids = [sample.iid for sample in samples]

    n_distinct = len(sp.unique(Y))
    has_phenotype = n_distinct != 1
    if n_distinct == 1:
        print('Unable to find phenotype values.')
    elif n_distinct == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        if debug:
            print('Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1]))
    elif debug:
        print('Found quantitative phenotype values')

    return {
        'has_phenotype': has_phenotype,
        'fids': fids,
        'iids': iids,
        'phenotypes': Y,
        'num_individs': len(samples),
    }
def _hough_transform(img, angles):
    """Accumulate ``img`` along straight lines for every angle.

    Parameters
    ----------
    img : 2D ndarray
        Input image; its values are used as accumulation weights.
    angles : sequence of float
        Angles (radians) at which the distance ``x*cos + y*sin`` of every
        pixel is evaluated.

    Returns
    -------
    out : ndarray, shape (2 * d, len(angles)), dtype float64
        Accumulator, where ``d = ceil(hypot(rows, cols))``.  Row ``r``
        collects the pixels whose rounded distance equals ``r - d``.
    """
    rows, cols = img.shape

    # Number of distance bins.  Must be an integer: it is an array shape.
    d = int(sp.ceil(sp.hypot(rows, cols)))
    nr_bins = 2 * d

    # create the accumulator
    out = sp.zeros((nr_bins, len(angles)), dtype=sp.float64)

    # compute the sines/cosines
    cos_theta = sp.cos(angles)
    sin_theta = sp.sin(angles)

    # x / y coordinate of every pixel, in the same (row-major) order as the
    # flattened image
    y, x = sp.indices((rows, cols))
    y = y.ravel()
    x = x.ravel()
    flattened_img = img.flatten()

    for i, (c, s) in enumerate(zip(cos_theta, sin_theta)):
        distances = x * c + y * s
        # Shift by d so the most negative distance maps to bin 0.  A full
        # size integer is required: uint8 silently wraps above 255.
        bin_indices = (sp.around(distances) + d).astype(sp.intp)
        bin_sums = sp.bincount(bin_indices, flattened_img)
        out[:len(bin_sums), i] = bin_sums
    return out
def _calc_ibs_kinship_(self, dtype='single', chunk_size=None):
    """Compute the IBS kinship matrix from ``self.snps_chunks``.

    Parameters
    ----------
    dtype : optional
        dtype of the accumulator matrix.
    chunk_size : int, optional
        Number of SNPs requested per chunk; defaults to the number of
        individuals.

    Returns
    -------
    The ``n_indivs x n_indivs`` kinship matrix.  For ``'diploid_int'`` data
    pairs score 1 per identical genotype and 0.5 per genotype differing by
    one; for ``'binary'`` data the matrix product of the +/-1 coded SNPs is
    used.  A progress percentage is written to ``sys.stdout`` per chunk.
    """
    n_snps = self.num_snps()
    n_indivs = self.num_individs()
    if chunk_size is None:
        chunk_size = n_indivs
    k_mat = sp.zeros((n_indivs, n_indivs), dtype=dtype)

    # Dedicated counter: it used to share the name ``i`` with the loop
    # index below, which overwrote the number of processed SNPs.
    n_done = 0
    snps_chunks = self.snps_chunks(chunk_size)
    for snps_chunk in snps_chunks:
        n_done += len(snps_chunk)
        snps_array = snps_chunk.T
        if self.data_format == 'diploid_int':
            for i in range(n_indivs):
                for j in range(i):
                    bin_counts = sp.bincount(sp.absolute(snps_array[j] - snps_array[i]))
                    if len(bin_counts) > 1:
                        k_mat[i, j] += (bin_counts[0] + 0.5 * bin_counts[1])
                    else:
                        k_mat[i, j] += bin_counts[0]
                    k_mat[j, i] = k_mat[i, j]
        elif self.data_format == 'binary':
            sm = sp.mat(snps_array * 2.0 - 1.0)
            k_mat = k_mat + sm * sm.T
        sys.stdout.write('\b\b\b\b\b\b%0.1f%%' % (100.0 * n_done / n_snps))
        sys.stdout.flush()

    if self.data_format == 'diploid_int':
        k_mat = k_mat / float(n_snps) + sp.eye(n_indivs)
    elif self.data_format == 'binary':
        k_mat = k_mat / (2 * float(n_snps)) + 0.5
    return k_mat
def stack_flux(data, delta):
    '''Stack flux (``delta == 0``) or ``fl/co - 1`` with weights on the forest grid.

    Objects whose ``bad_cont`` is not ``None`` are skipped and reported on
    stdout once the stack is complete.  Bin indices are computed from each
    object's own ``lmin`` and ``dll``.

    Returns ``(ll, st, wst)``: grid coordinates, weighted mean per bin
    (zero where no weight was accumulated) and the summed weights.
    '''
    nstack = int((forest.lmax - forest.lmin) / forest.dll) + 1
    ll = forest.lmin + sp.arange(nstack) * forest.dll
    st = sp.zeros(nstack)
    wst = sp.zeros(nstack)
    data_bad_cont = []

    for d in data:
        if d.bad_cont is not None:
            data_bad_cont.append(d)
            continue

        bins = ((d.ll - d.lmin) / d.dll + 0.5).astype(int)
        eta = forest.eta(d.ll)
        var_lss = forest.var_lss(d.ll)
        fudge = forest.fudge(d.ll)

        if (delta == 0):
            # ivar in continuum-normalised units, then the total variance
            # with the eta, var_lss and fudge terms, converted back to flux
            # units before inverting.
            var_F = 1. / (d.iv * d.co**2)
            var_F_tot = var_F * eta + var_lss + fudge / var_F
            we = 1. / (var_F_tot * d.co**2)
            stacked = d.fl
        else:
            iv = d.iv / eta
            we = iv * d.co**2 / (iv * d.co**2 * var_lss + 1)
            stacked = d.fl / d.co - 1

        c = sp.bincount(bins, weights=stacked * we)
        st[:len(c)] += c
        c = sp.bincount(bins, weights=we)
        wst[:len(c)] += c

    filled = wst > 0
    st[filled] /= wst[filled]

    for d in data_bad_cont:
        print ("rejected {} due to {}\n".format(d.thid,d.bad_cont))
    return ll, st, wst
def getNumRequestsEachBus(self):
    """Return the number of requests per bus, computing it on first use.

    The counts are a ``bincount`` of ``self.getBusEachRequest()`` padded to
    ``Solution.totalBuses`` entries, cached in ``self._numRequestsEachBus``.
    """
    # Assertion
    assert Solution.totalBuses is not None

    cached = self._numRequestsEachBus
    if cached is not None:
        return cached

    self._numRequestsEachBus = scipy.bincount(self.getBusEachRequest(), minlength=Solution.totalBuses)
    return self._numRequestsEachBus
def sorted_csr_from_coo(shape, row_idx, col_idx, val, only_topk=None):
    """Build a CSR matrix whose rows are sorted by decreasing value.

    Parameters
    ----------
    shape : tuple
        Shape of the resulting matrix.
    row_idx, col_idx, val : ndarray
        COO triplets.  All three are reordered **in place** so that entries
        are grouped by row and, inside a row, sorted by decreasing ``val``.
    only_topk : int, optional
        When an ``int``, keep only the ``only_topk`` largest entries of each
        row.  Values below 1 are clamped to 1.  Non-``int`` values are
        ignored.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix with dtype ``val.dtype``.
    """
    # ``m`` exceeds the span of ``val`` so that ``row * m - val`` sorts by
    # row first and by decreasing value second.
    m = (sp.absolute(val).sum() + 1) * 3
    sorted_idx = sp.argsort(row_idx * m - val)
    row_idx[:] = row_idx[sorted_idx]
    col_idx[:] = col_idx[sorted_idx]
    val[:] = val[sorted_idx]
    indptr = sp.cumsum(sp.bincount(row_idx + 1, minlength=(shape[0] + 1)))

    if only_topk is not None and isinstance(only_topk, int):
        # Clamp to at least one entry per row (the previous
        # ``max(min(1, k), k)`` always evaluated to ``k``).
        only_topk = max(1, only_topk)
        # rank of each entry inside its row
        rank_in_row = sp.arange(len(val)) - indptr[row_idx]
        selected_idx = rank_in_row < only_topk
        row_idx = row_idx[selected_idx]
        col_idx = col_idx[selected_idx]
        val = val[selected_idx]
        indptr = sp.cumsum(sp.bincount(row_idx + 1, minlength=(shape[0] + 1)))

    return smat.csr_matrix((val, col_idx, indptr), shape=shape, dtype=val.dtype)
def __add__(self, d):
    """Coadd ``d`` into ``self`` on the forest grid and return ``self``.

    ``fl`` and, when not ``None`` on ``self``, ``mmef``, ``diff`` and
    ``reso`` are replaced by their inverse-variance weighted means per grid
    bin; ``iv`` becomes the summed inverse variance and ``ll`` the grid
    coordinates of the populated bins.  ``mean_reso`` (if ``reso`` is set),
    ``mean_SNR`` and ``mean_z`` are then recomputed.

    If either object has no ``ll`` attribute, ``self`` is returned
    unchanged.
    """
    if not (hasattr(self, 'll') and hasattr(d, 'll')):
        return self

    ll = sp.append(self.ll, d.ll)
    iv = sp.append(self.iv, d.iv)

    # every quantity that is coadded with ivar weighting
    coadded = {'fl': sp.append(self.fl, d.fl)}
    for name in ('mmef', 'diff', 'reso'):
        own = getattr(self, name)
        if own is not None:
            coadded[name] = sp.append(own, getattr(d, name))

    bins = sp.floor((ll - forest.lmin) / forest.dll + 0.5).astype(int)
    n_grid = bins.max() + 1
    cll = forest.lmin + sp.arange(n_grid) * forest.dll

    civ = sp.zeros(n_grid)
    summed_iv = sp.bincount(bins, weights=iv)
    civ[:len(summed_iv)] += summed_iv
    w = (civ > 0.)

    self.ll = cll[w]
    self.iv = civ[w]
    for name, values in coadded.items():
        total = sp.zeros(n_grid)
        weighted = sp.bincount(bins, weights=iv * values)
        total[:len(weighted)] += weighted
        setattr(self, name, total[w] / civ[w])

    # recompute means of quality variables
    if self.reso is not None:
        self.mean_reso = self.reso.mean()
    err = 1. / sp.sqrt(self.iv)
    self.mean_SNR = (self.fl / err).mean()
    lam_lya = constants.absorber_IGM["LYA"]
    # NOTE(review): uses the first and last element of the *concatenated*
    # array, not the extremes of the coadded grid - confirm this is meant.
    first, last = ll[0], ll[len(ll) - 1]
    self.mean_z = (sp.power(10., last) + sp.power(10., first)) / 2. / lam_lya - 1.0
    return self
def fill_dmat(l1, r1, rdm1, z1, w1, r2, rdm2, z2, w2, ang, wdm, dm, rpeff, rteff, zeff, weff):
    """Accumulate one object/neighbour contribution into the output arrays.

    Nothing is returned; ``wdm``, ``dm``, ``rpeff``, ``rteff``, ``zeff`` and
    ``weff`` are updated in place.

    Pairs are kept when ``rp_min < rp < rp_max`` and ``rt < rt_max``.  Each
    pair gets a bin on the ``np x nt`` grid (``bins``) and one on the
    ``npm x ntm`` grid (``m_bins``).

    * ``wdm`` receives the pair weights per ``bins``.
    * ``rpeff``, ``rteff``, ``zeff`` and ``weff`` receive the weighted
      ``rp``, ``rt``, mean ``z`` and the weights per ``m_bins``.
    * ``dm`` is indexed as ``m_bin + npm * ntm * bin``: each pair adds its
      weight at its own ``(bin, m_bin)`` and subtracts, for every distinct
      ``m_bin`` present, a term built from ``eta2`` (weights of object 1
      normalised by their sum) and ``eta4`` (weights times the centred
      ``l1``, normalised by the weighted sum of squares).
    """
    half_ang = ang / 2
    rp = (r1[:, None] - r2) * sp.cos(half_ang)
    rt = (rdm1[:, None] + rdm2) * sp.sin(half_ang)
    z = (z1[:, None] + z2) / 2.
    w = (rp > rp_min) & (rp < rp_max) & (rt < rt_max)

    bp = ((rp - rp_min) / (rp_max - rp_min) * np).astype(int)
    bt = (rt / rt_max * nt).astype(int)
    bins = (bt + nt * bp)[w]

    m_bp = ((rp - rp_min) / (rp_max - rp_min) * npm).astype(int)
    m_bt = (rt / rt_max * ntm).astype(int)
    m_bins = (m_bt + ntm * m_bp)[w]

    sw1 = w1.sum()
    dl1 = l1 - sp.average(l1, weights=w1)
    slw1 = (w1 * dl1**2).sum()

    n1 = len(l1)
    n2 = len(r2)
    # flat pair index: i + n1 * j for pixel i of object 1, element j of 2
    ij = (sp.arange(n1)[:, None] + n1 * sp.arange(n2))[w]
    we = (w1[:, None] * w2)[w]

    c = sp.bincount(bins, weights=we)
    wdm[:len(c)] += c

    n_model = npm * ntm
    eta2 = sp.zeros(n_model * n2)
    eta4 = sp.zeros(n_model * n2)

    for target, quantity in ((rpeff, rp), (rteff, rt), (zeff, z)):
        c = sp.bincount(m_bins, weights=we * quantity[w])
        target[:c.size] += c
    c = sp.bincount(m_bins, weights=we)
    weff[:c.size] += c

    eta_bins = (ij - ij % n1) // n1 + n2 * m_bins
    c = sp.bincount(eta_bins, weights=(w1[:, None] * sp.ones(n2))[w] / sw1)
    eta2[:len(c)] += c
    c = sp.bincount(eta_bins, weights=((w1 * dl1)[:, None] * sp.ones(n2))[w] / slw1)
    eta4[:len(c)] += c

    ubb = sp.unique(m_bins)
    for k, (ba, m_ba) in enumerate(zip(bins, m_bins)):
        dm[m_ba + n_model * ba] += we[k]
        i = ij[k] % n1
        j = (ij[k] - i) // n1
        for bb in ubb:
            dm[bb + n_model * ba] -= we[k] * (eta2[j + n2 * bb] + eta4[j + n2 * bb] * dl1[i])
def __init__(self, kdim, depth, algo, seed, codes):
    """Store the parameters and index the elements by code.

    ``indptr`` is the cumulative count of elements per code (CSC-style
    column pointer, ``nr_codes + 1`` entries) and ``indices`` lists the
    element positions ordered by code, ties kept in input order.  Only
    ``kdim == 2`` is accepted (checked with an assertion).
    """
    assert(kdim == 2)
    self.kdim = kdim
    self.depth = depth
    self.algo = algo
    self.seed = seed
    self.codes = codes

    per_code = sp.bincount(codes + 1, minlength=(self.nr_codes + 1))
    self.indptr = sp.cumsum(per_code, dtype=sp.uint64)
    # adding the position to ``code * nr_elements`` makes the sort key
    # unique per element
    sort_key = codes * sp.float64(self.nr_elements) + sp.arange(self.nr_elements)
    self.indices = sp.argsort(sort_key)
def is_near_constant(self, pid, min_num_diff=10):
    """Tell whether phenotype ``pid`` is (nearly) constant.

    The values are standardised, scaled by 50, shifted to start at 0.1 and
    rounded to integers.  The phenotype counts as near constant when the
    most frequent rounded value occurs more than
    ``len(values) - min_num_diff`` times, or when the standard deviation is
    not positive.
    """
    vals = sp.array(self.phen_dict[pid]["values"])
    if not sp.std(vals) > 0:
        return True

    scaled = 50 * (vals - sp.mean(vals)) / sp.std(vals)
    scaled = scaled - scaled.min() + 0.1
    counts = sp.bincount(sp.array(sp.around(scaled), dtype="int"))
    return counts.max() > len(scaled) - min_num_diff
def is_near_constant(self, min_num_diff=10):
    """Tell whether ``self.values`` is (nearly) constant.

    The values are standardised, scaled by 50, shifted to start at 0.1 and
    rounded to integers.  They count as near constant when the most
    frequent rounded value occurs more than
    ``len(values) - min_num_diff`` times, or when the standard deviation is
    not positive.
    """
    vals = sp.array(self.values)
    if not sp.std(vals) > 0:
        return True

    scaled = 50 * (vals - sp.mean(vals)) / sp.std(vals)
    scaled = scaled - scaled.min() + 0.1
    counts = sp.bincount(sp.array(sp.around(scaled), dtype='int'))
    return counts.max() > len(scaled) - min_num_diff
def plot_marker_box_plot(self, pid, marker, m_accessions, m_position=None, m_chromosome=None, plot_file=None,
                         plot_format='png', title=None, m_score=None):
    """
    Plots a box plot for the given binary marker and phenotype.

    Assumes the marker is integer based.
    Assumes the marker and the phenotype accessions are aligned.

    One box is drawn per distinct marker value, positioned at that value,
    with the number of observations written below it.  The figure is saved
    to ``plot_file`` (format ``plot_format``) when given, otherwise shown,
    and then cleared.

    Raises ``Exception`` when ``m_accessions`` and the phenotype values
    differ in length.
    """
    phen_vals = self.get_values(pid)
    if len(m_accessions) != len(phen_vals):
        raise Exception('Number of marker accessions (%d) differs from number of phenotype values (%d)'
                        % (len(m_accessions), len(phen_vals)))

    nt_counts = sp.bincount(marker)
    if len(nt_counts) > 2:
        import warnings
        warnings.warn("More than 2 alleles, box-plot might be wrong?")

    # Group phenotype values (and accessions) by allele.
    allele_phen_val_dict = {}
    for nt in set(marker):
        allele_phen_val_dict[nt] = {'values': [], 'ecotypes': []}
    for i, nt in enumerate(marker):
        allele_phen_val_dict[nt]['values'].append(phen_vals[i])
        if m_accessions:
            allele_phen_val_dict[nt]['ecotypes'].append(m_accessions[i])

    xs = []
    positions = []
    for nt in allele_phen_val_dict:
        positions.append(nt)
        xs.append(allele_phen_val_dict[nt]['values'])

    plt.figure()
    plt.boxplot(xs, positions=positions)
    min_val = min(phen_vals)
    max_val = max(phen_vals)
    val_range = max_val - min_val
    max_pos = max(positions)
    min_pos = min(positions)
    x_range = max_pos - min_pos
    plt.axis([min_pos - 0.5 * x_range, max_pos + 0.5 * x_range,
              min_val - val_range * 0.3, max_val + val_range * 0.3])
    plt.text(min_pos - 0.45 * x_range, min_val - 0.15 * val_range, "# of obs.: ", color='k')
    # Built-in zip replaces itertools.izip, which only exists on Python 2.
    for x, pos in zip(xs, positions):
        plt.text(pos - 0.05, min_val - 0.15 * val_range, str(len(x)), color='k')
    if m_score:
        plt.text(min_pos + 0.13 * x_range, max_val + 0.15 * val_range,
                 '$-log_{10}$(p-value)/score: %0.2f' % m_score, color='k')
    if title:
        plt.title(title)
    elif m_chromosome and m_position:
        plt.title('%s : chromosome=%d, position=%d' % (self.get_name(pid), m_chromosome, m_position))
    if plot_file:
        plt.savefig(plot_file, format=plot_format)
    else:
        plt.show()
    plt.clf()
def weight_angular(catalogue, nside=nside):
    """Normalise ``catalogue['Weight']`` per HEALPix pixel and return a binner.

    Positions are converted to HEALPix pixels (ring ordering) at resolution
    ``nside``.  For every ``iaddbin`` in ``range(catalogue.attrs['naddbins'])``
    the weights of the selected objects are divided by the summed weight of
    their pixel.

    Returns
    -------
    attrs : dict
        ``{'nside': nside, 'nbins': nbins}`` where ``nbins`` is the number
        of pixels containing at least one object.
    bin : callable
        Maps a catalogue to the compact index of the pixel of each object
        (-1 for pixels that were empty here).
    """
    self.logger.info('Angular integral constraint.')
    import healpy

    pixarea = healpy.nside2pixarea(nside, degrees=True)
    npix = healpy.nside2npix(nside)
    self.logger.info(
        'Pixels with nside = {:d}: {:.1f} square degree ({:d}).'.
        format(nside, pixarea, npix))

    # weights
    theta, phi = healpy.vec2ang(catalogue['Position'])
    ra = phi / constants.degree
    dec = 90. - theta / constants.degree
    self.logger.info(
        'RA x DEC: [{:.1f}, {:.1f}] x [{:.1f}, {:.1f}].'.format(
            ra.min(), ra.max(), dec.min(), dec.max()))

    pix = healpy.ang2pix(nside, theta, phi, nest=False)
    occupied = scipy.bincount(pix, minlength=npix) > 0
    nbins = occupied.sum()
    self.logger.info(
        'There are {:d} pixels with an average of {:.1f} objects.'.
        format(nbins, len(catalogue) * 1. / nbins))

    # compact numbering of the occupied pixels, -1 elsewhere
    pixtoibin = -scipy.ones((npix), dtype=scipy.int64)
    pixtoibin[occupied] = scipy.arange(nbins)

    for iaddbin in range(catalogue.attrs['naddbins']):
        mask = catalogue['iaddbin'] == iaddbin
        wcounts = scipy.bincount(pix[mask], weights=catalogue['Weight'][mask])
        catalogue['Weight'][mask] /= wcounts[pix[mask]]

    attrs = {'nside': nside, 'nbins': nbins}

    def bin(catalogue):
        theta, phi = healpy.vec2ang(catalogue['Position'])
        return pixtoibin[healpy.ang2pix(nside, theta, phi, nest=False)]

    return attrs, bin
def get_mafs(self):
    """Return minor allele counts and frequencies for every SNP.

    For ``'binary'`` and ``'int'`` data the count is the smallest entry of
    ``bincount(snp)``.  For ``'diploid_int'`` data the counts of genotypes
    0 and 2 are each increased by half the count of genotype 1 and the
    smaller of the two is used.  Frequencies are the counts divided by
    ``len(self.accessions)``.

    Returns a dict with keys ``"macs"`` and ``"mafs"``; raises
    ``NotImplementedError`` for any other data format.
    """
    num_nts = len(self.accessions)
    data_format = self.data_format

    if data_format in ['binary', 'int']:
        macs = [min(scipy.bincount(snp)) for snp in self.get_snps_iterator()]
    elif data_format == 'diploid_int':
        macs = []
        for snp in self.get_snps_iterator():
            bin_counts = scipy.bincount(snp, minlength=3)
            allele_counts = scipy.array([bin_counts[0], bin_counts[2]]) + bin_counts[1] / 2.0
            macs.append(allele_counts.min())
    else:
        raise NotImplementedError

    mafs = [mac / float(num_nts) for mac in macs]
    return {"macs": macs, "mafs": mafs}
def coo_to_csr(coo):
    """Convert a COO matrix into raw CSR arrays.

    Returns ``(indptr, indices, data)``: the ``uint64`` row pointer
    (``nr_rows + 1`` entries), the ``uint32`` column indices and the values
    cast to ``dtype`` (a name taken from the enclosing scope), with entries
    ordered by row and then by column.
    """
    nr_rows, nr_cols = coo.shape[0], coo.shape[1]
    nnz = coo.data.shape[0]
    row, col, val = coo.row, coo.col, coo.data

    entries_per_row = sp.bincount(row + 1, minlength=(nr_rows + 1))
    indptr = sp.cumsum(entries_per_row, dtype=sp.uint64)

    # row-major order: sort on the flattened (row, col) position
    order = sp.argsort(row * sp.float64(nr_cols) + col)

    indices = sp.zeros(nnz, dtype=sp.uint32)
    indices[:] = col[order]
    data = sp.zeros(nnz, dtype=dtype)
    data[:] = val[order]
    return indptr, indices, data
def mc(data):
    """Weighted mean of ``fl / co`` in 100 rest-frame bins.

    The bins split ``[forest.lmin_rest, forest.lmax_rest]`` evenly; each
    pixel is assigned using ``d.ll - log10(1 + d.zqso)``.  Weights combine
    ``d.iv``, ``d.co`` and ``forest.var_lss(d.ll)``.

    Returns
    -------
    (ll, mcont) : tuple of ndarray
        Bin centres and the weighted mean per bin, divided by its own mean
        over all bins.
    """
    nmc = 100
    rest_span = forest.lmax_rest - forest.lmin_rest
    mcont = sp.zeros(nmc)
    wcont = sp.zeros(nmc)
    ll = forest.lmin_rest + (sp.arange(nmc) + .5) * rest_span / nmc

    for p in data:
        for d in data[p]:
            ll_rest_offset = d.ll - forest.lmin_rest - sp.log10(1 + d.zqso)
            bins = (ll_rest_offset / rest_span * nmc).astype(int)
            var_lss = forest.var_lss(d.ll)
            we = d.iv / var_lss * d.co**2 / (d.iv + d.co**2 / var_lss)

            weighted_sum = sp.bincount(bins, weights=d.fl / d.co * we)
            mcont[:len(weighted_sum)] += weighted_sum
            weight_sum = sp.bincount(bins, weights=we)
            wcont[:len(weight_sum)] += weight_sum

    filled = wcont > 0
    mcont[filled] /= wcont[filled]
    mcont /= mcont.mean()
    return ll, mcont
def fast_xcf(z1, r1, rdm1, w1, d1, z2, r2, rdm2, w2, ang):
    """Histogram cross pair quantities in flat (rp, rt) bins.

    With ``ang_correlation`` set, ``rp`` is the ratio ``r1 / r2`` and
    ``rt`` the angle itself; otherwise they are built from the distance
    arrays and the half angle.  Pairs are kept when
    ``rp_min < rp < rp_max`` and ``rt < rt_max``.

    Returns
    -------
    tuple of ndarray
        ``(cw, cd, crp, crt, cz, cnb)``: per-bin sum of weights, weighted
        sum of ``d1``, weighted sums of ``rp``, ``rt`` and mean ``z``, and
        the number of pairs with strictly positive weight.
    """
    if ang_correlation:
        rp = r1[:, None] / r2
        rt = ang * sp.ones_like(rp)
    else:
        half_ang = ang / 2
        rp = (r1[:, None] - r2) * sp.cos(half_ang)
        rt = (rdm1[:, None] + rdm2) * sp.sin(half_ang)
    z = (z1[:, None] + z2) / 2
    we = w1[:, None] * w2
    wde = (w1 * d1)[:, None] * w2

    keep = (rp > rp_min) & (rp < rp_max) & (rt < rt_max)
    rp, rt, z = rp[keep], rt[keep], z[keep]
    we, wde = we[keep], wde[keep]

    bp = ((rp - rp_min) / (rp_max - rp_min) * np).astype(int)
    bt = (rt / rt_max * nt).astype(int)
    bins = bt + nt * bp

    cd = sp.bincount(bins, weights=wde)
    cw = sp.bincount(bins, weights=we)
    crp = sp.bincount(bins, weights=rp * we)
    crt = sp.bincount(bins, weights=rt * we)
    cz = sp.bincount(bins, weights=z * we)
    cnb = sp.bincount(bins, weights=(we > 0.))
    return cw, cd, crp, crt, cz, cnb
def filter_mac_snps(self, min_mac=10):
    """
    Removes SNPs whose minor allele count is below ``min_mac``.

    The indices of those SNPs are passed to ``self.filter_snps_ix``.  Data
    formats other than 'binary', 'int' and 'diploid_int' get a count of 0
    for every SNP.  Returns ``(num_snps, number_removed)`` where
    ``num_snps`` is read before filtering.
    """
    num_snps = self.num_snps
    snps_ix = []
    for i, snp in enumerate(self.get_snps_iterator()):
        if self.data_format in ['binary', 'int']:
            mac = scipy.bincount(snp).min()
        elif self.data_format == 'diploid_int':
            bin_counts = scipy.bincount(snp, minlength=3)
            allele_counts = scipy.array([bin_counts[0], bin_counts[2]]) + bin_counts[1] / 2.0
            mac = allele_counts.min()
        else:
            mac = 0
        if mac < min_mac:
            snps_ix.append(i)

    numRemoved = len(snps_ix)
    self.filter_snps_ix(snps_ix)
    log.info("Removed %d SNPs with mac below %d, out of %d SNPs in total." % (numRemoved, min_mac, num_snps))
    return (num_snps, numRemoved)
def convert_codes_to_csc_matrix(codes, depth):
    """Build the element-by-code indicator matrix in CSC format.

    Parameters
    ----------
    codes : ndarray of int
        Code of every element; values must lie in ``[0, 2**depth)``.
    depth : int
        The matrix has ``2**depth`` columns.

    Returns
    -------
    scipy.sparse.csc_matrix
        ``float32`` matrix of shape ``(len(codes), 2**depth)`` with a one at
        ``(i, codes[i])``.
    """
    nr_codes = 1 << depth
    nr_elements = len(codes)

    per_code = sp.bincount(codes + 1, minlength=(nr_codes + 1))
    indptr = sp.cumsum(per_code, dtype=sp.uint64)
    # adding the position to ``code * nr_elements`` makes the sort key
    # unique per element
    sort_key = codes * sp.float64(nr_elements) + sp.arange(nr_elements)
    indices = sp.argsort(sort_key)
    values = sp.ones_like(indices, dtype=sp.float32)

    return smat.csc_matrix((values, indices, indptr), shape=(nr_elements, nr_codes))
def vertex_degrees(self):
    """Computes vertex degrees diagonal matrix d(v)=sum(w(e)), where e in E, v in e

    Each edge weight is counted once for every one of the ``self.k``
    vertices listed for that edge in ``self.edge_list``.

    Returns
    -------
    d_v: sparse diagonal matrix
        sparse diagonal vertex degree matrix
    """
    # one copy of each edge weight per vertex slot of that edge
    per_slot_weights = sp.repeat(self.weights, self.k)
    degrees = sp.bincount(self.edge_list.flatten(), weights=per_slot_weights)
    return spsp.diags(degrees)
def feval(self, x, average=True):
    '''Evaluate the loss at ``x`` on the training data.

    ``self.funcEval(x, self.data.train)`` is returned as is, or, when
    ``average`` equals ``True``, divided element-wise by the number of
    occurrences of each value in ``self.data.train[1]`` (reshaped to the
    shape of the loss).'''
    loss_fn = self.funcEval(x, self.data.train)
    if average == True:  # explicit comparison kept on purpose
        # number of occurrences of each value 0, 1, 2, ... in the labels
        n_vals = sp.bincount(self.data.train[1]).astype(float)
        return sp.divide(loss_fn, sp.reshape(n_vals, sp.shape(loss_fn)))
    return loss_fn
def calc_ibs_kinship(snps, snps_data_format='binary', snp_dtype='int8', dtype='single',
                     chunk_size=None, scaled=True):
    """
    Calculates IBS kinship

    Parameters
    ----------
    snps : sequence
        One entry per SNP, each holding one value per individual.
    snps_data_format : {'binary', 'diploid_int'}
        Two formats are currently supported; anything else raises
        ``NotImplementedError``.
    snp_dtype, dtype : optional
        dtypes of the SNP chunks and of the kinship accumulator.
    chunk_size : int, optional
        Number of SNPs processed at once; defaults to the number of
        individuals.
    scaled : bool, optional
        Pass the result through ``scale_k`` before returning it.

    Returns
    -------
    The kinship matrix.  A progress percentage is written to ``sys.stdout``
    after every chunk, followed by a newline.
    """
    num_snps = len(snps)
    num_lines = len(snps[0])
    if chunk_size is None:
        chunk_size = num_lines
    k_mat = sp.zeros((num_lines, num_lines), dtype=dtype)

    for snp_i in range(0, num_snps, chunk_size):
        snps_array = sp.array(snps[snp_i:snp_i + chunk_size], dtype=snp_dtype)
        snps_array = snps_array.T
        if snps_data_format == 'diploid_int':
            for i in range(num_lines):
                for j in range(i):
                    bin_counts = sp.bincount(sp.absolute(snps_array[j] - snps_array[i]))
                    if len(bin_counts) > 1:
                        k_mat[i, j] += (bin_counts[0] + 0.5 * bin_counts[1])
                    else:
                        k_mat[i, j] += bin_counts[0]
                    k_mat[j, i] = k_mat[i, j]
        elif snps_data_format == 'binary':
            sm = sp.mat(snps_array * 2.0 - 1.0)
            k_mat = k_mat + sm * sm.T
        else:
            raise NotImplementedError

        # SNPs processed so far (previously reported one chunk too many)
        num_done = min(num_snps, snp_i + chunk_size)
        sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' % (100.0 * num_done / num_snps))
        sys.stdout.flush()
    # function-call form works on Python 2 and 3
    print('')

    if snps_data_format == 'diploid_int':
        k_mat = k_mat / float(num_snps) + sp.eye(num_lines)
    elif snps_data_format == 'binary':
        k_mat = k_mat / (2 * float(num_snps)) + 0.5
    if scaled:
        k_mat = scale_k(k_mat)
    return k_mat
def calc_ibs_kinship(snps, snps_data_format='binary', snp_dtype='int8', dtype='single',
                     chunk_size=None, scaled=True):
    """
    Calculates IBS kinship

    Parameters
    ----------
    snps : sequence
        One entry per SNP, each holding one value per individual.
    snps_data_format : {'binary', 'diploid_int'}
        Two formats are currently supported; anything else raises
        ``NotImplementedError``.
    snp_dtype, dtype : optional
        dtypes of the SNP chunks and of the kinship accumulator.
    chunk_size : int, optional
        Number of SNPs processed at once; defaults to the number of
        individuals.
    scaled : bool, optional
        Pass the result through ``scale_k`` before returning it.

    Returns
    -------
    The kinship matrix.  A progress percentage is written to ``sys.stdout``
    after every chunk, followed by a newline.
    """
    num_snps = len(snps)
    num_lines = len(snps[0])
    if chunk_size is None:
        chunk_size = num_lines
    k_mat = sp.zeros((num_lines, num_lines), dtype=dtype)

    for snp_i in range(0, num_snps, chunk_size):
        snps_array = sp.array(snps[snp_i:snp_i + chunk_size], dtype=snp_dtype)
        snps_array = snps_array.T
        if snps_data_format == 'diploid_int':
            for i in range(num_lines):
                for j in range(i):
                    bin_counts = sp.bincount(sp.absolute(snps_array[j] - snps_array[i]))
                    if len(bin_counts) > 1:
                        k_mat[i, j] += (bin_counts[0] + 0.5 * bin_counts[1])
                    else:
                        k_mat[i, j] += bin_counts[0]
                    k_mat[j, i] = k_mat[i, j]
        elif snps_data_format == 'binary':
            sm = sp.mat(snps_array * 2.0 - 1.0)
            k_mat = k_mat + sm * sm.T
        else:
            raise NotImplementedError

        # SNPs processed so far (previously reported one chunk too many)
        num_done = min(num_snps, snp_i + chunk_size)
        sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' % (100.0 * num_done / num_snps))
        sys.stdout.flush()
    # function-call form works on Python 2 and 3
    print('')

    if snps_data_format == 'diploid_int':
        k_mat = k_mat / float(num_snps) + sp.eye(num_lines)
    elif snps_data_format == 'binary':
        k_mat = k_mat / (2 * float(num_snps)) + 0.5
    if scaled:
        k_mat = scale_k(k_mat)
    return k_mat
def calc_ibs_kinship(genotype, snp_dtype='int8', dtype='single', chunk_size=None):
    """
    Calculate an identity-by-state (IBS) kinship matrix for a genotype object.

    The SNPs are pulled from ``genotype.get_snps_iterator`` in chunks and
    accumulated into a square accessions-by-accessions matrix.  The data
    format is read from ``genotype.data_format``; 'binary' and
    'diploid_int' are supported.

    Parameters
    ----------
    genotype : object
        Must provide ``num_snps``, ``accessions``, ``data_format`` and
        ``get_snps_iterator(is_chunked=..., chunk_size=...)``.  Each chunk
        is converted to an array and transposed, so chunks are presumably
        laid out as SNPs-by-accessions -- confirm against the genotype class.
    snp_dtype : dtype
        dtype used when converting each chunk to an array.
    dtype : dtype
        dtype of the accumulator matrix.
    chunk_size : int or None
        Number of SNPs requested per chunk; defaults to the number of
        accessions.

    Returns
    -------
    The kinship matrix.  For 'diploid_int' the accumulated scores are
    divided by the number of SNPs and the identity is added; for 'binary'
    they are divided by twice the number of SNPs and 0.5 is added.

    Raises
    ------
    NotImplementedError
        If ``genotype.data_format`` is not one of the supported formats.
    """
    num_snps = genotype.num_snps
    num_lines = len(genotype.accessions)
    if chunk_size is None:
        chunk_size = num_lines
    k_mat = sp.zeros((num_lines, num_lines), dtype=dtype)
    log.info('Starting calculation of IBS kinship')
    chunk_i = 0
    snps = genotype.get_snps_iterator(is_chunked=True, chunk_size=chunk_size)
    snps_data_format = genotype.data_format
    # Fail before iterating rather than on the first chunk (or never, if the
    # iterator happens to be empty).
    if snps_data_format not in ('binary', 'diploid_int'):
        raise NotImplementedError('Unsupported data format: %r' % (snps_data_format,))

    for snps_chunk in snps:
        chunk_i += 1
        # Transpose so that each row holds the genotypes of one accession.
        snps_array = sp.array(snps_chunk, dtype=snp_dtype).T
        if snps_data_format == 'diploid_int':
            for i in range(num_lines):
                for j in range(i):
                    bin_counts = sp.bincount(sp.absolute(snps_array[j] - snps_array[i]))
                    score = bin_counts[0]
                    if len(bin_counts) > 1:
                        score = score + 0.5 * bin_counts[1]
                    k_mat[i, j] += score
                    # Mirror into the upper triangle; the diagonal is filled
                    # by the identity added after the loop.
                    k_mat[j, i] = k_mat[i, j]
        else:
            sm = sp.mat(snps_array * 2.0 - 1.0)
            k_mat = k_mat + sm * sm.T
        # chunk_i already counts the chunk that was just processed.
        log.debug('%0.2f%%' % (100.0 * min(1.0, chunk_i * chunk_size / float(num_snps))))

    if snps_data_format == 'diploid_int':
        k_mat = k_mat / float(num_snps) + sp.eye(num_lines)
    else:
        k_mat = k_mat / (2 * float(num_snps)) + 0.5
    log.info('Finished calculation')
    return k_mat
def run(self, nbins=25):
    r"""
    Computes the pore size function of the image.

    This method calculates the distance transform of the void space, then
    computes a histogram of the occurrences of each distance value.

    Parameters
    ----------
    nbins : int
        The number of bins into which the distance values should be
        sorted.  The default is 25.

    Returns
    -------
    PoreSizeFunction : namedtuple
        ``distance`` holds the ``nbins`` bin edges, linearly spaced from 1
        to the largest distance found; ``frequency`` holds the number of
        distance values assigned to each edge by ``digitize``.

    Notes
    -----
    ``self.image`` is used both as the input of the distance transform and
    as the mask selecting the values to histogram, so it is expected to be
    a boolean array.
    """
    temp_img = spim.distance_transform_edt(self.image)
    dvals = temp_img[self.image].flatten()
    rmax = sp.amax(dvals)
    bins = sp.linspace(1, rmax, nbins)
    binned = sp.digitize(x=dvals, bins=bins)
    # Index 0 of the bincount collects values below the first edge; it is
    # dropped so that ``frequency`` lines up with ``distance``.
    frequency = sp.bincount(binned, minlength=nbins)[1:]
    # Return an instance; assigning the fields on the namedtuple class
    # itself would hand back the class and share state between calls.
    PoreSizeFunction = namedtuple('PoreSizeFunction', ('distance', 'frequency'))
    return PoreSizeFunction(distance=bins, frequency=frequency)
def knn(trainpoints, traincats, testpoints, k):
    """Predict a category for every test point with k nearest neighbours.

    Neighbours are ranked by cosine distance to the training points.  Each
    test point receives the category that occurs most often among its k
    closest training points; ties go to the smallest category value because
    the winner is picked with argmax over the category counts.

    Returns a 1-d float array holding one predicted category per test point.
    """
    # Rows are test points, columns are training points.
    distances = cdist(testpoints, trainpoints, 'cosine')
    print('Computed pairwise distances')
    # Per test row: column indices of the k closest training points.
    nearest = scipy.argsort(distances, axis=1)[:, :k]
    print('Sorted distances')
    predictions = scipy.zeros(distances.shape[0])
    for row_idx, neighbour_idx in enumerate(nearest):
        votes = scipy.bincount(traincats[neighbour_idx])
        predictions[row_idx] = scipy.argmax(votes)
    return predictions
def coordinate_genot_ss(genotype_file=None, hdf5_file=None, genetic_map_dir=None, check_mafs=False, min_maf=0.01):
    """
    Coordinate a PLINK genotype file with the summary statistics stored in
    an HDF5 file, one chromosome at a time.

    Assumes plink BED files.  (The original note also says that missing
    genotypes are imputed; that would have to happen inside
    ``_parse_plink_snps_``, which is not visible here.)

    For every chromosome present in both sources the function
      * keeps the SNP ids found in both datasets,
      * drops SNPs whose nucleotides are ambiguous, invalid or cannot be
        matched, negating the effect sizes when the alleles are swapped,
      * optionally drops SNPs whose frequency disagrees with the summary
        statistics by more than 0.15 (``check_mafs``),
      * drops SNPs failing the ``min_maf`` filter,
      * writes the coordinated arrays to ``hdf5_file['cord_data']``.

    Parameters
    ----------
    genotype_file : path prefix handed to ``plinkfile.PlinkFile``.
    hdf5_file : open HDF5 file that already contains a 'sum_stats' group.
    genetic_map_dir : directory prefix of the
        'chr%d.interpolated_genetic_map.gz' files, or None to skip them.
    check_mafs : whether to apply the frequency-discrepancy filter.
    min_maf : minor allele frequency threshold.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()
    num_individs = len(samples)
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]

    unique_phens = sp.unique(Y)
    if len(unique_phens) == 1:
        print('Unable to find phenotype values.')
        has_phenotype = False
    elif len(unique_phens) == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        print('Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1]))
        has_phenotype = True
    else:
        print('Found quantitative phenotype values')
        has_phenotype = True

    risk_scores = sp.zeros(num_individs)
    rb_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    corr_list = []
    rb_corr_list = []

    if has_phenotype:
        hdf5_file.create_dataset('y', data=Y)
    hdf5_file.create_dataset('fids', data=fids)
    hdf5_file.create_dataset('iids', data=iids)
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    # Figure out chromosomes and positions by looking at SNPs.
    loci = plinkf.get_loci()
    plinkf.close()
    gf_chromosomes = [l.chromosome for l in loci]
    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()
    chr_dict = _get_chrom_dict_(loci, chromosomes)

    tot_num_non_matching_nts = 0
    for chrom in chromosomes:
        chr_str = 'chrom_%d' % chrom
        print('Working on chromsome: %s' % chr_str)

        chrom_d = chr_dict[chr_str]
        try:
            ssg = ssf['chrom_%d' % chrom]
        except Exception as err_str:
            print(err_str)
            print('Did not find chromsome in SS dataset.')
            print('Continuing.')
            continue

        g_sids = chrom_d['sids']
        g_sid_set = set(g_sids)
        assert len(g_sid_set) == len(g_sids), 'Some duplicates?'
        ss_sids = ssg['sids'][...]
        ss_sid_set = set(ss_sids)
        assert len(ss_sid_set) == len(ss_sids), 'Some duplicates?'

        # Figure out filters:
        g_filter = sp.in1d(g_sids, ss_sids)
        ss_filter = sp.in1d(ss_sids, g_sids)

        # Order by SNP IDs, so that the shared SNPs line up pairwise.
        g_order = sp.argsort(g_sids)
        ss_order = sp.argsort(ss_sids)
        g_indices = [g_i for g_i in g_order if g_filter[g_i]]
        ss_indices = [ss_i for ss_i in ss_order if ss_filter[ss_i]]

        g_nts = chrom_d['nts']
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]
        assert not sp.any(sp.isnan(betas)), 'WTF?'
        assert not sp.any(sp.isinf(betas)), 'WTF?'

        num_non_matching_nts = 0
        num_ambig_nts = 0
        ok_nts = []
        print('Found %d SNPs present in both datasets' % (len(g_indices)))

        # Looked up once per chromosome instead of once per SNP.
        has_freqs = 'freqs' in ssg.keys()
        if has_freqs:
            ss_freqs = ssg['freqs'][...]

        ok_indices = {'g': [], 'ss': []}
        for g_i, ss_i in zip(g_indices, ss_indices):
            # Is the nucleotide ambiguous?
            g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
            if tuple(g_nt) in ambig_nts:
                num_ambig_nts += 1
                tot_num_non_matching_nts += 1
                continue

            # First check if nucleotide is sane?
            if (not g_nt[0] in valid_nts) or (not g_nt[1] in valid_nts):
                num_non_matching_nts += 1
                tot_num_non_matching_nts += 1
                continue

            ss_nt = ss_nts[ss_i]
            # Are the nucleotides the same (on either strand)?
            os_g_nt = sp.array([opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])
            if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
                # Same pair with the two alleles swapped, on either strand?
                flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                if flip_nts:
                    betas[ss_i] = -betas[ss_i]
                    log_odds[ss_i] = -log_odds[ss_i]
                    if has_freqs:
                        ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                else:
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue

            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['ss'].append(ss_i)
            ok_nts.append(g_nt)

        print('%d SNPs were excluded due to ambiguous nucleotides.' % num_ambig_nts)
        print('%d SNPs were excluded due to non-matching nucleotides.' % num_non_matching_nts)

        # Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]
        order = sp.argsort(positions)
        ok_indices['g'] = list(sp.array(ok_indices['g'])[order])
        ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order])
        positions = positions[order]

        # Parse SNPs; snp_indices pinpoint where the SNPs are in the file.
        snp_indices = sp.array(chrom_d['snp_indices'])
        snp_indices = snp_indices[ok_indices['g']]
        raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices)
        print('raw_snps.shape= %s' % (raw_snps.shape,))

        # Mean and standard deviation derived from the allele frequencies
        # rather than measured on raw_snps.
        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_indices['ss']]
        log_odds = log_odds[ok_indices['ss']]
        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)[order]
        sids = ssg['sids'][...][ok_indices['ss']]

        # Check SNP frequencies..
        if check_mafs and has_freqs:
            ss_freqs = ss_freqs[ok_indices['ss']]
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            if sp.any(freq_discrepancy_snp):
                print('Warning: %d SNPs appear to have high frequency discrepancy between summary statistics and validation sample' % sp.sum(freq_discrepancy_snp))
                print(freqs[freq_discrepancy_snp])
                print(ss_freqs[freq_discrepancy_snp])

                # Filter freq_discrepancy_snps.  Use logical inversion:
                # numeric negation of a boolean mask raises TypeError on
                # NumPy >= 1.13.
                ok_freq_snps = ~freq_discrepancy_snp
                (raw_snps, snp_stds, snp_means, freqs, ps, positions, nts, sids, betas, log_odds) = [
                    arr[ok_freq_snps] for arr in
                    (raw_snps, snp_stds, snp_means, freqs, ps, positions, nts, sids, betas, log_odds)]

        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "WTF?"
        if maf_filter_sum < n_snps:
            (raw_snps, snp_stds, snp_means, freqs, ps, positions, nts, sids, betas, log_odds) = [
                arr[maf_filter] for arr in
                (raw_snps, snp_stds, snp_means, freqs, ps, positions, nts, sids, betas, log_odds)]
            print('%d SNPs with MAF < %0.3f were filtered' % (n_snps - maf_filter_sum, min_maf))

        print('%d SNPs were retained on chromosome %d.' % (maf_filter_sum, chrom))

        rb_prs = sp.dot(sp.transpose(raw_snps), log_odds)
        if has_phenotype:
            print('Normalizing SNPs')
            # Column vectors so the per-SNP mean/std broadcast over samples.
            snp_means.shape = (len(raw_snps), 1)
            snp_stds.shape = (len(raw_snps), 1)
            snps = (raw_snps - snp_means) / snp_stds
            assert snps.shape == raw_snps.shape, 'Aha!'
            snp_stds = snp_stds.flatten()
            snp_means = snp_means.flatten()
            prs = sp.dot(sp.transpose(snps), betas)
            corr = sp.corrcoef(Y, prs)[0, 1]
            corr_list.append(corr)
            print('PRS correlation for chromosome %d was %0.4f' % (chrom, corr))
            rb_corr = sp.corrcoef(Y, rb_prs)[0, 1]
            rb_corr_list.append(rb_corr)
            print('Raw effect sizes PRS correlation for chromosome %d was %0.4f' % (chrom, rb_corr))

        sid_set = set(sids)
        if genetic_map_dir is not None:
            genetic_map = []
            with gzip.open(genetic_map_dir + 'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    l = line.split()
                    if l[0] in sid_set:
                        genetic_map.append(l[0])

        print('Now storing coordinated data to HDF5 file.')
        ofg = cord_data_g.create_group('chrom_%d' % chrom)
        ofg.create_dataset('raw_snps_ref', data=raw_snps, compression='lzf')
        ofg.create_dataset('snp_stds_ref', data=snp_stds)
        ofg.create_dataset('snp_means_ref', data=snp_means)
        ofg.create_dataset('freqs_ref', data=freqs)
        ofg.create_dataset('ps', data=ps)
        ofg.create_dataset('positions', data=positions)
        ofg.create_dataset('nts', data=nts)
        ofg.create_dataset('sids', data=sids)
        if genetic_map_dir is not None:
            ofg.create_dataset('genetic_map', data=genetic_map)
        ofg.create_dataset('betas', data=betas)
        ofg.create_dataset('log_odds', data=log_odds)
        ofg.create_dataset('log_odds_prs', data=rb_prs)
        # prs only exists when a phenotype was found.
        if has_phenotype:
            risk_scores += prs
        rb_risk_scores += rb_prs
        num_common_snps += len(betas)
def coordinate_genotypes_ss_w_ld_ref(genotype_file=None, reference_genotype_file=None, hdf5_file=None, genetic_map_dir=None, check_mafs=False, min_maf=0.01):
    """
    Coordinate a validation genotype file, an LD reference genotype file and
    the summary statistics stored in an HDF5 file, one chromosome at a time.

    For every chromosome present in the summary statistics the function
      * keeps the SNP ids shared by all three datasets, ordered by their
        position in the validation data,
      * drops SNPs whose nucleotides are ambiguous, invalid or cannot be
        matched, negating the effect sizes when the summary-statistics
        alleles are swapped,
      * optionally drops SNPs whose frequency disagrees with the summary
        statistics by more than 0.15 (``check_mafs``),
      * drops SNPs failing the ``min_maf`` filter,
      * writes the coordinated arrays to ``hdf5_file['cord_data']``.

    Parameters
    ----------
    genotype_file : path prefix of the validation genotypes
        (handed to ``plinkfile.PlinkFile``).
    reference_genotype_file : path prefix of the LD reference genotypes.
    hdf5_file : open HDF5 file that already contains a 'sum_stats' group
        and does not yet contain 'iids'.
    genetic_map_dir : directory prefix of the
        'chr%d.interpolated_genetic_map.gz' files, or None to skip them.
    check_mafs : whether to apply the frequency-discrepancy filter.
    min_maf : minor allele frequency threshold.
    """
    print('Coordinating things w genotype file: %s \nref. genot. file: %s' % (genotype_file, reference_genotype_file))
    plinkf = plinkfile.PlinkFile(genotype_file)

    samples = plinkf.get_samples()
    num_individs = len(samples)
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]

    unique_phens = sp.unique(Y)
    if len(unique_phens) == 1:
        print('Unable to find phenotype values.')
        has_phenotype = False
    elif len(unique_phens) == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        print('Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1]))
        has_phenotype = True
    else:
        print('Found quantitative phenotype values')
        has_phenotype = True

    # Figure out chromosomes and positions.
    print('Parsing validation genotype bim file')
    loci = plinkf.get_loci()
    plinkf.close()
    gf_chromosomes = [l.chromosome for l in loci]
    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()
    chr_dict = _get_chrom_dict_(loci, chromosomes)

    print('Parsing LD reference genotype bim file')
    plinkf_ref = plinkfile.PlinkFile(reference_genotype_file)
    loci_ref = plinkf_ref.get_loci()
    plinkf_ref.close()
    chr_dict_ref = _get_chrom_dict_(loci_ref, chromosomes)

    # Open HDF5 file and prepare out data
    assert not 'iids' in hdf5_file.keys(), 'Something is wrong with the HDF5 file?'
    if has_phenotype:
        hdf5_file.create_dataset('y', data=Y)
    hdf5_file.create_dataset('fids', data=fids)
    hdf5_file.create_dataset('iids', data=iids)
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    maf_adj_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0

    tot_g_ss_nt_concord_count = 0
    tot_rg_ss_nt_concord_count = 0
    tot_g_rg_nt_concord_count = 0
    tot_num_non_matching_nts = 0

    # Now iterate over chromosomes
    for chrom in chromosomes:
        ok_indices = {'g': [], 'rg': [], 'ss': []}

        chr_str = 'chrom_%d' % chrom
        print('Working on chromsome: %s' % chr_str)

        chrom_d = chr_dict[chr_str]
        chrom_d_ref = chr_dict_ref[chr_str]
        try:
            ssg = ssf['chrom_%d' % chrom]
        except Exception as err_str:
            print(err_str)
            print('Did not find chromsome in SS dataset.')
            print('Continuing.')
            continue

        g_sids = chrom_d['sids']
        rg_sids = chrom_d_ref['sids']
        ss_sids = ssg['sids'][...]
        print('Found %d SNPs in validation data, %d SNPs in LD reference data, and %d SNPs in summary statistics.' % (len(g_sids), len(rg_sids), len(ss_sids)))
        common_sids = sp.intersect1d(ss_sids, g_sids)
        common_sids = sp.intersect1d(common_sids, rg_sids)
        print('Found %d SNPs on chrom %d that were common across all datasets' % (len(common_sids), chrom))

        # SNP id -> index lookups for each of the three datasets.
        ss_sid_dict = dict((sid, i) for i, sid in enumerate(ss_sids))
        g_sid_dict = dict((sid, i) for i, sid in enumerate(g_sids))
        rg_sid_dict = dict((sid, i) for i, sid in enumerate(rg_sids))

        g_snp_map = [g_sid_dict[sid] for sid in common_sids]

        # order by positions
        g_positions = sp.array(chrom_d['positions'])[g_snp_map]
        order = sp.argsort(g_positions)
        g_snp_map = sp.array(g_snp_map)[order]
        g_snp_map = g_snp_map.tolist()
        common_sids = sp.array(common_sids)[order]

        # Get the other two maps, in the same (position) order.
        rg_snp_map = [rg_sid_dict[sid] for sid in common_sids]
        ss_snp_map = [ss_sid_dict[sid] for sid in common_sids]

        g_nts = sp.array(chrom_d['nts'])
        rg_nts = sp.array(chrom_d_ref['nts'])
        rg_nts_ok = sp.array(rg_nts)[rg_snp_map]
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]

        # Looked up once per chromosome instead of once per SNP.
        has_freqs = 'freqs' in ssg.keys()
        if has_freqs:
            ss_freqs = ssg['freqs'][...]

        g_ss_nt_concord_count = sp.sum(g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0
        rg_ss_nt_concord_count = sp.sum(rg_nts_ok == ss_nts[ss_snp_map]) / 2.0
        g_rg_nt_concord_count = sp.sum(g_nts[g_snp_map] == rg_nts_ok) / 2.0
        print('Nucleotide concordance counts out of %d genotypes: vg-g: %d, vg-ss: %d, g-ss: %d' % (len(g_snp_map), g_rg_nt_concord_count, g_ss_nt_concord_count, rg_ss_nt_concord_count))
        tot_g_ss_nt_concord_count += g_ss_nt_concord_count
        tot_rg_ss_nt_concord_count += rg_ss_nt_concord_count
        tot_g_rg_nt_concord_count += g_rg_nt_concord_count

        num_non_matching_nts = 0
        num_ambig_nts = 0

        # Identifying which SNPs have nucleotides that are ok..
        ok_nts = []
        for g_i, rg_i, ss_i in zip(g_snp_map, rg_snp_map, ss_snp_map):
            # To make sure, is the SNP id the same?
            assert g_sids[g_i] == rg_sids[rg_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.'

            rg_nt = rg_nts[rg_i]
            ss_nt = ss_nts[ss_i]

            # Is the nucleotide ambiguous.
            g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
            if tuple(g_nt) in ambig_nts:
                num_ambig_nts += 1
                tot_num_non_matching_nts += 1
                continue

            # First check if nucleotide is sane?
            if (not g_nt[0] in valid_nts) or (not g_nt[1] in valid_nts):
                num_non_matching_nts += 1
                tot_num_non_matching_nts += 1
                continue

            os_g_nt = sp.array([opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])

            matches_ss = sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)
            matches_rg = sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt)
            if not (matches_ss and matches_rg):
                if matches_rg:
                    # Validation and reference agree; try flipping the SS nt.
                    flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                    if flip_nts:
                        betas[ss_i] = -betas[ss_i]
                        log_odds[ss_i] = -log_odds[ss_i]
                        if has_freqs:
                            ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                    else:
                        print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" %
                              (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt)))
                        num_non_matching_nts += 1
                        tot_num_non_matching_nts += 1
                        continue
                else:
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue

            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['rg'].append(rg_i)
            ok_indices['ss'].append(ss_i)
            ok_nts.append(g_nt)

        print('%d SNPs had ambiguous nucleotides.' % num_ambig_nts)
        print('%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts)
        print('%d SNPs were retained on chromosome %d.' % (len(ok_indices['g']), chrom))

        # The maps were built in position order above, so no re-sort here.
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]

        # Now parse SNPs; the indices pinpoint where the SNPs are in the file.
        snp_indices = sp.array(chrom_d['snp_indices'])
        snp_indices = snp_indices[ok_indices['g']]
        raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices)

        snp_indices_ref = sp.array(chrom_d_ref['snp_indices'])
        snp_indices_ref = snp_indices_ref[ok_indices['rg']]
        raw_ref_snps, freqs_ref = _parse_plink_snps_(reference_genotype_file, snp_indices_ref)

        snp_stds_ref = sp.sqrt(2 * freqs_ref * (1 - freqs_ref))
        snp_means_ref = freqs_ref * 2

        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_indices['ss']]
        log_odds = log_odds[ok_indices['ss']]

        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)
        sids = ssg['sids'][...][ok_indices['ss']]

        # Check SNP frequencies..
        if check_mafs and has_freqs:
            ss_freqs = ss_freqs[ok_indices['ss']]
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            if sp.any(freq_discrepancy_snp):
                print('Warning: %d SNPs were filtered due to high allele frequency discrepancy between summary statistics and validation sample' % sp.sum(freq_discrepancy_snp))

                # Filter freq_discrepancy_snps.  Use logical inversion:
                # numeric negation of a boolean mask raises TypeError on
                # NumPy >= 1.13.
                ok_freq_snps = ~freq_discrepancy_snp
                (raw_snps, snp_stds, snp_means, raw_ref_snps, snp_stds_ref, snp_means_ref,
                 freqs, freqs_ref, ps, positions, nts, sids, betas, log_odds) = [
                    arr[ok_freq_snps] for arr in
                    (raw_snps, snp_stds, snp_means, raw_ref_snps, snp_stds_ref, snp_means_ref,
                     freqs, freqs_ref, ps, positions, nts, sids, betas, log_odds)]

        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "WTF?"
        if maf_filter_sum < n_snps:
            (raw_snps, snp_stds, snp_means, raw_ref_snps, snp_stds_ref, snp_means_ref,
             freqs, freqs_ref, ps, positions, nts, sids, betas, log_odds) = [
                arr[maf_filter] for arr in
                (raw_snps, snp_stds, snp_means, raw_ref_snps, snp_stds_ref, snp_means_ref,
                 freqs, freqs_ref, ps, positions, nts, sids, betas, log_odds)]

        maf_adj_prs = sp.dot(log_odds, raw_snps)
        if has_phenotype:
            maf_adj_corr = sp.corrcoef(Y, maf_adj_prs)[0, 1]
            print('Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % (chrom, maf_adj_corr))

        genetic_map = []
        if genetic_map_dir is not None:
            # Build the lookup from the SNPs that survived all filters; the
            # name was previously read here without ever being assigned.
            sid_set = set(sids)
            with gzip.open(genetic_map_dir + 'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    l = line.split()
                    if l[0] in sid_set:
                        genetic_map.append(l[0])

        print('Now storing coordinated data to HDF5 file.')
        ofg = cord_data_g.create_group('chrom_%d' % chrom)
        ofg.create_dataset('raw_snps_val', data=raw_snps, compression='lzf')
        ofg.create_dataset('snp_stds_val', data=snp_stds)
        ofg.create_dataset('snp_means_val', data=snp_means)
        ofg.create_dataset('freqs_val', data=freqs)
        ofg.create_dataset('raw_snps_ref', data=raw_ref_snps, compression='lzf')
        ofg.create_dataset('snp_stds_ref', data=snp_stds_ref)
        ofg.create_dataset('snp_means_ref', data=snp_means_ref)
        ofg.create_dataset('freqs_ref', data=freqs_ref)
        ofg.create_dataset('nts', data=nts)
        ofg.create_dataset('ps', data=ps)
        ofg.create_dataset('positions', data=positions)
        ofg.create_dataset('sids', data=sids)
        if genetic_map_dir is not None:
            ofg.create_dataset('genetic_map', data=genetic_map)
        ofg.create_dataset('betas', data=betas)
        ofg.create_dataset('log_odds', data=log_odds)
        ofg.create_dataset('log_odds_prs', data=maf_adj_prs)

        maf_adj_risk_scores += maf_adj_prs
        num_common_snps += len(betas)
def find_neighbor_pores(self,pnums,flatten=True,mode='union',excl_self=False):
    r"""
    Returns a list of pores neighboring the given pore(s)

    Parameters
    ----------
    pnums : array_like
        ID numbers of pores whose neighbors are sought.
    flatten : boolean, optional
        If flatten is True (default) a 1D array of unique pore ID numbers
        is returned.  If flatten is False the returned array contains
        arrays of neighboring pores for each input pore, in the order they
        were sent.
    excl_self : bool, optional
        If this is True then the input pores are removed from the returned
        list.  The default is False.  This option only matters when input
        pores are in fact neighbors to each other, and only applies when
        flatten is True.
    mode : string, optional
        Specifies which neighbors should be returned.  The options are:

        * 'union' : All neighbors of the input pores

        * 'intersection' : Only neighbors that appear in the neighbor
          list of more than one input pore

        * 'not_intersection' : Only neighbors that appear in the neighbor
          list of exactly one input pore

    Returns
    -------
    neighborPs : 1D array (if flatten is True) or ndarray of ndarrays (if
    flatten if False).  An empty integer array is returned when flatten is
    True and none of the input pores has any neighbor.

    Examples
    --------
    >>> pn = OpenPNM.Network.TestNet()
    >>> pn.find_neighbor_pores(pnums=[0,2])
    array([ 1,  3,  5,  7, 25, 27])
    >>> pn.find_neighbor_pores(pnums=[0,1],excl_self=True)
    array([ 2,  5,  6, 25, 26])
    >>> pn.find_neighbor_pores(pnums=[0,1])
    array([ 0,  1,  2,  5,  6, 25, 26])
    >>> pn.find_neighbor_pores(pnums=[0,2],flatten=False)
    array([array([ 1,  5, 25]), array([ 1,  3,  7, 27])], dtype=object)
    >>> pn.find_neighbor_pores(pnums=[0,2],mode='intersection')
    array([1])
    >>> pn.find_neighbor_pores(pnums=[0,2],mode='not_intersection')
    array([ 3,  5,  7, 25, 27])
    """
    # Index the lil rows with a plain 1-D array.  The former spelling
    # ``rows[[pnums]]`` relied on NumPy treating a list-wrapped index as
    # ``rows[pnums]``; newer NumPy reads it as a 2-D index instead.
    locations = sp.array(pnums, ndmin=1)
    try:
        neighborPs = self.adjacency_matrix['lil']['connections'].rows[locations]
    except Exception:
        # Any failure is taken to mean that the matrix is missing or stale:
        # rebuild it once and retry.
        self._logger.info('Creating adjacency matrix, please wait')
        self.create_adjacency_matrix()
        neighborPs = self.adjacency_matrix['lil']['connections'].rows[locations]
    if flatten:
        # All the empty lists must be removed to maintain data type after hstack
        non_empty = [sp.asarray(x) for x in neighborPs if x]
        if not non_empty:
            # hstack cannot concatenate an empty list.
            return sp.array([], ndmin=1, dtype=int)
        neighborPs = sp.hstack(non_empty)
        # Remove duplicates according to the requested mode
        if mode == 'not_intersection':
            neighborPs = sp.unique(sp.where(sp.bincount(neighborPs)==1)[0])
        elif mode == 'union':
            neighborPs = sp.unique(neighborPs)
        elif mode == 'intersection':
            neighborPs = sp.unique(sp.where(sp.bincount(neighborPs)>1)[0])
        if excl_self:
            neighborPs = neighborPs[~sp.in1d(neighborPs,pnums)]
    else:
        # Convert every per-pore list into its own array.
        for i in range(len(neighborPs)):
            neighborPs[i] = sp.array(neighborPs[i])
    return sp.array(neighborPs,ndmin=1)
def _find_neighbors(self, pores, element, mode, flatten, excl_self):
    r"""
    Private method for finding the neighboring pores or throats connected
    directly to given set of pores.

    Parameters
    ----------
    pores : array_like
        The list of pores whose neighbors are sought

    element : string, either 'pore' or 'throat'
        Whether to find neighboring pores or throats

    mode : string
        Controls how the neighbors are filtered.  Options are:

        **'union'** : All neighbors of the input pores

        **'intersection'** : Only neighbors that are listed more than once

        **'not_intersection'** : Only neighbors that are listed exactly once

        When element is 'pore' the input pores themselves are added to the
        list before counting.

    flatten : boolean
        If flatten is True a 1D array of unique neighbors is returned.  If
        flatten is False the returned array contains arrays of neighboring
        ID numbers for each input pore, in the order they were sent.

    excl_self : bool
        When True the input pores are not included in the returned list of
        neighboring pores.  This is ignored when the element is 'throat'
        and when flatten is False.

    Returns
    -------
    A 1D integer array when flatten is True (empty if no pores were given).
    When flatten is False: a 2D array if every pore has the same number of
    neighbors, otherwise a 1D object array holding one array per pore.

    See Also
    --------
    find_neighbor_pores
    find_neighbor_throats
    num_neighors

    """
    element = self._parse_element(element=element, single=True)
    pores = self._parse_locations(pores)
    if sp.size(pores) == 0:
        return sp.array([], ndmin=1, dtype=int)
    # Index the lil rows with a plain 1-D array.  The former spelling
    # ``rows[[pores]]`` relied on NumPy treating a list-wrapped index as
    # ``rows[pores]``; newer NumPy reads it as a 2-D index instead.
    locations = sp.array(pores, ndmin=1)
    # Use the cached lil matrix; any failure is taken to mean that it is
    # missing or stale, in which case it is rebuilt once and retried.
    if element == 'pore':
        try:
            neighbors = self._adjacency_matrix['lil'].rows[locations]
        except Exception:
            temp = self.create_adjacency_matrix(sprsfmt='lil')
            self._adjacency_matrix['lil'] = temp
            neighbors = self._adjacency_matrix['lil'].rows[locations]
    elif element == 'throat':
        try:
            neighbors = self._incidence_matrix['lil'].rows[locations]
        except Exception:
            temp = self.create_incidence_matrix(sprsfmt='lil')
            self._incidence_matrix['lil'] = temp
            neighbors = self._incidence_matrix['lil'].rows[locations]
    if flatten:
        # Convert rows of lil into single flat list
        neighbors = itertools.chain.from_iterable(neighbors)
        if element == 'pore':
            # Add input pores to list
            neighbors = itertools.chain.from_iterable([neighbors, pores])
        # Convert list to numpy array
        neighbors = sp.fromiter(neighbors, dtype=int)
        if mode == 'not_intersection':
            neighbors = sp.unique(sp.where(sp.bincount(neighbors) == 1)[0])
        elif mode == 'union':
            neighbors = sp.unique(neighbors)
        elif mode == 'intersection':
            neighbors = sp.unique(sp.where(sp.bincount(neighbors) > 1)[0])
        if excl_self and element == 'pore':
            # Remove input pores from list
            neighbors = neighbors[~sp.in1d(neighbors, pores)]
        return sp.array(neighbors, ndmin=1, dtype=int)
    # Convert lists in array to numpy arrays
    per_pore = [sp.array(neighbors[i]) for i in range(0, len(pores))]
    if len(set(len(row) for row in per_pore)) > 1:
        # Rows of different length cannot be stacked; newer NumPy refuses
        # to build the array implicitly, so fill an object array by hand.
        ragged = sp.empty(len(per_pore), dtype=object)
        for i, row in enumerate(per_pore):
            ragged[i] = row
        return ragged
    return sp.array(per_pore, ndmin=1)
def gen_anc_afs_plot(ancestry = 'AFR', plot_prefix = '/Users/bjv/Dropbox/Cloud_folder/tmp/1kg_AFS_all', outfile_prefix='/Users/bjv/Dropbox/Cloud_folder/tmp/1kg_AFS_all', data_filter=1, chunk_size = 10000):
    """
    Write and plot the allele frequency spectrum (AFS) of every ancestry
    found in the 1000 genomes HDF5 file under ``kg_dir``.

    For each ancestry the per-SNP allele counts of chromosomes 1-22 are
    folded (counts above the number of individuals are replaced by
    ``2 * n - count``), written to '<outfile_prefix>_<ancestry>.txt' and
    plotted as a histogram of the counts between 1 and 29 to
    '<plot_prefix>_<ancestry>.png'.

    Parameters
    ----------
    ancestry : unused.  The function has always looped over every ancestry
        in the file and overwritten this argument; it is kept so existing
        callers keep working.
    plot_prefix : path prefix of the PNG files.
    outfile_prefix : path prefix of the text files.
    data_filter : fraction of SNPs to keep; values below 1 keep a random
        subset of each chunk.
    chunk_size : number of SNPs read from the HDF5 file at a time.
    """
    h5f = h5py.File('%s1k_genomes_hg.hdf5'%kg_dir)
    try:
        # Read the per-individual ancestry labels once; they do not change
        # between iterations.
        indiv_ancestries = h5f['indivs']['ancestry'][...]
        for anc in sp.unique(indiv_ancestries):
            anc_filter = indiv_ancestries==anc
            num_indivs = sp.sum(anc_filter)
            print("%d individuals with %s ancestry are used"%(num_indivs,anc))

            sids_list = []
            acs = []
            for chrom in range(1,23):
                print('Working on chromosome %d'%chrom)
                chr_str = 'chr%d'%chrom
                num_snps = len(h5f[chr_str]['calldata']['snps'])
                assert num_snps==len(h5f[chr_str]['variants']['ID']), 'WTF?'
                for start_i in range(0,num_snps,chunk_size):
                    snps = sp.array(h5f[chr_str]['calldata']['snps'][start_i:start_i+chunk_size],dtype='int8')
                    sids = h5f[chr_str]['variants']['ID'][start_i:start_i+chunk_size]
                    snps = snps[:,anc_filter]
                    if data_filter<1:
                        rand_filt = sp.random.random(len(snps))
                        rand_filt = rand_filt<data_filter
                        snps = snps[rand_filt]
                        sids = sids[rand_filt]
                    (m,n) = snps.shape
                    ac = sp.sum(snps,1)
                    # Fold the spectrum: counts above n are mirrored.
                    flip_filter = ac>n
                    ac[flip_filter]=2*n-ac[flip_filter]
                    acs.extend(ac)
                    sids_list.extend(sids)
                    if start_i%1000000==0:
                        print('Parsed %d SNPs'%start_i)
                print('%d SNPs loaded and filtered'%num_snps)
            print('%d ACs and SIDs found'%len(acs))

            print('Storing the AFS')
            with open('%s_%s.txt'%(outfile_prefix,anc),'w') as f:
                f.write('# %d individuals used\n'%num_indivs)
                f.write('SID    AC\n')
                for sid, ac in zip(sids_list,acs):
                    f.write('%s    %d\n'%(sid,ac))

            print('Plot things')
            acs = sp.array(acs,dtype='int')
            # Plotting filter: only counts strictly between 0 and 30.
            acs = acs[acs>0]
            acs = acs[acs<30]
            min_ac = acs.min()
            max_ac = acs.max()
            plt.clf()
            plt.hist(acs, bins=sp.arange(min_ac-0.5, max_ac + 1.5, 1))
            plt.title('%s AFS'%anc)
            plt.savefig('%s_%s.png'%(plot_prefix,anc))
    finally:
        # Release the HDF5 handle even when reading or plotting fails.
        h5f.close()
def find_neighbor_pores(self, pores, mode='union', flatten=True,
                        excl_self=True):
    r"""
    Returns a list of pores neighboring the given pore(s)

    Parameters
    ----------
    pores : array_like
        ID numbers of pores whose neighbors are sought.
    flatten : boolean, optional
        If flatten is True a 1D array of unique pore ID numbers is
        returned. If flatten is False the returned array contains arrays
        of neighboring pores for each input pore, in the order they were
        sent.
    excl_self : bool, optional (Default is True)
        If this is True then the input pores are not included in the
        returned list.  This option only applies when input pores are in
        fact neighbors to each other, otherwise they are not part of the
        returned list anyway.  Only used when flatten is True.
    mode : string, optional
        Specifies which neighbors should be returned.  The options are:

        **'union'** : All neighbors of the input pores

        **'intersection'** : Only pores that occur more than once in the
        combined list of neighbors and input pores, i.e. neighbors shared
        by at least two input pores

        **'not_intersection'** : Only pores that occur exactly once in the
        combined list, i.e. neighbors not shared between input pores

    Returns
    -------
    neighborPs : 1D array (if flatten is True) or ndarray of ndarrays (if
    flatten if False)

    Examples
    --------
    >>> import OpenPNM
    >>> pn = OpenPNM.Network.TestNet()
    >>> pn.find_neighbor_pores(pores=[0, 2])
    array([ 1,  3,  5,  7, 25, 27])
    >>> pn.find_neighbor_pores(pores=[0, 1])
    array([ 2,  5,  6, 25, 26])
    >>> pn.find_neighbor_pores(pores=[0, 1], mode='union', excl_self=False)
    array([ 0,  1,  2,  5,  6, 25, 26])
    >>> pn.find_neighbor_pores(pores=[0, 2], flatten=False)
    array([array([ 1,  5, 25]), array([ 1,  3,  7, 27])], dtype=object)
    >>> pn.find_neighbor_pores(pores=[0, 2], mode='intersection')
    array([1])
    >>> pn.find_neighbor_pores(pores=[0, 2], mode='not_intersection')
    array([ 3,  5,  7, 25, 27])
    """
    pores = self._parse_locations(pores)
    allowed_modes = ['union', 'intersection', 'not_intersection']
    mode = self._parse_mode(mode, allowed=allowed_modes, single=True)
    if sp.size(pores) == 0:
        return sp.array([], ndmin=1, dtype=int)
    # Use the cached lil adjacency matrix; (re)build it when the lookup
    # fails for any reason (matrix missing, not yet generated, stale...).
    # `rows` is indexed with the index array itself: wrapping it in a list
    # (`rows[[pores]]`) is read as a 2-D fancy index by current NumPy.
    try:
        neighborPs = self._adjacency_matrix['lil'].rows[pores]
    except Exception:
        temp = self.create_adjacency_matrix(sprsfmt='lil')
        self._adjacency_matrix['lil'] = temp
        neighborPs = self._adjacency_matrix['lil'].rows[pores]
    if flatten:
        # Chain all neighbor lists together and append the input pores so
        # that they take part in the occurrence counting below.
        flat = itertools.chain(itertools.chain.from_iterable(neighborPs),
                               pores)
        neighborPs = sp.fromiter(flat, dtype=int)
        # Apply logic to include/exclude items of the set
        if mode == 'not_intersection':
            neighborPs = sp.where(sp.bincount(neighborPs) == 1)[0]
        elif mode == 'union':
            neighborPs = sp.unique(neighborPs)
        elif mode == 'intersection':
            neighborPs = sp.where(sp.bincount(neighborPs) > 1)[0]
        if excl_self:
            neighborPs = neighborPs[~sp.in1d(neighborPs, pores)]
        return sp.array(neighborPs, ndmin=1, dtype=int)
    else:
        # Convert lists in array to numpy arrays
        neighborPs = [sp.array(neighborPs[i]) for i in range(0, len(pores))]
        return sp.array(neighborPs, ndmin=1)
def load_eigenstrat_genotypes(in_file_prefix='eigenstrat_file_prefix',
                              out_file_prefix='hdf5_file_prefix',
                              impute_type='mode',
                              filter_monomorphic_snps=True,
                              missing_val_thr=0.1):
    """
    Parses eigenstrat formated genotype files to a HDF5 format.

    It requires the h5py and scipy package.

    Ideally the genotypes are imputed apriory, otherwise a rough imputation
    (the most common genotype) is used for missing genotypes.

    Notes:
        Assumes the files are in diploid format!

    Parameters
    ----------
    in_file_prefix : str
        Prefix of the input files; ``<prefix>ind``, ``<prefix>snp`` and
        ``<prefix>geno`` are read.
    out_file_prefix : str
        Prefix of the output file, which is named
        ``<out_file_prefix>_mv<missing_val_thr>_imp_<impute_type>.h5py``
        and is overwritten when it already exists.
    impute_type : str
        Only ``'mode'`` is supported; any other value raises ``Exception``
        on the first SNP that passes the missing-value filter.
    filter_monomorphic_snps : bool
        Currently ignored: monomorphic SNPs are always removed.
        NOTE(review): presumably kept unconditional because the normalised
        genotypes divide by the per-SNP standard deviation - confirm
        before honouring the flag.
    missing_val_thr : float
        A SNP is dropped when its number of missing genotypes (code 9)
        exceeds ``missing_val_thr * 2 * num_indiv``.
        NOTE(review): the factor 2 makes the effective threshold twice the
        stated fraction of individuals - confirm this is intended.

    Each chromosome is stored in a group ``chrom_<n>`` holding the datasets
    ``raw_snps``, ``snps`` (normalised), ``positions``, ``nts``,
    ``nt_counts``, ``missing_counts``, ``freqs`` and ``snp_ids``.
    """
    import h5py
    import scipy as sp
    import os
    import sys

    def _new_chrom_data():
        """Return empty per-chromosome containers and removal counters."""
        return {'snps_mat': [], 'positions': [], 'sids': [], 'nts_list': [],
                'nt_counts_list': [], 'missing_counts': [], 'freqs': [],
                'num_missing_removed': 0, 'num_monomorphic_removed': 0,
                'num_duplicated_snps_removed': 0}

    def _report_and_store(hdf5_file, group, data):
        """Print the chromosome statistics and write `data` into `group`."""
        print('Number of SNPs removed due to too many missing values: %d' % data['num_missing_removed'])
        print('Number of duplicated SNPs removed: %d' % data['num_duplicated_snps_removed'])
        print('Number of monomorphic SNPs removed: %d' % data['num_monomorphic_removed'])
        print('Number of SNPs retained: %d' % len(data['positions']))
        snps = sp.array(data['snps_mat'], dtype='int8')
        group.create_dataset('raw_snps', compression='lzf', data=snps)
        hdf5_file.flush()
        print('Raw SNPs stored')
        # Normalise every SNP to zero mean and unit variance.
        snps = snps.T
        snps = (snps - sp.mean(snps, 0)) / sp.std(snps, 0)
        group.create_dataset('snps', compression='lzf', data=snps.T)
        hdf5_file.flush()
        print('Normalized SNPs stored')
        # Release the large genotype matrices before writing the rest.
        del snps
        data['snps_mat'] = None
        group.create_dataset('positions', compression='lzf', data=data['positions'])
        group.create_dataset('nts', compression='lzf', data=data['nts_list'])
        group.create_dataset('nt_counts', compression='lzf', data=sp.array(data['nt_counts_list']))
        group.create_dataset('missing_counts', compression='lzf', data=data['missing_counts'])
        group.create_dataset('freqs', compression='lzf', data=data['freqs'])
        group.create_dataset('snp_ids', compression='lzf', data=data['sids'])
        hdf5_file.flush()
        sys.stdout.flush()

    data_file_prefix = '%s_mv%0.2f_imp_%s.' % (out_file_prefix, missing_val_thr, impute_type)

    # Setting the HDF5 file up
    h5py_file_name = data_file_prefix + 'h5py'
    if os.path.isfile(h5py_file_name):
        print('Overwriting: %s' % h5py_file_name)
        os.remove(h5py_file_name)
    h5py_file = h5py.File(h5py_file_name)
    # try/finally so the HDF5 file is closed even if parsing fails.
    try:
        # Fill out individuals data, if available
        i_filename = '%sind' % (in_file_prefix)
        if os.path.isfile(i_filename):
            iids = []
            phens = []
            genders = []
            with open(i_filename) as f:
                for line in f:
                    l = (line.strip()).split()
                    iids.append(l[0])
                    genders.append(l[1])
                    phens.append(l[2])
            ind_group = h5py_file.create_group('indivs')
            ind_group.create_dataset('indiv_ids', data=iids)
            ind_group.create_dataset('sex', data=genders)
            ind_group.create_dataset('phenotype', data=phens)
        else:
            print('Individual information file not found: %s' % i_filename)

        tot_num_snps = 0
        tot_num_duplicated_snps_removed = 0
        tot_num_missing_val_snps_removed = 0
        tot_num_monomorphic_snps_removed = 0
        # Defined up front so the summary below works for empty input too.
        num_indiv = 0

        s_filename = '%ssnp' % (in_file_prefix)
        g_filename = '%sgeno' % (in_file_prefix)
        print('Starting to parse files:\n\t %s \n\t %s' % (s_filename, g_filename))

        # Setting up containers.
        curr_chrom = 1
        curr_hdf5_group = h5py_file.create_group('chrom_%d' % curr_chrom)
        data = _new_chrom_data()

        print('Starting to parse SNP files')
        with open(s_filename) as sf, open(g_filename) as gf:
            # The .snp and .geno files are read in lockstep, line by line.
            for s_line in sf:
                g_line = next(gf)
                sl = s_line.split()
                pos = int(sl[3])
                chrom = int(sl[1])
                sid = sl[0]
                if chrom != curr_chrom:
                    # Report statistics and store stuff
                    print('Finished with Chromosome %d' % curr_chrom)
                    _report_and_store(h5py_file, curr_hdf5_group, data)
                    # Reset containers
                    curr_chrom = chrom
                    curr_hdf5_group = h5py_file.create_group('chrom_%d' % curr_chrom)
                    data = _new_chrom_data()

                nt = (sl[4], sl[5])
                # One digit per individual: 0, 1, 2 or 9 (missing).
                snp = sp.array([int(c) for c in g_line.strip()], dtype='int8')
                num_indiv = len(snp)
                bin_counts = sp.bincount(snp)
                # The bin for code 9 only exists when a value is missing;
                # the last bin is otherwise the count of a real genotype.
                missing_count = bin_counts[9] if len(bin_counts) > 9 else 0

                # Filtering SNPs with too many missing values
                if missing_count > missing_val_thr * 2 * num_indiv:
                    data['num_missing_removed'] += 1
                    tot_num_missing_val_snps_removed += 1
                    continue

                nt_counts = list(bin_counts[:3])
                # Imputing the SNPs roughly by replacing missing values with
                # the mode value.
                if impute_type == 'mode':
                    v = sp.argmax(nt_counts)
                    snp[snp == 9] = v
                else:
                    raise Exception('Imputation type is unknown')

                bin_counts = sp.bincount(snp)
                nt_counts = list(bin_counts[:3])
                # Removing monomorphic SNPs
                if max(nt_counts) == sum(nt_counts):
                    data['num_monomorphic_removed'] += 1
                    tot_num_monomorphic_snps_removed += 1
                    continue
                if len(nt_counts) == 2:
                    nt_counts.append(0)

                # Is this position already there?
                if len(data['positions']) > 0 and pos == data['positions'][-1]:
                    data['num_duplicated_snps_removed'] += 1
                    tot_num_duplicated_snps_removed += 1
                    continue

                freq = sp.mean(snp) / 2.0
                data['snps_mat'].append(snp)
                data['positions'].append(pos)
                data['sids'].append(sid)
                data['nts_list'].append(nt)
                data['nt_counts_list'].append(nt_counts)
                data['missing_counts'].append(missing_count)
                data['freqs'].append(freq)
                tot_num_snps += 1

        # Report statistics and store stuff for the last chromosome
        _report_and_store(h5py_file, curr_hdf5_group, data)

        print('Genotypes for %d individuals were parsed.' % num_indiv)
        print('Total number of SNPs parsed successfully was: %d' % tot_num_snps)
        print('Total number of SNPs removed due to too many missing values: %d' % tot_num_missing_val_snps_removed)
        print('Total number of SNPs removed due to monomorphicity: %d' % tot_num_monomorphic_snps_removed)
        print('Total number of duplicated SNPs removed: %d' % tot_num_duplicated_snps_removed)
    finally:
        h5py_file.close()
    sys.stdout.flush()
    print('Done parsing genotypes.')
def parse_plink_tped_file(file_prefix, imputation_type='simple', return_kinship=False):
    """
    Requires a .tped file in 12 format.

    - Converts (on-the-fly) to a integer format.
    - Imputes missing data.

    Genotypes are coded 0 (both alleles 1), 1 (heterozygous) and
    2 (both alleles 2); a genotype with a 0 allele is treated as missing.

    Parameters
    ----------
    file_prefix : str
        Path prefix; ``<prefix>.tped`` and ``<prefix>.tfam`` are read, and
        ``<prefix>.mibs`` as well when `return_kinship` is True.
    imputation_type : str
        ``'simple'`` replaces missing genotypes by the rounded mean of the
        called genotypes of that SNP, ``'simple2'`` by the most common
        called genotype.  Any other value leaves the missing code (3) in
        place.
    return_kinship : bool
        When True the IBS matrix is loaded and returned too.

    Returns
    -------
    sd : SNPsDataSet
    K : ndarray
        Only returned (as ``(sd, K)``) when `return_kinship` is True.

    Parsed results are cached next to the inputs as ``*.pickled`` files and
    reused on later calls.
    NOTE(review): the caches are read with pickle, which executes arbitrary
    code from the file - only use this on directories you trust.
    """
    tped_filename = file_prefix + '.tped'
    tped_pickled_filename = tped_filename + '.imputed.pickled'
    tfam_filename = file_prefix + '.tfam'
    tfam_pickled_filename = tfam_filename + '.pickled'

    if os.path.isfile(tfam_pickled_filename):
        print('Loading pickled tfam file')
        # Binary mode matches how the cache is written below.
        with open(tfam_pickled_filename, 'rb') as f:
            individs, sex_list = cPickle.load(f)
        print('Pickled tfam file was loaded.')
    else:
        individs = []
        sex_list = []
        with open(tfam_filename) as f:
            for line in f:
                l = line.split()
                individs.append(l[1])
                sex_list.append(int(l[4]))
        with open(tfam_pickled_filename, 'wb') as f:
            cPickle.dump((individs, sex_list), f, protocol=2)
    num_individs = len(individs)

    if os.path.isfile(tped_pickled_filename):
        print('Loading pickled tped file')
        with open(tped_pickled_filename, 'rb') as f:
            chrom_pos_snp_dict = cPickle.load(f)
        print('Pickled tped file was loaded.')
    else:
        chrom_pos_snp_dict = {}
        with open(tped_filename) as f:
            for line_i, line in enumerate(f):
                if line_i % 1000 == 0:
                    print(line_i)
                l = line.split()
                chrom = int(l[0])
                # Create the containers only once per chromosome so that a
                # chromosome re-appearing later does not wipe its data.
                if chrom not in chrom_pos_snp_dict:
                    chrom_pos_snp_dict[chrom] = {'positions': [], 'snps': []}
                chrom_pos_snp_dict[chrom]['positions'].append(int(l[3]))

                snp = sp.zeros(num_individs, dtype='int8')
                w_missing = False
                # Columns 4.. hold two allele codes per individual.
                for j, i in enumerate(range(4, 2 * num_individs + 4, 2)):
                    nt1 = int(l[i])
                    nt2 = int(l[i + 1])
                    if nt1 == 0 or nt2 == 0:
                        snp[j] = 3  # missing
                        w_missing = True
                    elif nt1 == 2 and nt2 == 2:
                        snp[j] = 2
                    elif nt1 != 1 or nt2 != 1:
                        snp[j] = 1

                bin_counts = sp.bincount(snp)
                if w_missing:
                    if imputation_type == 'simple':
                        num_called = bin_counts[0] + bin_counts[1] + bin_counts[2]
                        if num_called > 0:
                            # float() forces true division: integer division
                            # would floor the mean before it is rounded.
                            mean = float(bin_counts[1] + 2 * bin_counts[2]) / num_called
                        else:
                            # Every genotype is missing; fall back to 0.
                            mean = 0.0
                        snp[snp == 3] = round(mean)
                    if imputation_type == 'simple2':
                        snp[snp == 3] = sp.argmax(bin_counts[:-1])

                chrom_pos_snp_dict[chrom]['snps'].append(snp)
        with open(tped_pickled_filename, 'wb') as f:
            cPickle.dump(chrom_pos_snp_dict, f, protocol=2)

    chromosomes = sorted(chrom_pos_snp_dict.keys())
    snpsds = []
    for chrom in chromosomes:
        snps = chrom_pos_snp_dict[chrom]['snps']
        positions = chrom_pos_snp_dict[chrom]['positions']
        snpsds.append(SNPsData(snps, positions, accessions=individs, chromosome=chrom))
    sd = SNPsDataSet(snpsds, chromosomes, data_format='diploid_int')
    print('SNPsDataSet constructed!')

    if return_kinship:
        print('Loading the kinship matrix')
        ibs_filename = file_prefix + '.mibs'
        ibs_pickled_filename = ibs_filename + '.pickled'
        if os.path.isfile(ibs_pickled_filename):
            print('Loading pickled IBS kinship file')
            with open(ibs_pickled_filename, 'rb') as f:
                l = cPickle.load(f)
            K = l[0]
            print('Pickled IBS kinship was loaded.')
        else:
            print('Loading K...')
            K = sp.zeros((num_individs, num_individs), dtype='double')
            with open(ibs_filename) as f:
                for i, line in enumerate(f):
                    K[i] = [float(x) for x in line.split()]
            with open(ibs_pickled_filename, 'wb') as f:
                cPickle.dump([K, individs], f, protocol=2)
            print('K was loaded.')
        return sd, K
    return sd
def parse_single_12tped_to_hdf5(in_file_prefix='/home/bv25/data/Ls154/Ls154_12',
                                out_file_prefix='/home/bv25/data/Ls154/Ls154_12',
                                impute_type='mode', filter_monomorphic_snps=True,
                                missing_val_thr=0.1):
    """
    Parses plink 12 formatted tped file and stores it in a HDF5 file.

    It requires the h5py and scipy package.

    Ideally the genotypes are imputed apriory, otherwise a rough imputation
    (the most common genotype) is used for missing genotypes.

    Notes:
        Assumes the files are in diploid format!

    Parameters
    ----------
    in_file_prefix : str
        Prefix of the inputs ``<prefix>.tped``, ``<prefix>.bim`` and
        ``<prefix>.tfam``.
    out_file_prefix : str
        The output is written to ``<out_file_prefix>.hdf5``.
    impute_type : str
        Only ``'mode'`` is supported; any other value raises ``Exception``
        on the first SNP with missing genotypes that passes the filter.
    filter_monomorphic_snps : bool
        When True, SNPs where all individuals share one genotype are
        dropped.
    missing_val_thr : float
        A SNP is dropped when its number of missing genotypes exceeds
        ``missing_val_thr * 2 * num_indiv``.
        NOTE(review): the factor 2 makes the effective threshold twice the
        stated fraction of individuals - confirm this is intended.

    The file gets an ``indiv_data`` group (``indiv_ids``, ``sex``,
    ``phenotypes``), a ``genot_data`` group with one ``chrom_<label>``
    sub-group per chromosome, and a top-level ``num_snps`` dataset.
    """

    def _new_chrom_data():
        """Return empty per-chromosome containers and removal counters."""
        return {'snps_mat': [], 'positions': [], 'sids': [], 'nts_list': [],
                'nt_counts_list': [], 'missing_counts': [], 'freqs': [],
                'num_missing_removed': 0, 'num_monomorphic_removed': 0,
                'num_ambiguous_loc_removed': 0}

    def _report_and_store(parent_group, chrom_label, data, n_indiv):
        """Print the chromosome statistics and write `data` to a new group."""
        print('Number of SNPs removed due to too many missing values: %d' % data['num_missing_removed'])
        print('Number of SNPs removed due to ambiguous location: %d' % data['num_ambiguous_loc_removed'])
        print('Number of monomorphic SNPs removed: %d' % data['num_monomorphic_removed'])
        print('Number of SNPs retained: %d' % len(data['positions']))
        print('Number of individuals: %d' % n_indiv)
        snps = sp.array(data['snps_mat'], dtype='int8')
        group = parent_group.create_group('chrom_%s' % chrom_label)
        group.create_dataset('raw_snps', compression='lzf', data=snps)
        group.create_dataset('positions', compression='lzf', data=data['positions'])
        group.create_dataset('nts', compression='lzf', data=data['nts_list'])
        group.create_dataset('nt_counts', compression='lzf', data=data['nt_counts_list'])
        group.create_dataset('missing_counts', compression='lzf', data=data['missing_counts'])
        group.create_dataset('freqs', compression='lzf', data=data['freqs'])
        group.create_dataset('snp_ids', compression='lzf', data=data['sids'])

    print('Starting to parse genotypes')
    h5py_file = h5py.File(out_file_prefix + '.hdf5')
    # try/finally so the HDF5 file is closed even if parsing fails.
    try:
        genot_group = h5py_file.create_group('genot_data')
        indiv_group = h5py_file.create_group('indiv_data')

        tot_num_snps = 0
        tot_num_missing_val_snps_removed = 0
        tot_num_ambiguous_loc_removed = 0
        curr_chrom = 1
        print('Working on chromosome %d' % curr_chrom)

        g_filename = '%s.tped' % (in_file_prefix)
        s_filename = '%s.bim' % (in_file_prefix)
        i_filename = '%s.tfam' % (in_file_prefix)

        indiv_ids = []
        phenotypes = []
        sex = []
        print('Parsing individuals file: %s' % i_filename)
        with open(i_filename) as f:
            for line in f:
                l = line.split()
                indiv_ids.append(l[0])
                sex.append(int(l[4]))
                phenotypes.append(float(l[5]))

        print('Storing individual data in individ. group')
        indiv_group.create_dataset('indiv_ids', data=indiv_ids)
        indiv_group.create_dataset('sex', data=sex)
        indiv_group.create_dataset('phenotypes', data=phenotypes)

        num_indiv = len(indiv_ids)
        print('Found %d Individuals' % (num_indiv))

        print('Parsing nucleotide map')
        nt_map = {}
        chromsomoes = []
        curr_chrom = 0
        with open(s_filename) as f:
            for line in f:
                l = line.split()
                chrom = l[0]
                if chrom != curr_chrom:
                    chromsomoes.append(chrom)
                    curr_chrom = chrom
                nt_map[l[1]] = (l[4], l[5])
        assert len(chromsomoes) == len(set(chromsomoes)), 'Chromosomes need to be in order.'
        curr_chrom = chromsomoes[0]

        position = -1
        # Initializing containers.
        data = _new_chrom_data()

        # Masks selecting the first and the second allele of every
        # individual; they are the same for every line, so build them once.
        allele_index = sp.arange(num_indiv * 2)
        even_map = allele_index % 2 == 0
        odd_map = allele_index % 2 == 1

        t0 = time.time()
        print('Starting to parse SNP files')
        with open(g_filename) as gf:
            for g_line in gf:
                gl = g_line.split()
                chrom = gl[0]
                if chrom != curr_chrom:
                    # Store everything and reset.
                    _report_and_store(genot_group, curr_chrom, data, num_indiv)
                    tot_num_snps += len(data['positions'])
                    tot_num_missing_val_snps_removed += data['num_missing_removed']
                    tot_num_ambiguous_loc_removed += data['num_ambiguous_loc_removed']
                    h5py_file.flush()
                    t = time.time() - t0
                    print('It took %d minutes and %0.2f seconds to parse Chromosome %s.' % (t / 60, t % 60, curr_chrom))
                    t0 = time.time()
                    # Reset containers
                    data = _new_chrom_data()
                    curr_chrom = chrom

                sid = gl[1]
                prev_position = position
                position = int(gl[3])

                # Skipping unmappable locations
                if position == prev_position:
                    data['num_ambiguous_loc_removed'] += 1
                    continue
                if position == 0:
                    data['num_ambiguous_loc_removed'] += 1
                    continue

                nt = nt_map[sid]

                snp0 = sp.array([int(x) for x in gl[4:]], 'int8')
                # Alleles are coded 1/2, so the sum minus 2 gives 0, 1 or 2;
                # a negative result means at least one allele is missing.
                # NOTE(review): a (0, 2) pair sums to 0 and is therefore
                # read as genotype 0 rather than missing - verify that the
                # input never contains half-missing genotypes.
                snp = snp0[even_map] + snp0[odd_map] - 2
                snp[snp < 0] = 9

                bin_counts = sp.bincount(snp)
                if len(bin_counts) > 3:
                    # More than three bins means the missing code 9 occurs.
                    missing_count = bin_counts[-1]
                    # Filtering SNPs with too many missing values
                    if missing_count > missing_val_thr * 2 * num_indiv:
                        data['num_missing_removed'] += 1
                        continue
                    elif impute_type == 'mode':
                        nt_counts = bin_counts[:3]
                        v = sp.argmax(nt_counts)
                        snp[snp == 9] = v
                        bin_counts = sp.bincount(snp)
                    else:
                        raise Exception('Imputation type is unknown')
                else:
                    missing_count = 0

                assert len(bin_counts) < 4, 'Issues with nucleotides.'
                # Pad the genotype counts to exactly three entries.
                nt_counts = bin_counts[:3]
                if len(nt_counts) == 2:
                    nt_counts = sp.array([nt_counts[0], nt_counts[1], 0])
                elif len(nt_counts) == 1:
                    nt_counts = sp.array([nt_counts[0], 0, 0])

                # Removing monomorphic SNPs
                if filter_monomorphic_snps:
                    if max(nt_counts) == sum(nt_counts):
                        data['num_monomorphic_removed'] += 1
                        continue

                freq = sp.mean(snp) / 2.0
                data['snps_mat'].append(snp)
                data['positions'].append(position)
                data['sids'].append(sid)
                data['nts_list'].append(nt)
                data['nt_counts_list'].append(nt_counts)
                data['missing_counts'].append(missing_count)
                data['freqs'].append(freq)

        # Store the last chromosome.  curr_chrom equals the chromosome of
        # the last line read and is also defined when the tped is empty.
        _report_and_store(genot_group, curr_chrom, data, num_indiv)
        tot_num_snps += len(data['positions'])
        tot_num_missing_val_snps_removed += data['num_missing_removed']
        tot_num_ambiguous_loc_removed += data['num_ambiguous_loc_removed']
        h5py_file.create_dataset('num_snps', data=sp.array(tot_num_snps))
        h5py_file.flush()
        t = time.time() - t0
        print('It took %d minutes and %0.2f seconds to parse chromosome %s.' % (t / 60, t % 60, curr_chrom))

        print('Total number of SNPs parsed successfully was: %d' % tot_num_snps)
        print('Total number of SNPs removed due to too many missing values: %d' % tot_num_missing_val_snps_removed)
        print('Total number of SNPs removed due to ambiguous locations: %d' % tot_num_ambiguous_loc_removed)
    finally:
        h5py_file.close()
    print('Done parsing genotypes.')
def find_neighbor_throats(self, pores, mode='union', flatten=True):
    r"""
    Returns a list of throats neighboring the given pore(s)

    Parameters
    ----------
    pores : array_like
        Indices of pores whose neighbors are sought.  A boolean mask is
        converted to indices first.
    flatten : boolean, optional
        If flatten is True (default) a 1D array of unique throat ID numbers
        is returned. If flatten is False the returned array contains arrays
        of neighboring throat ID numbers for each input pore, in the order
        they were sent.
    mode : string, optional
        Specifies which neighbors should be returned.  The options are:

        * 'union' : All neighbors of the input pores

        * 'intersection' : Only throats that occur more than once in the
          combined neighbor list, i.e. shared by at least two input pores

        * 'not_intersection' : Only throats that occur exactly once in the
          combined neighbor list

        Any other value leaves the combined list unfiltered.

    Returns
    -------
    neighborTs : 1D array (if flatten is True) or ndarray of arrays (if
    flatten if False).  An empty integer array is returned when no pores
    are given or none of them has a neighboring throat.

    Examples
    --------
    >>> import OpenPNM
    >>> pn = OpenPNM.Network.TestNet()
    >>> pn.find_neighbor_throats(pores=[0, 1])
    array([0, 1, 2, 3, 4, 5])
    >>> pn.find_neighbor_throats(pores=[0, 1],flatten=False)
    array([array([0, 1, 2]), array([0, 3, 4, 5])], dtype=object)
    """
    pores = sp.array(pores, ndmin=1)
    if pores.dtype == bool:
        pores = self.toindices(pores)
    if sp.size(pores) == 0:
        return sp.array([], ndmin=1, dtype=int)
    # Use the cached lil incidence matrix; (re)build it when the lookup
    # fails for any reason (matrix missing, not yet generated, stale...).
    # `rows` is indexed with the index array itself: wrapping it in a list
    # (`rows[[pores]]`) is read as a 2-D fancy index by current NumPy.
    try:
        neighborTs = self._incidence_matrix['lil'].rows[pores]
    except Exception:
        temp = self.create_incidence_matrix(sprsfmt='lil')
        self._incidence_matrix['lil'] = temp
        neighborTs = self._incidence_matrix['lil'].rows[pores]
    # Empty rows are dropped so that hstack keeps an integer dtype.
    non_empty = [sp.asarray(x) for x in neighborTs if x]
    if not non_empty:
        # Integer dtype, consistent with the empty result returned above.
        return sp.array([], ndmin=1, dtype=int)
    if flatten:
        neighborTs = sp.hstack(non_empty)
        # Keep throats according to how often they occur in the list
        if mode == 'not_intersection':
            neighborTs = sp.where(sp.bincount(neighborTs) == 1)[0]
        elif mode == 'union':
            neighborTs = sp.unique(neighborTs)
        elif mode == 'intersection':
            neighborTs = sp.where(sp.bincount(neighborTs) > 1)[0]
    else:
        # Convert each per-pore list into its own array
        for i in range(0, sp.size(pores)):
            neighborTs[i] = sp.array(neighborTs[i])
    return sp.array(neighborTs, ndmin=1)
def find_neighbor_throats(self, pores, mode='union', flatten=True):
    r"""
    Returns a list of throats neighboring the given pore(s)

    Parameters
    ----------
    pores : array_like
        Indices of pores whose neighbors are sought
    flatten : boolean, optional
        If flatten is True (default) a 1D array of unique throat ID numbers
        is returned. If flatten is False the returned array contains arrays
        of neighboring throat ID numbers for each input pore, in the order
        they were sent.
    mode : string, optional
        Specifies which neighbors should be returned.  The options are:

        **'union'** : All neighbors of the input pores

        **'intersection'** : Only throats that occur more than once in the
        combined neighbor list, i.e. shared by at least two input pores

        **'not_intersection'** : Only throats that occur exactly once in
        the combined neighbor list

        Any other value leaves the combined list unfiltered.

    Returns
    -------
    neighborTs : 1D array (if flatten is True) or ndarray of arrays (if
    flatten if False)

    Examples
    --------
    >>> import OpenPNM
    >>> pn = OpenPNM.Network.TestNet()
    >>> pn.find_neighbor_throats(pores=[0, 1])
    array([0, 1, 2, 3, 4, 5])
    >>> pn.find_neighbor_throats(pores=[0, 1],flatten=False)
    array([array([0, 1, 2]), array([0, 3, 4, 5])], dtype=object)
    """
    pores = self._parse_locations(pores)
    if sp.size(pores) == 0:
        return sp.array([], ndmin=1, dtype=int)
    # Use the cached lil incidence matrix; (re)build it when the lookup
    # fails for any reason (matrix missing, not yet generated, stale...).
    # `rows` is indexed with the index array itself: wrapping it in a list
    # (`rows[[pores]]`) is read as a 2-D fancy index by current NumPy.
    try:
        neighborTs = self._incidence_matrix['lil'].rows[pores]
    except Exception:
        temp = self.create_incidence_matrix(sprsfmt='lil')
        self._incidence_matrix['lil'] = temp
        neighborTs = self._incidence_matrix['lil'].rows[pores]
    if flatten:
        # Chain the per-pore lists into one flat integer array
        flat = itertools.chain.from_iterable(neighborTs)
        neighborTs = sp.fromiter(flat, dtype=int)
        # Keep throats according to how often they occur in the list
        if mode == 'not_intersection':
            neighborTs = sp.where(sp.bincount(neighborTs) == 1)[0]
        elif mode == 'union':
            neighborTs = sp.unique(neighborTs)
        elif mode == 'intersection':
            neighborTs = sp.where(sp.bincount(neighborTs) > 1)[0]
        return sp.array(neighborTs, ndmin=1, dtype=int)
    else:
        # Convert lists in array to numpy arrays
        neighborTs = [sp.array(neighborTs[i]) for i in range(0, len(pores))]
        return sp.array(neighborTs, ndmin=1)
import numpy as np
import scipy as sy

# Monte Carlo estimate of how often a sample of N_DRAWS fair draws from
# {1, 2} contains exactly TARGET_ONES ones.
N_TRIALS = 1000
N_DRAWS = 33
TARGET_ONES = 18

lista2 = []  # every simulated sample, one list of N_DRAWS values per trial
c = 0  # number of trials that contained exactly TARGET_ONES ones
for z in range(N_TRIALS):
    # randint's upper bound is exclusive, so this draws 1 or 2.
    lista = np.random.randint(1, 3, size=N_DRAWS).tolist()
    lista2.append(lista)
    if lista.count(1) == TARGET_ONES:
        c += 1
print(c)
print(float(c) / N_TRIALS)  # fraction of trials matching the observation
def find_neighbor_pores(self, pores, mode='union', flatten=True,
                        excl_self=True):
    r"""
    Returns a list of pores neighboring the given pore(s)

    Parameters
    ----------
    pores : array_like
        ID numbers of pores whose neighbors are sought.  A boolean mask is
        converted to indices first.
    flatten : boolean, optional
        If flatten is True a 1D array of unique pore ID numbers is
        returned. If flatten is False the returned array contains arrays
        of neighboring pores for each input pore, in the order they were
        sent.
    excl_self : bool, optional (Default is True)
        If this is True then the input pores are not included in the
        returned list.  This option only applies when input pores are in
        fact neighbors to each other, otherwise they are not part of the
        returned list anyway.  Only used when flatten is True.
    mode : string, optional
        Specifies which neighbors should be returned.  The options are:

        * 'union' : All neighbors of the input pores

        * 'intersection' : Only pores that occur more than once in the
          combined list of neighbors and input pores, i.e. neighbors
          shared by at least two input pores

        * 'not_intersection' : Only pores that occur exactly once in the
          combined list, i.e. neighbors not shared between input pores

        Any other value leaves the combined list unfiltered.

    Returns
    -------
    neighborPs : 1D array (if flatten is True) or ndarray of ndarrays (if
    flatten if False).  An empty integer array is returned when no pores
    are given or none of them has a neighbor.

    Examples
    --------
    >>> import OpenPNM
    >>> pn = OpenPNM.Network.TestNet()
    >>> pn.find_neighbor_pores(pores=[0, 2])
    array([ 1,  3,  5,  7, 25, 27])
    >>> pn.find_neighbor_pores(pores=[0, 1])
    array([ 2,  5,  6, 25, 26])
    >>> pn.find_neighbor_pores(pores=[0, 1], mode='union', excl_self=False)
    array([ 0,  1,  2,  5,  6, 25, 26])
    >>> pn.find_neighbor_pores(pores=[0, 2],flatten=False)
    array([array([ 1,  5, 25]), array([ 1,  3,  7, 27])], dtype=object)
    >>> pn.find_neighbor_pores(pores=[0, 2],mode='intersection')
    array([1])
    >>> pn.find_neighbor_pores(pores=[0, 2],mode='not_intersection')
    array([ 3,  5,  7, 25, 27])
    """
    pores = sp.array(pores, ndmin=1)
    if pores.dtype == bool:
        pores = self.toindices(pores)
    if sp.size(pores) == 0:
        return sp.array([], ndmin=1, dtype=int)
    # Use the cached lil adjacency matrix; (re)build it when the lookup
    # fails for any reason (matrix missing, not yet generated, stale...).
    # `rows` is indexed with the index array itself: wrapping it in a list
    # (`rows[[pores]]`) is read as a 2-D fancy index by current NumPy.
    try:
        neighborPs = self._adjacency_matrix['lil'].rows[pores]
    except Exception:
        temp = self.create_adjacency_matrix(sprsfmt='lil')
        self._adjacency_matrix['lil'] = temp
        neighborPs = self._adjacency_matrix['lil'].rows[pores]
    # Empty rows are dropped so that hstack keeps an integer dtype.
    non_empty = [sp.asarray(x) for x in neighborPs if x]
    if not non_empty:
        # Integer dtype, consistent with the empty result returned above.
        return sp.array([], ndmin=1, dtype=int)
    if flatten:
        # The input pores are appended so that they take part in the
        # occurrence counting below.
        neighborPs = sp.concatenate((sp.hstack(non_empty), pores))
        # Keep pores according to how often they occur in the list
        if mode == 'not_intersection':
            neighborPs = sp.array(sp.where(sp.bincount(neighborPs) == 1)[0],
                                  dtype=int)
        elif mode == 'union':
            neighborPs = sp.array(sp.unique(neighborPs), int)
        elif mode == 'intersection':
            neighborPs = sp.array(sp.where(sp.bincount(neighborPs) > 1)[0],
                                  dtype=int)
        if excl_self:
            neighborPs = neighborPs[~sp.in1d(neighborPs, pores)]
    else:
        # Convert each per-pore list into its own integer array
        for i in range(0, sp.size(pores)):
            neighborPs[i] = sp.array(neighborPs[i], dtype=int)
    return sp.array(neighborPs, ndmin=1)
def itemfreq(a):
    """
    Tabulate the distinct values of `a`.

    Returns a 2D array with one row per distinct value (in sorted order)
    and three columns: the index of the value's first occurrence in the
    flattened input, the value itself, and the number of times it occurs.
    """
    values, first_seen, inverse = sp.unique(
        a, return_index=True, return_inverse=True)
    # `inverse` maps every element to the position of its value in
    # `values`, so counting those positions yields the frequencies.
    counts = sp.bincount(inverse)
    table = sp.array([first_seen, values, counts])
    return table.T