def conditional_entropy(x, y): """Return H(Y|X). Parameters ---------- x: numpy.ndarray of float values y: numpy.ndarray of integer values Returns ------- float Conditional entropy value """ # discretize X hx, bx = histogram(x, bins=x.size / 10, density=True) Py = compute_distribution(y) Px = compute_distribution(digitize(x, bx)) res = 0 for ey in set(y): # P(X | Y) x1 = x[y == ey] condPxy = compute_distribution(digitize(x1, bx)) for k in condPxy: v = condPxy[k] res += (v * Py[ey] * (log2(Px[k]) - log2(v * Py[ey]))) return res
def conditional_entropy(x, y): """ x: vettore di numeri reali y: vettore di interi calcola H(Y|X) """ # discretizzazione di X hx, bx = histogram(x, bins=x.size / 10, density=True) Py = compute_distribution(y) Px = compute_distribution(digitize(x, bx)) res = 0 for ey in set(y): # P(X | Y) x1 = x[y == ey] condPxy = compute_distribution(digitize(x1, bx)) for k, v in condPxy.iteritems(): res += (v * Py[ey] * (log2(Px[k]) - log2(v * Py[ey]))) en_x = entropy(digitize(x, bx)) return res, en_x
def getBeamFluxSpline(beam, plasma, t, lim1, lim2, points=1000):
    """ generates a spline off of the beampath.  Assumes
    that the change in flux is MONOTONIC"""

    lim = beam.norm.s
    beam.norm.s = scipy.linspace(0, lim[-1], points)
    h = time.time()
    psi = plasma.eq.rz2rmid(beam.r()[0], beam.r()[2], t)  # evaluates all psi's at once
    print(time.time() - h)
    outspline = len(t) * [0]
    inspline = len(t) * [0]
    for i in range(t.size):
        temp = lim1
        mask = scipy.logical_and(scipy.isfinite(psi[i]), psi[i] < lim2 + .02)
        try:
            minpos = scipy.argmin(psi[i][mask])
            test = psi[i][mask][minpos]
        except ValueError:
            test = lim2 + .03
        # plt.plot(beam.x()[0][mask],psi[i][mask])
        # plt.show()
        sizer = psi[i][mask].size
        if not test > lim2:
            # plt.plot(beam.x()[0][mask][0:minpos],psi[i][mask][0:minpos],beam.x()[0][mask][minpos:],psi[i][mask][minpos:])
            # plt.show()
            # limout = scipy.insert(lim,(2,2),(beam.norm.s[mask][minpos],beam.norm.s[mask][minpos]))  # add minimum flux s for bound testing
            if lim1 < test:
                temp = test

            try:
                temp1 = scipy.clip(
                    scipy.digitize((lim1, lim2), psi[i][mask][minpos::-1]),
                    0, minpos)
                outspline[i] = beam.norm.s[mask][minpos::-1][temp1]
            except ValueError:
                tempmask = (psi[i][mask] < lim2)[0]
                outspline[i] = scipy.array(
                    [beam.norm.s[mask][minpos], beam.norm.s[mask][tempmask]])

            try:
                temp2 = scipy.clip(
                    scipy.digitize((lim1, lim2), psi[i][mask][minpos:]),
                    0, sizer - minpos - 1)
                inspline[i] = beam.norm.s[mask][minpos:][temp2]
            except ValueError:
                inspline[i] = scipy.array(
                    [beam.norm.s[mask][minpos], beam.norm.s[mask][-1]])

        else:
            outspline[i] = scipy.array([[], []])
            inspline[i] = scipy.array([[], []])

    return (outspline, inspline)
def trend(tree, signal, shot):
    temp = MDS.Tree(tree, signal)
    xt = temp.getNode(signal).dim_of().data()
    x = temp.getNode(signal).dim_of().data()
    yt, y = globalpowerCalc(shot)
    a = scipy.digitize(xt, yt)
    return x, y[a]
def globalpowerCalc(shot):
    Tree = MDSplus.Tree('spectroscopy', shot)
    output = None
    temp2 = 2 * scipy.pi * (.68) * Tree.getNode(
        '\SPECTROSCOPY::TOP.BOLOMETER.RESULTS.DIODE.BPLY.BRIGHT').data()  # does factor have the 4pi?
    temp2t = Tree.getNode(
        '\SPECTROSCOPY::TOP.BOLOMETER.RESULTS.DIODE.BPLY.BRIGHT').dim_of().data()

    for i in scipy.arange(20) + 2:
        string = str(i)
        if i < 10:
            string = '0' + string
        # try:
        temp = Tree.getNode(
            '\SPECTROSCOPY::TOP.BOLOMETER.RESULTS.DIODE.BPLY.AREA:CHORD_' + string).data()
        tempt = Tree.getNode(
            '\SPECTROSCOPY::TOP.BOLOMETER.RESULTS.DIODE.BPLY.AREA:CHORD_' + string).dim_of().data()
        a = scipy.digitize(tempt, temp2t)
        if output is None:
            output = temp * temp2[i][a]
        else:
            output = output + temp * temp2[i][a]
        # except ValueError:
        #     print('no')

    return (tempt, output)
def assess_calibration(self):
    """Assess if PredPol is calibrated by conditioning on predicted
    intensity and checking the correlation between number of crimes
    and demographics.

    Returns:
        a 2D array where the first dimension is the number of days in the
        test set and the second dimension is the number of bins for the
        range of predicted intensities, as computed by
        `sp.histogram_bin_edges`. The entry in the ith row and jth column
        is the Pearson correlation coefficient between race and actual
        number of crimes in the jth bin of predicted intensity for the
        ith day.
    """
    black = self.pred_obj.grid_cells.black
    not_nan = sp.logical_not(sp.isnan(black.values))
    bins = sp.histogram_bin_edges(self.get_predicted_intensities(), bins='auto')
    correlations = sp.empty((len(self.lambda_columns), len(bins)))
    correlations[:] = sp.nan
    for i, (lambda_col, actual_col) in self._iterator():
        idx_bins = sp.digitize(self.results[lambda_col], bins)
        for j in range(len(bins)):
            idx_selected = sp.logical_and(idx_bins == j, not_nan)
            if sp.sum(idx_selected) > 2:
                actual = self.results.loc[idx_selected, actual_col]
                demographics = black.loc[idx_selected]
                correlations[i, j] = sp.stats.pearsonr(actual, demographics)[0]
    return correlations
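The calibration check above follows a common digitize pattern: bin a continuous prediction into intervals, then compute a statistic inside each bin. A self-contained sketch of that pattern with plain NumPy/SciPy on synthetic data; every variable name here is illustrative and not part of the original class.

import numpy as np
from scipy.stats import pearsonr

rng = np.random.default_rng(0)
predicted = rng.gamma(2.0, 1.0, size=1000)    # stand-in for predicted intensities
actual = rng.poisson(predicted)               # stand-in for observed counts
covariate = rng.normal(size=1000)             # stand-in for the demographic column

edges = np.histogram_bin_edges(predicted, bins='auto')
idx = np.digitize(predicted, edges)
for j in np.unique(idx):
    sel = idx == j
    if sel.sum() > 2:
        r, _ = pearsonr(actual[sel], covariate[sel])
        print(j, round(r, 3))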
def observe_mask(X, Y, m, n, fov=5, slit_size=None, ngals=40, width=1):
    """
    can either give the slit size, or the number of objects. If the number
    of objects is given, the slit size parameter is ignored, and instead
    the slit size is calculated to have that number of objects in one
    observation.
    """
    ngal = len(X)
    j = scipy.arange(ngal, dtype=int)
    dist = scipy.hypot(X, Y)
    # the BCG is always the first element (bcg = 0)
    bcg = j[dist == 0][0]
    # rotate the coordinates to the mask direction
    theta = scipy.arctan(m)
    Xrot = X * scipy.cos(theta) - Y * scipy.sin(theta)
    Yrot = X * scipy.sin(theta) + Y * scipy.cos(theta)
    pylab.plot(Xrot, Yrot, 'k.', mew=2)
    # bin the galaxies in Xrot
    in_fov = j[(abs(Xrot) < fov[0] / 2.) & (abs(Yrot) < fov[1] / 2.)]
    Xrot = Xrot[in_fov]
    Yrot = Yrot[in_fov]
    n_in_fov = len(in_fov)
    j = scipy.arange(n_in_fov, dtype=int)
    if ngals:
        slit_size = fov[0] / float(ngals)
        Xbins = scipy.linspace(-fov[0] / 2., fov[0] / 2., ngals + 1)
    else:
        Xbins = scipy.arange(-fov[0] / 2., fov[0] / 2., slit_size / 60.)
        Xbins += slit_size / 2.
    Xbinned = scipy.digitize(Xrot, Xbins)
    # observe one galaxy per Xbin with a Gaussian probability with width
    # defined below (in arcmin), and excluding the bin containing the
    # BCG, which is necessarily observed (we always do!).
    # This width is such that I preferentially observe galaxies near the
    # center of the image.
    observed = [j[Xbinned == i][scipy.argmin(abs(Yrot[Xbinned == i] -
                                                 random.normal(0, width)))]
                for i in xrange(len(Xbins) - 1)
                if (len(Yrot[Xbinned == i]) > 0) & (i != Xbinned[0])]
    observed = scipy.append(bcg, observed)
    # plot (just once, to show -- and make sure!)
    #for x in Xbins:
        #pylab.axvline(x, ls='-', color='0.7')
    #pylab.plot(0, 0, 'o', ms=8, mfc='orange', mec='orange')
    #pylab.plot(Xrot[observed], Yrot[observed], 'rx', ms=6, mew=2)
    ## field of view
    #pylab.plot([-fov[0]/2., -fov[0]/2.], [-fov[1]/2., fov[1]/2.], 'b-', lw=2)
    #pylab.plot([fov[0]/2., fov[0]/2.], [-fov[1]/2., fov[1]/2.], 'b-', lw=2)
    #pylab.plot([-fov[0]/2., fov[0]/2.], [-fov[1]/2., -fov[1]/2.], 'b-', lw=2)
    #pylab.plot([-fov[0]/2., fov[0]/2.], [fov[1]/2., fov[1]/2.], 'b-', lw=2)
    #pylab.xlabel('rotated x (arcmin)')
    #pylab.ylabel('rotated y (arcmin)')
    #pylab.xlim(-6, 6)
    #pylab.ylim(-6, 6)
    #output = 'plots/mask1_sample.png'
    #pylab.savefig(output, format=output[-3:])
    #pylab.close()
    #print 'Saved to', output
    #exit()
    return in_fov[observed]
def share_slices(counts):
    cumcounts = scipy.cumsum(counts)
    cedges = scipy.linspace(0, cumcounts[-1] + 1, ncuts + 1)
    cutnumber = scipy.digitize(cumcounts, cedges) - 1
    assert (cutnumber >= 0).all() and (cutnumber < ncuts).all()
    return [scipy.flatnonzero(cutnumber == icut) for icut in range(ncuts)]
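share_slices splits items into ncuts contiguous chunks of roughly equal total weight by digitizing the cumulative counts against evenly spaced edges; ncuts comes from an enclosing scope in the original. A standalone NumPy version of the same idea, with ncuts passed explicitly as an illustration:

import numpy as np

def share_slices_np(counts, ncuts):
    # Assign each item to one of `ncuts` contiguous chunks so that the
    # summed counts per chunk are roughly balanced.
    cumcounts = np.cumsum(counts)
    cedges = np.linspace(0, cumcounts[-1] + 1, ncuts + 1)
    cutnumber = np.digitize(cumcounts, cedges) - 1
    return [np.flatnonzero(cutnumber == icut) for icut in range(ncuts)]

# share_slices_np([5, 1, 1, 1, 5, 5, 1, 1], 3) -> three index groups with similar totals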
def ROI_nominal(self):
    """
    Set the ROIs nominally from file, for each slot separately.
    """
    for iS in range(self.num_slots):
        self.raw_ROI[:, iS] = sp.digitize(self.data[:, iS],
                                          self.ROI_bins[:, iS]) - 1
        self.corr_ROI[:, iS] = self.raw_ROI[:, iS]
def estimate_position_from_quadratures(eta, phix, N_phi=30, N_x=101):
    phi = phix[:, 0]
    x = phix[:, 1] / sqrt(eta)
    phi_edges = scipy.linspace(0, 2. * scipy.pi, N_phi)
    phi_centers = (phi_edges[:-1] + phi_edges[1:]) / 2.
    phi_idx = scipy.digitize(phi, phi_edges)
    xs = [x[phi_idx == n + 1] for n in range(len(phi_centers))]
    means = scipy.array([scipy.mean(x) for x in xs])
    stds = scipy.array([scipy.std(x) for x in xs])
    m = interp1d(phi_centers, means)
    return -m(pi), m(pi / 2.), stds.max()
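The function above groups quadrature samples by phase bin and interpolates the per-bin means. A self-contained toy version of that binned-mean step, using NumPy and scipy.interpolate on synthetic data; the signal and all names here are placeholders, not the original experiment's data.

import numpy as np
from scipy.interpolate import interp1d

rng = np.random.default_rng(1)
phi = rng.uniform(0, 2 * np.pi, 5000)
x = np.cos(phi) + 0.1 * rng.normal(size=phi.size)    # toy quadrature data

edges = np.linspace(0, 2 * np.pi, 30)
centers = (edges[:-1] + edges[1:]) / 2.
idx = np.digitize(phi, edges)
means = np.array([x[idx == n + 1].mean() for n in range(len(centers))])
m = interp1d(centers, means)
print(-m(np.pi), m(np.pi / 2.))                      # roughly 1 and 0 for this toy signal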
def conditional_entropy(x, y): """ x: vector de numeros reales y: vector de numeros enteros devuelve H(Y|X) """ # discretizacion de X hx, bx = histogram(x, bins=x.size / 10, density=True) Py = compute_distribution(y) Px = compute_distribution(digitize(x, bx)) res = 0 for ey in set(y): # P(X | Y) x1 = x[y == ey] condPxy = compute_distribution(digitize(x1, bx)) for k, v in condPxy.iteritems(): res += (v * Py[ey] * (log2(Px[k]) - log2(v * Py[ey]))) return res
def conditional_entropy(x, y): """ x: vector de numeros reales y: vector de numeros enteros devuelve H(Y|X) """ # discretizacion de X hx, bx = histogram(x, bins=x.size / 10, density=True) Py = compute_distribution(y) Px = compute_distribution(digitize(x, bx)) res = 0 for ey in set(y): # P(X | Y) x1 = x[y == ey] condPxy = compute_distribution(digitize(x1, bx)) for k, v in condPxy.iteritems(): res += v * Py[ey] * (log2(Px[k]) - log2(v * Py[ey])) return res
def phase_of_times(self, times, sampling_rate=1000.):
    """
    Give the phases of the oscillation at the specific 'times'.
    The underlying precision of phase sampling is given by 'sampling_rate'.

    Return 'nan' for timepoints outside the range where the oscillation
    phase is known (Oscillation.time_line).

    Note: an oscillation detected with a very small sampling rate compared
    to its frequency will have a drift in its reconstructed phase.
    It is advised to have an original sampling rate of at least 4 times
    the oscillation frequency.
    """
    if self.time_line.size > 1:
        old_dt = self.time_line[1] - self.time_line[0]
        x = numpy.arange(self.time_start, self.time_stop + old_dt, 1. / sampling_rate)
    else:
        x = self.time_line
    v = self.value_line

    # BAD
    #y = numpy.angle(v)
    #y = signal.resample( y, x.size)

    # bad 2
    #~ y = numpy.cos(numpy.angle(v))
    #~ y = signal.resample( y, x.size)
    #~ ind = numpy.diff(y)>0
    #~ ind = numpy.concatenate( (ind , [ind[-1]]))
    #~ y2 = numpy.arccos(y)
    #~ y2[ind] = -y2[ind]

    # ok
    # Before resampling, in order to avoid slow down due to the use of ifft in scipy.resample,
    # y is padded with 0 proportionally to the distance from x.size to the next 2**N.
    # QUESTION: does it lead to some strange edge effects???
    # (int() casts added: numpy.zeros and signal.resample expect integer sizes)
    N = numpy.ceil(numpy.log2(x.size))
    vv = numpy.r_[v, numpy.zeros(int(numpy.floor(v.size * (2**N - x.size) / x.size)))]
    vv = signal.resample(vv, int(2**N))
    v = vv[:x.size]
    #~ y = numpy.cos(numpy.angle(v))
    y2 = numpy.angle(v)

    d = digitize(times, x)
    d[d == len(v)] = 0  # points above the highest time value where the oscillation phase is known
    phases = y2[d]
    phases[d == 0] = nan  # all points outside the range where the oscillation is known
    return phases
def weight_radial(catalogue, rwidth=rwidth, redges=redges):
    self.logger.info('Radial integral constraint.')
    distance = catalogue.distance()
    dmin, dmax = distance.min(), distance.max()
    self.logger.info('Comoving distances: {:.1f} - {:.1f}.'.format(dmin, dmax))
    if redges is not None:
        radialedges = scipy.array(redges)
        rwidth = scipy.mean(scipy.diff(radialedges))
        rmin, rmax = radialedges.min(), radialedges.max()
        if (rmin > dmin) or (rmax < dmax):
            raise ValueError(
                'Provided radial-edges ({:.1f} - {:.1f}) do not encompass the full survey ({:.1f} - {:.1f}).'
                .format(rmin, rmax, dmin, dmax))
        self.logger.info(
            'Provided radial-edges of width: {:.1f} and range: {:.1f} - {:.1f}.'
            .format(rwidth, rmin, rmax))
        nbins = len(radialedges) - 1
    else:
        self.logger.info('Provided radial-width: {:.1f}.'.format(rwidth))
        nbins = scipy.rint((dmax - dmin) / rwidth).astype(int)
        radialedges = scipy.linspace(dmin, dmax + 1e-9, nbins + 1)
    self.logger.info(
        'There are {:d} radial-bins with an average of {:.1f} objects.'
        .format(nbins, len(catalogue) * 1. / nbins))
    ibin = scipy.digitize(distance, radialedges, right=False) - 1
    for iaddbin in range(catalogue.attrs['naddbins']):
        mask = catalogue['iaddbin'] == iaddbin
        wcounts = scipy.bincount(ibin[mask],
                                 weights=catalogue['Weight'][mask],
                                 minlength=nbins)
        catalogue['Weight'][mask] /= wcounts[ibin[mask]]
    attrs = {'radialedges': radialedges, 'nbins': nbins}

    def bin(catalogue):
        return scipy.digitize(catalogue.distance(), radialedges, right=False) - 1

    return attrs, bin
def setUpNextState(initial_state, server_prob, n_process=n_process):
    sum_initial_state = sum(initial_state)
    arrival_times = getRandomArrivalServiceTimes(n_process, arrival_rate, None)[0]
    time_start = arrival_times[sum_initial_state]
    # the next two lines avoid processing the whole queue:
    # only the processes that are needed are processed
    arrival_times = arrival_times[arrival_times <= (time_start + time_interval)]
    n_process = arrival_times.size

    initial_states = zip(initial_state,
                         [arrival_times[sum_initial_state]] * len(initial_state))
    server_address_table_forced = random.permutation(
        concatenate([ones(state) * i for i, state in enumerate(initial_state)]))
    server_address_table = concatenate([
        server_address_table_forced,
        digitize(uniform.rvs(size=n_process - sum_initial_state), cumsum(server_prob))
    ])

    server_arrival_times = [arrival_times[server_address_table == i]
                            for i in range(n_server)]
    server_service_times = [
        getRandomArrivalServiceTimes((server_address_table == i).sum(), None, service_rate[i])[1]
        for i in range(n_server)
    ]
    results = map(mm1, server_arrival_times, server_service_times, initial_states)
    final_state = [
        r['queue_size_by_time'](time_start + time_interval, max_no_people) if r else 0
        for r in results
    ]
    return tuple(final_state)
def constant_interval(self, time_points):
    lvt = self.length_vs_time()
    max_time = lvt[0][lvt[0].size - 1]
    bins = sp.arange(0, max_time, max_time / time_points)
    time_spots = sp.digitize(lvt[0], bins)
    ts = bins
    ys = sp.zeros(bins.size)
    for i in range(0, lvt[0].size):
        spot = time_spots[i]
        ys[spot - 1] = lvt[1][i]

    # Fill in zero values with previous length
    max_val = 0
    for i in range(0, ts.size - 1):
        if ys[i] > max_val:
            max_val = ys[i]
        else:
            ys[i] = max_val

    return sp.array([ts, ys])
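constant_interval resamples an irregular (time, length) series onto an even grid by digitizing the time stamps and filling the empty grid points from the running value. A compact NumPy sketch of the same resample-and-fill idea on toy data; the arrays and the choice of 10 grid points are purely illustrative.

import numpy as np

t = np.array([0.0, 0.7, 2.4, 3.1, 5.0])      # irregular sample times
length = np.array([1.0, 2.0, 2.5, 4.0, 4.5])

bins = np.arange(0, t[-1], t[-1] / 10)        # 10 evenly spaced time points
ys = np.zeros(bins.size)
ys[np.digitize(t, bins) - 1] = length         # last sample falling in a bin wins
ys = np.maximum.accumulate(ys)                # fill gaps with the running maximum
print(np.array([bins, ys]))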
def rebin(self, xnew):
    """
    Rebin the spectrum on a new grid named xnew
    """
    # Does not need equal spaced bins, but why would you not?
    xnew.sort()
    fbin = sp.zeros(xnew.size)
    efbin = sp.zeros(xnew.size)
    # up sampling is just interpolation
    m = (self.wv >= xnew[0]) * (self.wv <= xnew[-1])
    if self.wv[m].size <= xnew.size - 1:
        fbin, efbin = self.interp(xnew)
    else:
        # down sampling--
        # 1) define bins so that xnew is at the center.
        # 2) interpolate to account for fractional pixel weights
        # 3) take the mean within each bin
        db = 0.5 * sp.diff(xnew)
        b2 = xnew[1::] - db
        b2 = sp.insert(b2, 0, xnew[0])
        insert = sp.searchsorted(self.wv, b2)
        xinsert = sp.insert(self.wv, insert, xnew)
        xinsert = sp.unique(xinsert)
        yinsert, zinsert = self.interp(xinsert)

        i = sp.digitize(xinsert, b2)
        for j in range(b2.size):
            iuse = sp.where(i == j + 1)[0]
            fbin[j] = sp.mean(yinsert[iuse])
            efbin[j] = sp.mean(zinsert[iuse])

    self._wv = xnew
    if self.ef is not None:
        self._ef = efbin
    self.f = fbin
    assert self.wv.size == self.f.size
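The down-sampling branch above is a standard digitize-then-average rebinning. A standalone sketch of that core step for a toy spectrum, without the class machinery; bin edges are taken halfway between the new grid points, as in the method above.

import numpy as np

wv = np.linspace(4000., 5000., 200)           # old wavelength grid
flux = np.sin(wv / 50.)                       # toy spectrum
xnew = np.linspace(4000., 5000., 20)          # coarser target grid

edges = np.insert(xnew[1:] - 0.5 * np.diff(xnew), 0, xnew[0])
idx = np.digitize(wv, edges)
fbin = np.array([flux[idx == j + 1].mean() for j in range(edges.size)])
print(fbin.size)                               # one value per new grid point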
def run(self, nbins=25):
    r"""
    Computes the pore size function of the image.

    This method calculates the distance transform of the void space, then
    computes a histogram of the occurrences of each distance value.

    Parameters
    ----------
    nbins : int
        The number of bins into which the distance values should be
        sorted. The default is 25.
    """
    temp_img = spim.distance_transform_edt(self.image)
    dvals = temp_img[self.image].flatten()
    rmax = sp.amax(dvals)
    bins = sp.linspace(1, rmax, nbins)
    binned = sp.digitize(x=dvals, bins=bins)
    vals = namedtuple('PoreSizeFunction', ('distance', 'frequency'))
    vals.distance = bins
    vals.frequency = sp.bincount(binned, minlength=nbins)[1:]
    return vals
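The same distance-transform-plus-histogram idea can be tried on a synthetic binary image with SciPy's ndimage; this is only an illustrative sketch on random data, not the original class's API.

import numpy as np
from scipy import ndimage as spim

rng = np.random.default_rng(2)
image = rng.random((200, 200)) < 0.7           # True marks the "void" phase in this toy image
dt = spim.distance_transform_edt(image)
dvals = dt[image].flatten()

bins = np.linspace(1, dvals.max(), 25)
binned = np.digitize(dvals, bins)
frequency = np.bincount(binned, minlength=25)[1:]
print(bins.size, frequency.size)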
def get_freq_modes_over_f(power_mat, window_function, frequency, n_modes, plots=False): """Fines the most correlated frequency modes and fits thier noise.""" n_f = len(frequency) d_f = sp.mean(sp.diff(frequency)) dt = 1.0 / 2.0 / frequency[-1] n_chan = power_mat.shape[-1] n_time = window_function.shape[0] # The threshold for assuming there isn't enough data to measure anything. no_data_thres = 10.0 / n_time # Initialize the dictionary that will hold all the parameters. output_params = {} # First take the low frequency part of the spetrum matrix and average over # enough bins to get a well conditioned matrix. low_f_mat = sp.mean(power_mat[: 4 * n_chan, :, :].real, 0) # Factor the matrix to get the most correlated modes. e, v = linalg.eigh(low_f_mat) # Make sure they are sorted. if not sp.alltrue(sp.diff(e) >= 0): raise RuntimeError("Eigenvalues not sorted") # Power matrix striped of the biggest modes. reduced_power = sp.copy(power_mat) mode_list = [] # Solve for the spectra of these modes. for ii in range(n_modes): this_mode_params = {} # Get power spectrum and window function for this mode. mode = v[:, -1 - ii] mode_power = sp.sum(mode * power_mat.real, -1) mode_power = sp.sum(mode * mode_power, -1) mode_window = sp.sum(mode[:, None] ** 2 * window_function, 1) mode_window = sp.sum(mode_window * mode[None, :] ** 2, 1) # Protect against no data. if sp.mean(mode_window).real < no_data_thres: this_mode_params["amplitude"] = 0.0 this_mode_params["index"] = 0.0 this_mode_params["f_0"] = 1.0 this_mode_params["thermal"] = T_infinity ** 2 * dt else: # Fit the spectrum. p = fit_overf_const(mode_power, mode_window, frequency) # Put all the parameters we measured into the output. this_mode_params["amplitude"] = p[0] this_mode_params["index"] = p[1] this_mode_params["f_0"] = p[2] this_mode_params["thermal"] = p[3] this_mode_params["mode"] = mode output_params["over_f_mode_" + str(ii)] = this_mode_params # Remove the mode from the power matrix. tmp_amp = sp.sum(reduced_power * mode, -1) tmp_amp2 = sp.sum(reduced_power * mode[:, None], -2) tmp_amp3 = sp.sum(tmp_amp2 * mode, -1) reduced_power -= tmp_amp[:, :, None] * mode reduced_power -= tmp_amp2[:, None, :] * mode[:, None] reduced_power += tmp_amp3[:, None, None] * mode[:, None] * mode mode_list.append(mode) # Initialize the compensation matrix, that will be used to restore thermal # noise that gets subtracted out. See Jan 29, Feb 17th, 2012 of Kiyo's # notes. compensation = sp.eye(n_chan, dtype=float) for mode1 in mode_list: compensation.flat[:: n_chan + 1] -= 2 * mode1 ** 2 for mode2 in mode_list: mode_prod = mode1 * mode2 compensation += mode_prod[:, None] * mode_prod[None, :] # Now that we've striped the noisiest modes, measure the auto power # spectrum, averaged over channels. 
auto_spec_mean = reduced_power.view() auto_spec_mean.shape = (n_f, n_chan ** 2) auto_spec_mean = auto_spec_mean[:, :: n_chan + 1].real auto_spec_mean = sp.mean(auto_spec_mean, -1) diag_window = window_function.view() diag_window.shape = (n_time, n_chan ** 2) diag_window = diag_window[:, :: n_chan + 1] auto_spec_window = sp.mean(diag_window, -1) if sp.mean(auto_spec_window).real < no_data_thres: auto_cross_over = 0.0 auto_index = 0.0 auto_thermal = 0 else: auto_spec_params = fit_overf_const(auto_spec_mean, auto_spec_window, frequency) auto_thermal = auto_spec_params[3] if auto_spec_params[0] <= 0 or auto_spec_params[3] <= 0 or auto_spec_params[1] > -0.599: auto_cross_over = 0.0 auto_index = 0.0 else: auto_index = auto_spec_params[1] auto_cross_over = auto_spec_params[2] * (auto_spec_params[0] / auto_spec_params[3]) ** (-1.0 / auto_index) # if auto_cross_over < d_f: # auto_index = 0. # auto_cross_over = 0. # Plot the mean auto spectrum if desired. if plots: h = plt.gcf() a = h.add_subplot(*h.current_subplot) norm = sp.mean(auto_spec_window).real auto_plot = auto_spec_mean / norm plotable = auto_plot > 0 lines = a.loglog(frequency[plotable], auto_plot[plotable]) c = lines[-1].get_color() # And plot the fit in a light color. if auto_cross_over > d_f / 4.0: spec = npow.overf_power_spectrum(auto_thermal, auto_index, auto_cross_over, dt, n_time) else: spec = sp.zeros(n_time, dtype=float) spec += auto_thermal spec[0] = 0 spec = npow.convolve_power(spec, auto_spec_window) spec = npow.prune_power(spec) spec = spec[1:].real if norm > no_data_thres: spec /= norm plotable = spec > 0 a.loglog(frequency[plotable], spec[plotable], c=c, alpha=0.4, linestyle=":") output_params["all_channel_index"] = auto_index output_params["all_channel_corner_f"] = auto_cross_over # Finally measure the thermal part of the noise in each channel. cross_over_ind = sp.digitize([auto_cross_over * 4], frequency)[0] cross_over_ind = max(cross_over_ind, n_f // 2) cross_over_ind = min(cross_over_ind, int(9.0 * n_f / 10.0)) thermal = reduced_power[cross_over_ind:, :, :].real n_high_f = thermal.shape[0] thermal.shape = (n_high_f, n_chan ** 2) thermal = sp.mean(thermal[:, :: n_chan + 1], 0) thermal_norms = sp.mean(diag_window, 0).real bad_inds = thermal_norms < no_data_thres thermal_norms[bad_inds] = 1.0 # Compensate for power lost in mode subtraction. compensation[:, bad_inds] = 0 compensation[bad_inds, :] = 0 for ii in xrange(n_chan): if bad_inds[ii]: compensation[ii, ii] = 1.0 thermal = linalg.solve(compensation, thermal) # Normalize thermal /= thermal_norms thermal[bad_inds] = T_infinity ** 2 * dt # Occationally the compensation fails horribly on a few channels. # When this happens, zero out the offending indices. thermal[thermal < 0] = 0 output_params["thermal"] = thermal # Now that we know what thermal is, we can subtract it out of the modes we # already measured. for ii in range(n_modes): mode_params = output_params["over_f_mode_" + str(ii)] thermal_contribution = sp.sum(mode_params["mode"] ** 2 * thermal) # Subtract a maximum of 90% of the white noise to keep things positive # definate. new_white = max(mode_params["thermal"] - thermal_contribution, 0.1 * mode_params["thermal"]) if mode_params["thermal"] < 0.5 * T_infinity ** 2 * dt: mode_params["thermal"] = new_white return output_params
server_prob = array([0.25, 0.25, 0.5])
n_server = server_prob.size
n_process = 100
time_interval = 10
initial_state = (7, 2, 3)

arrival_times = getRandomArrivalServiceTimes(n_process, arrival_rate, None)[0]
sum_initial_state = sum(initial_state)

# preparing initial state for each mm1 simulation
initial_states = zip(initial_state, [arrival_times[sum_initial_state]] * len(initial_state))

# maps kth process to ith server
server_address_table_forced = random.permutation(
    concatenate([ones(state) * i for i, state in enumerate(initial_state)]))
print "forced server address table", server_address_table_forced
server_address_table = concatenate([
    server_address_table_forced,
    digitize(uniform.rvs(size = n_process - sum_initial_state), cumsum(server_prob))
])

server_arrival_times = [arrival_times[server_address_table == i] for i in range(n_server)]
server_service_times = [
    getRandomArrivalServiceTimes((server_address_table == i).sum(), None, service_rate[i])[1]
    for i in range(n_server)
]
results = map(mm1, server_arrival_times, server_service_times, initial_states)

print "Mean QueueSize(1)", array([mean(result['queue_size']) for result in results])
print "Results[0]['queue_size']", results[0]['queue_size']
print "Results[1]['queue_size']", results[1]['queue_size']
print "Results[2]['queue_size']", results[2]['queue_size']

time_start = arrival_times[sum_initial_state]  # I don't know why it shouldn't be sum_initial_state + 1 instead
print "queue_size_by_time", time_start, [r['queue_size_by_time'](time_start) for r in results]
print "queue_size_by_time", time_start + time_interval, [r['queue_size_by_time'](time_start + time_interval) for r in results]
def collapse_correlation_1d(corr, f_lags, a_lags, weights=None): r"""Takes a 2D correlation function and collapses to a 1D correlation function. Parameters ---------- corr: 2D array Covariance matrix in terms of frequency lag and angular lag. The first output from `rebin_corr_freq_lag` right now. f_lags: 1D array The frequency lags in terms of Hz. The third output from `rebin_corr_freq_lag` right now. a_lags: 1D array The angular lags in terms of degrees. weights: 2D array The weights of `corr`. The second output from `rebin_corr_freq_lag` right now. Returns ------- out_corr: 1D array The 1D autocorrelation. out_weights: The weights for `out_corr`. x_axis: tuple of 3 1D arrays `x_axis[1]` is the x - values that correspond to `out_corr`. `x_axis[0]` and `x_axis[2]` are the left and rightmost points covered by each lag bin. Notes ----- `a_lags` are not the same as the lags from the .ini file. The lags from the .ini file are the right side of each lag bin, but you want the centre of the bin when you plot. To get the right values, you must do: (ask Eric or Liviu) lags = sp.array(F.params['lags']) a_lags = copy.deepcopy(lags) a_lags[0] = 0 a_lags[1:] -= sp.diff(lags)/2.0 """ if corr.ndim != 2: msg = "Must start with a 2D correlation function." raise ValueError(msg) if len(f_lags) != corr.shape[0] or len(a_lags) != corr.shape[1]: msg = ("corr.shape must be (len(f_lags), len(a_lags)). Passed: " + repr(corr.shape) + " vs (" + repr(len(f_lags)) + ", " + repr(len(a_lags)) + ").") raise ValueError(msg) if weights is None: weights = sp.ones_like(corr) corr = corr * weights # Hard code conversion factors to MPc/h for now. a_fact = 34.0 # Mpc/h per degree at 800MHz. f_fact = 4.5 # Mpc/h per MHz at 800MHz. # Hard code lags in MPc/h. #nbins = 10 nbins = 15 lags = sp.empty(nbins) lags[0] = 2.0 lags[1] = 4.0 for bin_index in range(2, nbins): lags[bin_index] = 1.5 * lags[bin_index - 1] # Calculate the total 1D lags. separation = a_lags separation = (a_fact * separation[sp.newaxis, :])**2 separation = separation + (f_fact * f_lags[:, sp.newaxis] / 1.0e6)**2 separation = sp.sqrt(separation) # Initialize memory for outputs. out_corr = sp.zeros(nbins) out_weights = sp.zeros(nbins) # Rebin. for lag_index in range(separation.shape[0]): bin_inds = sp.digitize(separation[lag_index, :], lags) for bin_index in range(nbins): out_corr[bin_index] += sp.sum(corr[lag_index, bin_inds == bin_index]) out_weights[bin_index] += sp.sum(weights[lag_index, bin_inds == bin_index]) # Normalize. bad_inds = out_weights < 1.0e-20 out_weights[bad_inds] = 1.0 out_corr /= out_weights out_weights[bad_inds] = 0.0 # Make real lags to be returned. x_left = sp.empty(nbins) x_left[0] = 0 x_left[1:] = lags[:-1] x_right = lags x_centre = (x_right + x_left) / 2.0 return out_corr, out_weights, (x_left, x_centre, x_right)
def getBins(self, x, grid, domain):
    edges = scipy.r_[domain[0], (grid[1:] + grid[:-1]) / 2., domain[-1]]
    bins = scipy.digitize(x, edges) - 1
    return bins
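getBins assigns each sample to the grid cell whose centre is nearest: the edges lie halfway between grid points, with the domain limits closing the first and last cell. A quick standalone check of that behaviour with NumPy, using made-up grid and domain values:

import numpy as np

grid = np.array([1.0, 2.0, 3.0, 4.0])
domain = (0.0, 5.0)
edges = np.r_[domain[0], (grid[1:] + grid[:-1]) / 2., domain[-1]]
x = np.array([0.2, 1.6, 2.4, 4.9])
print(np.digitize(x, edges) - 1)   # -> [0 1 1 3], i.e. the nearest grid point per sample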
def rebin_corr_freq_lag(corr, freq1, freq2=None, weights=None, nfbins=20,
                        return_fbins=False):
    r"""Collapses frequency pair correlation function to frequency lag.

    Basically this constructs the 2D correlation function.

    Parameters
    ----------
    corr: 3D array
        Covariance matrix which is a function of frequency and frequency
        prime and angular lag.
    freq1, freq2: tuple of floats
        The REAL frequencies, i.e. 744000 Hz, not 0, 1, 2...
        freq2 is used if using a map at a different redshift, but we
        haven't looked at this yet.
    weights: 3D array
        The weights of the correlation. It is found in pair.counts right
        now.
    nfbins: int
        How many lag bins out you go in frequency. A higher number means
        a more accurate result at high lag.
    return_fbins: bool
        If `True`, `fbins` is returned.

    Returns
    -------
    out_corr: 2D array
        `corr` from before but now only in terms of frequency lag and
        angular lag.
    out_weights: 2D array
        `weights` from before but now in 2D. The weights for `out_corr`.
    fbins: 1D array
        The frequency lags in terms of Hz.
        Much like how `lags` in the rest of this module is angular lag
        in degrees.
    """
    if freq2 is None:
        freq2 = freq1
    # Default is equal weights.
    if weights is None:
        weights = sp.ones_like(corr)
    corr = corr * weights
    nf1 = corr.shape[0]
    nf2 = corr.shape[1]
    nlags = corr.shape[2]
    # Frequency bin size.
    delta_freq = min(abs(sp.diff(freq1)))
    # Frequency bin upper edges.
    fbins = (sp.arange(nfbins) + 0.5) * delta_freq
    # Allocate memory for outputs.
    out_corr = sp.zeros((nfbins, nlags))
    out_weights = sp.zeros((nfbins, nlags))
    # Loop over all frequency pairs and bin by lag.
    for freq1_index in range(nf1):
        for freq2_index in range(nf2):
            f_lag = abs(freq1[freq1_index] - freq2[freq2_index])
            bin_ind = sp.digitize([f_lag], fbins)[0]
            if bin_ind < nfbins:
                out_corr[bin_ind, :] += corr[freq1_index, freq2_index, :]
                out_weights[bin_ind, :] += weights[freq1_index, freq2_index, :]
    # Normalize dealing with 0 weight points explicitly.
    bad_inds = out_weights < 1.0e-20
    out_weights[bad_inds] = 1.0
    out_corr /= out_weights
    out_weights[bad_inds] = 0.0
    out_corr[bad_inds] = 0.0
    if return_fbins:
        return out_corr, out_weights, fbins - delta_freq * 0.5
    else:
        return out_corr, out_weights
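The inner loop above digitizes every |f1 - f2| lag against upper bin edges and accumulates weighted sums before normalizing. The same accumulate-then-normalize pattern in a few standalone NumPy lines, with toy sizes and no angular axis:

import numpy as np

freq = np.arange(700e6, 700e6 + 10 * 1e6, 1e6)        # 10 toy frequencies, 1 MHz apart
corr = np.random.default_rng(3).normal(size=(10, 10))

delta = np.min(np.abs(np.diff(freq)))
nfbins = 5
fbins = (np.arange(nfbins) + 0.5) * delta              # upper edges of the lag bins
out = np.zeros(nfbins)
counts = np.zeros(nfbins)
for i in range(10):
    for j in range(10):
        b = np.digitize([abs(freq[i] - freq[j])], fbins)[0]
        if b < nfbins:
            out[b] += corr[i, j]
            counts[b] += 1
out /= np.where(counts > 0, counts, 1)
print(out, counts)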
def gen_ld_plots(snps_hdf5_file=snp_file, max_dist=2000, min_maf=0, bin_size=10, fig_dir=results_dir, filter_pop=None, genes=filtered_genes): #Calculating LD just for chromosomal and chromids genes: gene_groups = pd.read_csv(genes) chrom_genes = gene_groups['Gene.group'].tolist() pop_map = parse_pop_map() xs = [] ys = [] #from itertools import izip h5f = h5py.File(snps_hdf5_file, mode="r") gene_groups = sorted(h5f.keys()) ld_dist_dict = {'all': {}, 'nonsyn': {}, 'syn': {}} distances = range(0, max_dist) for dist in distances: ld_dist_dict['all'][dist] = {'r2_sum': 0.0, 'snp_count': 0.0} #ld_dist_dict['nonsyn'][dist]={'r2_sum':0.0, 'snp_count':0.0} #ld_dist_dict['syn'][dist]={'r2_sum':0.0, 'snp_count':0.0} for i, gg in enumerate(chrom_genes): #for i, gg in enumerate(gene_groups): gg = gg[5::] print(gg) if gg in chrom_genes: print(gg) #gg = str(gg.encode('utf-8')) #print(type(gg)) if i % 100 == 0: print('%d: Gene %s' % (i, gg)) g = h5f[str(gg)] print(g) # Look at genes that have at least 10 SNPS if g['codon_snp_freqs'].size > 10: if filter_pop is not None: strains = g['strains'][...] indiv_filter = sp.zeros((len(strains)), dtype='bool8') for s_i, s in enumerate(strains): try: s = str(s, 'utf-8') if pop_map[s]['genospecies'] == filter_pop: indiv_filter[s_i] = True except: continue if sp.sum(indiv_filter) < 2: continue codon_snps = g['codon_snps'][...] print(codon_snps) codon_snps = codon_snps[:, indiv_filter] print(codon_snps.shape) norm_codon_snps = sp.transpose(codon_snps) freqs = sp.mean(norm_codon_snps, 0) norm_codon_snps = (norm_codon_snps - freqs) / sp.sqrt( freqs * (1 - freqs)) norm_codon_snps = sp.transpose(norm_codon_snps) mafs = sp.minimum(freqs, 1 - freqs) maf_filter = mafs > min_maf if sp.sum(maf_filter) > 1: all_norm_snps = norm_codon_snps all_positions = g['codon_snp_positions'][...] norm_snps = all_norm_snps[maf_filter] positions = all_positions[maf_filter] M, N = norm_snps.shape is_synonimous_snp = g['is_synonimous_snp'][...] is_nonsynonimous_snp = ~is_synonimous_snp syn_snp_filter = is_synonimous_snp * maf_filter nonsyn_snp_filter = is_nonsynonimous_snp * maf_filter if sp.sum(syn_snp_filter) > sp.sum(nonsyn_snp_filter): all_norm_snps = norm_codon_snps all_positions = g['codon_snp_positions'][...] norm_snps = all_norm_snps[maf_filter] positions = all_positions[maf_filter] M, N = norm_snps.shape ld_mat = sp.dot(norm_snps, norm_snps.T) / float(N) assert M == len(positions), 'A bug detected.' 
for i in range(M - 1): for j in range(i + 1, M): dist = positions[j] - positions[i] if dist < max_dist: ld_dist_dict['all'][dist][ 'r2_sum'] += ld_mat[i, j]**2 ld_dist_dict['all'][dist][ 'snp_count'] += 1.0 print(ld_dist_dict) pairs = 0 #for plot_type in ld_dist_dict.keys(): avg_r2s = [] plot_distances = [] for dist in distances: if ld_dist_dict['all'][dist]['snp_count'] >= 1: avg_r2 = ld_dist_dict['all'][dist]['r2_sum'] / float( ld_dist_dict['all'][dist]['snp_count']) pairs += 1 avg_r2s.append(avg_r2) plot_distances.append(dist) plot_distances = sp.array(plot_distances) avg_r2s = sp.array(avg_r2s) print(avg_r2s) bins = sp.arange(0, max(plot_distances), bin_size) digitize = sp.digitize(plot_distances, bins) for bin_i in range(len(bins)): bin_filter = digitize == (bin_i + 1) if len(plot_distances[bin_filter]) > 0: xs.append(sp.mean(plot_distances[bin_filter])) ys.append(sp.mean(avg_r2s[bin_filter])) # plt.plot(xs, ys, color='k', linestyle='None', marker='.', alpha=0.5) # plt.xlabel(r'Pairwise distance ($d$)') # plt.ylabel(r'Squared correlation ($r^2$)') # if filter_pop is not None: # plt.title('LD decay of 0.99 < ANI <= 01') # plt.savefig('%s/ld_%s_codons_nuc_0.99_1_gsA_chromosome_maf_01_core_%s.pdf'%(fig_dir,plot_type,filter_pop)) plot_list = pd.DataFrame({ 'X': xs, 'Y': ys, }) plot_list.to_csv( "{dir_res}/plotting_intergenic_LD_{maf}_{bin_size}_{geno}.csv".format( dir_res=fig_dir, maf=min_maf, bin_size=bin_size, geno=filter_pop)) return (plot_list)
def corr_est(map1, map2, noise1, noise2, freq1, freq2, lags=(), speedup=False, verbose=False): r"""Calculate the cross correlation function of the maps. The cross correlation function is a function of f1, f2 and angular lag. The angular lag bins are passed, all pairs of frequencies are calculated. Parameters ---------- lags: array like Angular lags bins (upper side bin edges). speedup: boolean Speeds up the correlation. This works fine, yes? Should be the normal way if so. Returns ------- corr: array The correlation between 2 maps. counts: array The weighting of the correlation based on the maps' weights. """ map1_ra = map1.get_axis('ra') map2_ra = map2.get_axis('ra') map1_dec = map1.get_axis('dec') map2_dec = map2.get_axis('dec') input_map1 = map1[freq1, :, :] input_map2 = map2[freq2, :, :] input_noise1 = noise1[freq1, :, :] input_noise2 = noise2[freq2, :, :] # Noise weight input_map1 *= input_noise1 input_map2 *= input_noise2 nlags = len(lags) nfreq = len(freq1) corr = sp.zeros((nfreq, nfreq, nlags), dtype=float) counts = sp.zeros(corr.shape, dtype=float) # Noting that if DEC != 0, then a degree of RA is less than a degree. ra_fact = sp.cos(sp.pi * map1.info['dec_centre'] / 180.0) # Calculate the pairwise lags. dra = (map1_ra[:, None] - map2_ra[None, :]) * ra_fact ddec = map1_dec[:, None] - map2_dec[None, :] lag = dra[:, None, :, None] ** 2 + ddec[None, :, None, :] ** 2 lag = sp.sqrt(lag) # Bin this up. lag_inds = sp.digitize(lag.flatten(), lags) if speedup: print "Starting Correlation (sparse version)" (nr1, nd1) = (len(map1_ra), len(map1_dec)) (nr2, nd2) = (len(map2_ra), len(map2_dec)) (r1ind, d1ind) = (sp.arange(nr1), sp.arange(nd1)) (r2ind, d2ind) = (sp.arange(nr2), sp.arange(nd2)) ra1_pairind = r1ind.repeat(nr2 * nd1 * nd2) ra2_pairind = sp.tile(r2ind.repeat(nd2), (1, nr1 * nd1)).flatten() dec1_pairind = sp.tile(d1ind.repeat(nr2 * nd2), (1, nr1)).flatten() dec2_pairind = sp.tile(d2ind, (1, nr1 * nr2 * nd1)).flatten() # precalculate the pair indices for a given lag # could also imagine calculating the map slices here posmaskdict = {} for klag in range(nlags): mask = (lag_inds == klag) posmaskdict[repr(klag)] = (ra1_pairind[mask], ra2_pairind[mask], dec1_pairind[mask], dec2_pairind[mask]) for if1 in range(len(freq1)): for jf2 in range(len(freq2)): start = time.time() data1 = input_map1[if1, :, :] data2 = input_map2[jf2, :, :] weights1 = input_noise1[if1, :, :] weights2 = input_noise2[jf2, :, :] for klag in range(nlags): (r1m, r2m, d1m, d2m) = posmaskdict[repr(klag)] dprod = data1[r1m, d1m] * data2[r2m, d2m] wprod = weights1[r1m, d1m] * weights2[r2m, d2m] corr[if1, jf2, klag] += sp.sum(dprod) counts[if1, jf2, klag] += sp.sum(wprod) if verbose: print if1, jf2, (time.time() - start) print counts[if1, jf2, :] else: print "Starting Correlation (full version)" for if1 in range(len(freq1)): for jf2 in range(len(freq2)): start = time.time() # Calculate the pairwise products. data1 = input_map1[if1, :, :] data2 = input_map2[jf2, :, :] weights1 = input_noise1[if1, :, :] weights2 = input_noise2[jf2, :, :] dprod = data1[..., None, None] * data2[None, None, ...] wprod = weights1[..., None, None] * \ weights2[None, None, ...] for klag in range(nlags): mask = (lag_inds == klag) corr[if1, jf2, klag] += sp.sum(dprod.flatten()[mask]) counts[if1, jf2, klag] += sp.sum(wprod.flatten()[mask]) if verbose: print if1, jf2, (time.time() - start) print counts[if1, jf2, :] mask = (counts < 1e-20) counts[mask] = 1 corr /= counts corr[mask] = 0 counts[mask] = 0 return corr, counts
def count_ld_indep_regions(res_file, num_traits=None, ss_file=None, ld_reg_map='/project/PCMA/faststorage/1_DATA/fourier_ls.hdf5'): # parse results.. print 'Parsing PCMA results' if ss_file is not None: chrom_res_dict = parse_PCMA_results(ss_file, res_file) else: chrom_res_dict = parse_PCMA_comb_results(res_file, num_traits) # Filter for good SNPs? # parse ldetect map print 'Loading ldetect map' ldr = h5py.File(ld_reg_map, 'r') num_new_hits = 0 num_comb_hits = 0 num_marg_hits = 0 num_shared_hits = 0 num_missed_hits = 0 chrom_bin_dict = {} res_summary_dict = {} for chrom in range(1, 23): print 'Working on chromosome %d' % chrom chrom_str = 'chr%d' % chrom res_dict = chrom_res_dict[chrom_str] chrom_bins = ldr[chrom_str] bin_indices = sp.digitize(res_dict['positions'], chrom_bins) chrom_bin_dict[chrom_str] = {'bin_indices':bin_indices, 'chrom_bins':chrom_bins, 'num_bins':len(chrom_bins) - 1} # Count things.. print 'Counting hits' # assert len(chrom_bins)-1==bin_indices.max()+1, 'WTF?' for bin_i in range(bin_indices.max() + 1): bin_filter = bin_indices == bin_i if sp.any(bin_filter): min_marg_pv = (res_dict['min_marg_ps'][bin_filter]).min() marg_hit = min_marg_pv < 5E-8 comb_ps = res_dict['comb_ps'][bin_filter] min_i = comb_ps.idxmin() min_comb_pv = comb_ps[min_i] min_sid = res_dict['sids'][min_i] comb_hit = min_comb_pv < 5E-8 if marg_hit: num_marg_hits += 1 if comb_hit: num_shared_hits += 1 num_comb_hits += 1 else: num_missed_hits += 1 elif comb_hit: num_new_hits += 1 num_comb_hits += 1 start_pos = chrom_bins[bin_i] if bin_i < len(chrom_bins) - 1: end_pos = chrom_bins[bin_i + 1] else: end_pos = -1 res_summary_dict[bin_i] = {'min_marg_pv':min_marg_pv, 'min_comb_pv':min_comb_pv, 'min_PC_pv': res_dict['pc_ps'].loc[min_i], 'min_sid':min_sid, 'chromsome':chrom, 'positions_bin':(start_pos, end_pos)} # More information on new hits somewhere print '\nResults summary: \n# new hits: %d \n# missed hits: %d \n# of shared hits: %d \n# multivar. hits: %d \n# marg. hits: %d \n' % (num_new_hits, num_missed_hits, num_shared_hits, num_comb_hits, num_marg_hits) print res_summary_dict
def bin(catalogue):
    return scipy.digitize(catalogue.distance(), radialedges, right=False) - 1
from scipy import mean, digitize, cumsum, array, concatenate, sort, split, set_printoptions
from scipy.stats import uniform
from QueueingTheory import mm1, getRandomArrivalServiceTimes
# QueueingTheory Module available on https://gist.github.com/siddhant3s/5665696

set_printoptions(precision = 3)
arrival_rate = 1
service_rate = 4/3.0
n_process = 10000
arrival_times, service_times = getRandomArrivalServiceTimes(n_process, arrival_rate, service_rate)

server_prob = array([0.2, 0.2, 0.2, 0.2, 0.2])
n_server = server_prob.size
# maps kth process to ith server
server_address_table = digitize(uniform.rvs(size = n_process), cumsum(server_prob))
server_arrival_times = [arrival_times[server_address_table == i] for i in range(n_server)]
server_service_times = [service_times[server_address_table == i] for i in range(n_server)]
results = map(mm1, server_arrival_times, server_service_times)
print "Mean Wait(1)", array([mean(result['wait_times']) for result in results])
print "Mean QueueSize(1)", array([mean(result['queue_size']) for result in results])

server_prob_matrix = array([[0.2, 0.2, 0.2, 0.2, 0.2],
                            [0.2, 0.2, 0.2, 0.2, 0.2],
                            [0.2, 0.2, 0.2, 0.2, 0.2],
                            [0.2, 0.2, 0.2, 0.2, 0.2],
                            [0.2, 0.2, 0.2, 0.2, 0.2]])
server_prob_matrix_cumsumed = cumsum(server_prob_matrix, axis = 1)
server_address_tables = [digitize(uniform.rvs(size = len(server_arrival_times[i])),
                                  server_prob_matrix_cumsumed[i])
                         for i in range(n_server)]
server_arrival_times = [sort(concatenate([results[i]['completion_times'][server_address_tables[i] == k]
                                          for i in range(n_server)]))
                        for k in range(n_server)]
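The routing trick used throughout these queueing scripts, digitize(uniform.rvs(size=n), cumsum(probs)), is inverse-CDF sampling of a categorical distribution. A small self-contained check with NumPy; note the old top-level scipy aliases such as scipy.digitize were thin wrappers around the NumPy functions and have been deprecated in modern SciPy.

import numpy as np

probs = np.array([0.2, 0.3, 0.5])
u = np.random.default_rng(4).uniform(size=100000)
assignment = np.digitize(u, np.cumsum(probs))    # 0, 1 or 2 per draw
print(np.bincount(assignment) / u.size)           # close to [0.2, 0.3, 0.5]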
def plot_gw_r2_decay(file_prefix, num_random_xs=200, max_dist=1000000, call_method_id=78, mac_filter=15, debug_filter=1): """ Plots r2 decay on the genome-wide scale """ dtype = 'single' #To increase matrix multiplication speed... using 32 bits. sd = dp.load_snps_call_method(call_method_id=call_method_id, debug_filter=debug_filter, min_mac=mac_filter) #sd.filter_mac_snps(mac_filter) h_inverse_matrix_file = env[ 'data_dir'] + 'snp_cov_mat_h_inv_cm%d.pickled' % (call_method_id) if not os.path.isfile(h_inverse_matrix_file): K = sd.get_snp_cov_matrix() H_sqrt = lm.cholesky(K) H_sqrt_inv = (H_sqrt).I with file(h_inverse_matrix_file, 'wb') as f: cPickle.dump(H_sqrt_inv, f, protocol=2) else: with file(h_inverse_matrix_file) as f: H_sqrt_inv = cPickle.load(f) cps_list = sd.getChrPosSNPList() x_cps = random.sample(cps_list, num_random_xs) y_cps = cps_list result_dict = {} n = len(sd.accessions) print 'Starting calculation' sys.stdout.flush() dists = [] r2s = [] t_r2s = [] x_macs = [] y_macs = [] n_saved = 0 s1 = time.time() for i, (x_c, x_p, x_snp) in enumerate(x_cps): print '%d: chromosome=%d, position=%d' % (i, x_c, x_p) #Normalize SNP.. xs = sp.array(x_snp) x_mac = sum(xs) t_x_snp = sp.dot(((xs - sp.mean(xs)) / sp.std(xs)), H_sqrt_inv).T for (y_c, y_p, y_snp) in reversed(y_cps): if x_c != y_c: continue if abs(x_p - y_p) > max_dist: continue ys = sp.array(y_snp) x_macs.append(x_mac) y_macs.append(sum(ys)) (r, pearson_pval) = st.pearsonr(xs, ys) r2 = r * r t_y_snp = sp.dot(((ys - sp.mean(ys)) / sp.std(ys)), H_sqrt_inv).T (t_r, t_pearson_pval) = st.pearsonr( t_x_snp, t_y_snp) #Done twice, but this is fast.. t_r, t_pearson_pval = float(t_r), float(t_pearson_pval) t_r2 = t_r * t_r dists.append(abs(x_p - y_p)) r2s.append(r2) t_r2s.append(t_r2) n_saved += 1 time_secs = time.time() - s1 print 'It took %d minutes and %d seconds to finish.' % (time_secs / 60, time_secs % 60) print '%d values were saved.' % n_saved sys.stdout.flush() #Now plotting and binning.. for m_dist in [50000, 100000, 200000, 500000, 1000000]: kbs = m_dist / 1000 bin_ids = sp.digitize(dists, sp.arange(0, m_dist, m_dist / 100)) - 1 bin_dict = {} for bid in range(100): bin_dict[bid] = {'r2s': [], 't_r2s': []} filtered_r2s = [] filtered_t_r2s = [] filtered_dists = [] for bid, r2, t_r2, dist in izip(bin_ids, r2s, t_r2s, dists): if dist > m_dist: continue bin_dict[bid]['r2s'].append(r2) filtered_r2s.append(r2) bin_dict[bid]['t_r2s'].append(t_r2) filtered_t_r2s.append(t_r2) filtered_dists.append(dist) pylab.figure() pylab.plot(filtered_dists, filtered_r2s, alpha=0.3, color='k', marker='.', ls='None') pylab.xlabel('Distance (bases)') pylab.ylabel(r'$r^2$') pylab.savefig(file_prefix + '_%dkb_r2s.png' % (kbs)) pylab.figure() pylab.plot(filtered_dists, filtered_t_r2s, alpha=0.3, color='k', marker='.', ls='None') pylab.xlabel('Distance (bases)') pylab.ylabel(r'$r^2$') pylab.savefig(file_prefix + '_%dkb_t_r2s.png' % (kbs)) r2_avgs = [] t_r2_avgs = [] xs = [] l = sp.arange(0, m_dist, m_dist / 100) + (m_dist / 200) for bid in range(100): n = len(bin_dict[bid]['r2s']) if n > 0: r2_avgs.append(sp.sum(bin_dict[bid]['r2s']) / n) t_r2_avgs.append(sp.sum(bin_dict[bid]['t_r2s']) / n) xs.append(l[bid]) pylab.figure() pylab.plot(xs, r2_avgs, alpha=0.7, color='b', lw=1.8, label=r'standard $r^2$') pylab.plot(xs, t_r2_avgs, alpha=0.7, color='m', lw=1.8, label=r'transformed $r^2$') pylab.legend(loc=1) pylab.xlabel('Distance (bases)') pylab.ylabel(r'$r^2$') pylab.savefig(file_prefix + '_%dkb_r2s_avgs.png' % (kbs))