def _elbo_t(logp, uw, inarray, n_mcsamples, random_seed):
    """Create Theano tensor of approximate ELBO by Monte Carlo sampling."""
    l = (uw.size / 2).astype('int64')
    u = uw[:l]
    w = uw[l:]

    # Callable tensor
    logp_ = lambda input: theano.clone(logp, {inarray: input}, strict=False)

    # Naive Monte-Carlo
    r = MRG_RandomStreams(seed=random_seed)

    if n_mcsamples == 1:
        n = r.normal(size=inarray.tag.test_value.shape)
        q = n * exp(w) + u
        elbo = logp_(q) + tt.sum(w) + 0.5 * l * (1 + np.log(2.0 * np.pi))
    else:
        n = r.normal(size=(n_mcsamples, u.tag.test_value.shape[0]))
        qs = n * exp(w) + u
        logps, _ = theano.scan(fn=lambda q: logp_(q),
                               outputs_info=None,
                               sequences=[qs])
        elbo = tt.mean(logps) + tt.sum(w) + 0.5 * l * (1 + np.log(2.0 * np.pi))

    return elbo
def show_overlay(img3d, cc3d, ncc=10, s=85, xyz='xy', alpha=.8):
    """Shows the connected components overlayed over img3d

    Input
    ======
    img3d -- 3d array
    cc3d  -- 3d array (preferably of same shape as img3d, use get_3d_cc(...))
    ncc   -- where to cut off the color scale
    s     -- slice to show
    xyz   -- which projection to use in {'xy','xz','yz'}
    """
    cc = get_slice(cc3d, s, xyz)
    img = get_slice(img3d, s, xyz)
    notcc = np.isnan(cc)
    incc = np.not_equal(notcc, True)
    img4 = plt.cm.gray(img / np.nanmax(img))
    if ncc is not np.Inf:
        cc = plt.cm.jet(cc / float(ncc))
    else:
        cc = plt.cm.jet(np.log(cc) / np.log(np.nanmax(cc)))
    cc[notcc, :] = img4[notcc, :]
    cc[incc, 3] = 1 - img[incc] / (2 * np.nanmax(img))
    plt.imshow(cc)
def chol_logdet(U):
    if isinstance(U, np.ndarray):
        return 2 * np.sum(np.log(np.diag(U)))
    elif isinstance(U, cholmod.Factor):
        return np.sum(np.log(U.D()))
    else:
        raise ValueError("Unknown type of Cholesky factor")
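# Illustrative usage sketch: for a dense SPD matrix the Cholesky-based
# log-determinant should agree with np.linalg.slogdet (numpy as np and the
# cholmod module are assumed to be imported as in the function above).
A_demo = np.array([[4.0, 1.0],
                   [1.0, 3.0]])
U_demo = np.linalg.cholesky(A_demo).T   # upper-triangular Cholesky factor
assert np.isclose(chol_logdet(U_demo), np.linalg.slogdet(A_demo)[1])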
def joint_logdist(pi, alpha, sigma, tau, u):
    abs_pi = len(pi)
    n = np.sum(pi)
    tmp = abs_pi * log(alpha) + (n - 1.) * log(u) - gammaln(n) \
        - (n - sigma * abs_pi) * log(u + tau) \
        - (alpha / sigma) * ((u + tau) ** sigma - tau ** sigma)
    tmp += np.sum(gammaln(pi - sigma) - gammaln(1. - sigma))
    return tmp
def _lmvnpdffull(obs, means, covars):
    """
    Log probability for full covariance matrices.

    WARNING: In certain cases, this function will modify in-place
    some of the covariance matrices
    """
    from scipy import linalg
    import itertools
    if hasattr(linalg, 'solve_triangular'):
        # only in scipy since 0.9
        solve_triangular = linalg.solve_triangular
    else:
        # slower, but works
        solve_triangular = linalg.solve
    n_obs, n_dim = obs.shape
    nmix = len(means)
    log_prob = np.empty((n_obs, nmix))
    for c, (mu, cv) in enumerate(itertools.izip(means, covars)):
        try:
            cv_chol = linalg.cholesky(cv, lower=True)
        except linalg.LinAlgError:
            # The model is most probably stuck in a component with too
            # few observations, we need to reinitialize this component
            cv[:] = 10 * np.eye(cv.shape[0])
            cv_chol = cv
        cv_log_det = 2 * np.sum(np.log(np.diagonal(cv_chol)))
        cv_sol = solve_triangular(cv_chol, (obs - mu).T, lower=True).T
        log_prob[:, c] = -.5 * (np.sum(cv_sol ** 2, axis=1) +
                                n_dim * np.log(2 * np.pi) + cv_log_det)
    return log_prob
def ll(actual, predicted):
    """
    Computes the log likelihood.

    This function computes the log likelihood between two numbers,
    or element-wise between a pair of lists or numpy arrays.

    Parameters
    ----------
    actual : int, float, list of numbers, numpy array
        The ground truth value
    predicted : same type as actual
        The predicted value

    Returns
    -------
    score : double or list of doubles
        The log likelihood error between actual and predicted
    """
    actual = np.array(actual)
    predicted = np.array(predicted)
    err = np.seterr(all='ignore')
    score = -(actual * np.log(predicted) +
              (1 - actual) * np.log(1 - predicted))
    np.seterr(divide=err['divide'], over=err['over'], under=err['under'],
              invalid=err['invalid'])
    if type(score) == np.ndarray:
        score[np.isnan(score)] = 0
    else:
        if np.isnan(score):
            score = 0
    return score
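# Illustrative calls (values chosen arbitrarily): a confident correct
# prediction has a small log likelihood error, and the function also works
# element-wise on lists.
print(ll(1, 0.9))              # ~0.105 (= -log 0.9)
print(ll([1, 0], [0.9, 0.1]))  # ~[0.105, 0.105]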
def log_diff_exp(x, axis=0):
    """ Calculates the logarithm of the diffs of e to the power of input 'x'.
        The method tries to avoid overflows by using the relationship:
        log(diff(exp(x))) = alpha + log(diff(exp(x-alpha))).

        :Parameter:
            x:    data.
                  -type: float or numpy array

            axis: Diffs along the given axis.
                  -type: int

        :Return:
            Logarithm of the diff of exp of x.
                  -type: float or numpy array.
    """
    alpha = x.max(axis) - numx.log(numx.finfo(numx.float64).max) / 2.0
    if axis == 1:
        return numx.squeeze(alpha + numx.log(
            numx.diff(numx.exp(x.T - alpha), n=1, axis=0)))
    else:
        return numx.squeeze(alpha + numx.log(
            numx.diff(numx.exp(x - alpha), n=1, axis=0)))
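# Quick numerical check on a small, increasing input (assumes `numx` is this
# module's alias for numpy): the stabilised computation matches the naive
# log(diff(exp(x))), which would overflow for large x.
x_demo = numx.array([0.0, 1.0, 2.5])
assert numx.allclose(log_diff_exp(x_demo),
                     numx.log(numx.diff(numx.exp(x_demo))))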
def hqic(self):
    nobs = self.nobs
    # Lutkepohl
    # return np.log(self.sigma2) + 2 * np.log(np.log(nobs))/nobs * self.k_ar
    # R uses all estimated parameters rather than just lags
    return (np.log(self.sigma2) + 2 * np.log(np.log(nobs)) / nobs *
            (1 + self.df_model))
def computeCost(theta, X, y):
    theta.shape = (1, 3)
    m = y.size
    z = X.dot(theta.T)
    h = 1.0 / (1.0 + e ** (-1.0 * z))
    J = (1.0 / m) * ((-y.T.dot(log(h))) - ((1.0 - y.T).dot(log(1.0 - h))))
    return 1 * J.sum()
def sigma2(powerSpec, V, kmin, kmax, points=500, window='tophat_x'):
    # P is a function; the following are arrays
    P = interp1d(powerSpec[:, 0], powerSpec[:, 1], kind='linear',
                 bounds_error=False, fill_value=0.)
    lnk = np.linspace(np.log(kmin), np.log(kmax), points,
                      endpoint=False) + (np.log(kmax) - np.log(kmin)) / points / 2
    k = np.exp(lnk)
    if isinstance(window, str):
        if window == 'gaus':
            W = gaussian(k, V)
        elif window == 'tophat_x':
            W = tophat_xspace(k, V)
        elif window == 'tophat_k':
            W = tophat_kspace(k, V)
        elif window == 'triangle_k':
            W = triangle_kspace(k, V)
        elif window == 'tukey_k':
            W = tukey_kspace(k, V)
    elif isinstance(window, tuple):
        if window[0] == 'tukey_k':
            W = tukey_kspace(k, V, window[1])
    integrand = k**3 * P(k) / (2. * np.pi**2) * np.abs(W)**2
    return np.sum(integrand) * (lnk[1] - lnk[0])
def compute_cost(X, y, theta, lam):
    '''Compute cost for logistic regression.'''
    # Number of training examples
    m = y.shape[0]

    # Compute the prediction based on theta and X
    predictions = X.dot(theta)

    # Preprocessing values before sending to the sigmoid function.
    # If the argument to the sigmoid function is large (>= 20), the sigmoid
    # value is effectively 1; similarly, very negative arguments are clamped.
    predictions[where(predictions >= 20)] = 20
    predictions[where(predictions <= -500)] = -500
    hypothesis = sigmoid(predictions)
    hypothesis[where(hypothesis == 1.0)] = 0.99999

    # Part of the cost function without regularization
    J1 = (-1.0 / m) * sum((y * np.log(hypothesis)) +
                          ((1.0 - y) * np.log(1.0 - hypothesis)))

    # Computing the regularization term
    J2 = lam / (2.0 * m) * sum(theta[1:, ] * theta[1:, ])
    error = hypothesis - y
    return J1 + J2
def _fgreen3d(self, z, y, x):
    '''Return the periodic integrated greens function on the 'original' domain
    Qiang, Lidia, Ryne, Limborg-Deprey, PRSTAB 10, 129901 (2007)
    Args:
        x, y, z: arrays, e.g. x, y, z = np.meshgrid(xx, yy, zz)
    '''
    abs_r = np.sqrt(x * x + y * y + z * z)
    inv_abs_r = 1. / abs_r
    tmpfgreen = (-(+ z * z * np.arctan(x * y * inv_abs_r / z)
                   + y * y * np.arctan(x * z * inv_abs_r / y)
                   + x * x * np.arctan(y * z * inv_abs_r / x)) / 2.
                 + y * z * np.log(x + abs_r)
                 + x * z * np.log(y + abs_r)
                 + x * y * np.log(z + abs_r))

    fgreen = np.zeros((2 * self.mesh.nz, 2 * self.mesh.ny, 2 * self.mesh.nx),
                      dtype=np.complex128)
    # evaluate the indefinite integral per cell (int_a^b f = F(b) - F(a))
    fgreen[:self.mesh.nz, :self.mesh.ny, :self.mesh.nx] = (
          tmpfgreen[1:, 1:, 1:]
        - tmpfgreen[:-1, 1:, 1:]
        - tmpfgreen[1:, :-1, 1:]
        + tmpfgreen[:-1, :-1, 1:]
        - tmpfgreen[1:, 1:, :-1]
        + tmpfgreen[:-1, 1:, :-1]
        + tmpfgreen[1:, :-1, :-1]
        - tmpfgreen[:-1, :-1, :-1]
    ) * 1. / self.mesh.volume_elem  # divide by vol_elem to average!

    return fgreen
def __init__(self, shape, successes, trials=None, coef=1., offset=None,
             quadratic=None, initial=None):
    smooth_atom.__init__(self, shape, offset=offset, quadratic=quadratic,
                         initial=initial, coef=coef)

    if sparse.issparse(successes):
        # Convert sparse success vector to an array
        self.successes = successes.toarray().flatten()
    else:
        self.successes = np.asarray(successes)

    if trials is None:
        if not set([0, 1]).issuperset(np.unique(self.successes)):
            raise ValueError("Number of successes is not binary - must specify number of trials")
        self.trials = np.ones(self.successes.shape, np.float)
    else:
        if np.min(trials - self.successes) < 0:
            raise ValueError("Number of successes greater than number of trials")
        if np.min(self.successes) < 0:
            raise ValueError("Response coded as negative number - should be non-negative number of successes")
        self.trials = trials * 1.

    saturated = self.successes / self.trials
    deviance_terms = np.log(saturated) * self.successes + \
        np.log(1 - saturated) * (self.trials - self.successes)
    deviance_constant = -2 * coef * deviance_terms[~np.isnan(deviance_terms)].sum()

    devq = identity_quadratic(0, 0, 0, -deviance_constant)
    self.quadratic += devq
def convergence_rates(m, solver_function, num_periods=8):
    """
    Return m-1 empirical estimates of the convergence rate, obtained from
    m simulations in which the time step is halved for each successive run.
    solver_function(U, omega, tau, T) solves each problem, where T is
    obtained from the computation over num_periods periods.
    """
    from math import pi
    omega = 0.35
    U = 0.3              # just some chosen values
    P = 2 * pi / omega   # period
    tau = P / 30         # 30 time steps per period 2*pi/omega
    T = P * num_periods

    tau_values = []
    E_values = []
    for i in range(m):
        u, t = solver_function(U, omega, tau, T, 1)
        u_e = u_exact(t, U, omega)
        E = np.sqrt(tau * np.sum((u_e - u)**2))
        tau_values.append(tau)
        E_values.append(E)
        tau = tau / 2

    r = [np.log(E_values[i - 1] / E_values[i]) /
         np.log(tau_values[i - 1] / tau_values[i])
         for i in range(1, m, 1)]
    return r
def klBern(x, y):
    r""" Kullback-Leibler divergence for Bernoulli distributions.
    https://en.wikipedia.org/wiki/Bernoulli_distribution#Kullback.E2.80.93Leibler_divergence

    .. math:: \mathrm{KL}(\mathcal{B}(x), \mathcal{B}(y)) = x \log(\frac{x}{y}) + (1-x) \log(\frac{1-x}{1-y}).
    """
    x = min(max(x, eps), 1 - eps)
    y = min(max(y, eps), 1 - eps)
    return x * np.log(x / y) + (1 - x) * np.log((1 - x) / (1 - y))
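# Small usage sketch; `eps` is assumed to be a module-level clipping constant
# (e.g. 1e-15), as implied by the function body.
eps = 1e-15
print(klBern(0.5, 0.5))   # 0.0 -- identical Bernoulli distributions
print(klBern(0.2, 0.5))   # ~0.193 nats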
def llr(sf_data, sf_map, coreference_map, sums, i1, i2):
    # i1 = sf_index(sf_data, sf1)
    # i2 = sf_index(sf_data, sf2)

    # compute actual cell frequencies
    # - outer cells
    ndd = float(sf_data['next-mention-index'])
    npd = float(sums[0, i1])
    ndp = float(sums[0, i2])
    nnd = ndd - npd
    ndn = ndd - ndp
    # - inner cells
    mentions = [i for i in get_mentions(sf_data, sf_map, coreference_map, i1)
                if i in get_mentions(sf_data, sf_map, coreference_map, i2)]
    npp = float(len(mentions))
    npn = npd - npp
    nnp = ndp - npp
    nnn = nnd - nnp

    # compute (randomly) predicted cell frequencies
    enn = nnd * ndn / ndd
    enp = nnd * ndp / ndd
    epn = npd * ndn / ndd
    epp = npd * ndp / ndd
    # print npd, ndp, npp, ndd

    # compute log-likelihood ratio
    result = 0.0
    if nnn > 0:
        result += nnn * np.log(nnn / enn)
    if nnp > 0:
        result += nnp * np.log(nnp / enp)
    if npn > 0:
        result += npn * np.log(npn / epn)
    if npp > 0:
        result += npp * np.log(npp / epp)
    return 2.0 * result
def compute_edge_weights(edge_ids, edge_probabilities, beta):
    """
    Convert edge probabilities to energies for the multicut problem.

    edge_ids:
        The list of edges in the graph. shape=(N, 2)
    edge_probabilities:
        1-D, float (1.0 means edge is CUT, disconnecting the two SPs)
    beta:
        scalar (float)

    Special behavior:
        If any node has ID 0, all of its edges will be given an
        artificially low energy, to prevent it from merging with its
        neighbors, regardless of what the edge_probabilities say.
    """
    p1 = edge_probabilities  # P(Edge=CUT)
    p1 = np.clip(p1, 0.001, 0.999)
    p0 = 1.0 - p1  # P(Edge=NOT CUT)

    edge_weights = np.log(p0 / p1) + np.log((1 - beta) / beta)

    # See note about special behavior, above
    edges_touching_zero = edge_ids[:, 0] == 0
    if edges_touching_zero.any():
        logger.warn("Volume contains label 0, which will be excluded from the segmentation.")
        MINIMUM_ENERGY = -1000.0
        edge_weights[edges_touching_zero] = MINIMUM_ENERGY

    return edge_weights
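# Worked example of the weight formula (values made up): with p1 = P(cut) = 0.8
# and beta = 0.5 the prior term log((1-beta)/beta) vanishes, so the energy is
# log(0.2/0.8) ~ -1.386, i.e. cutting this edge is favoured.
demo_edge_ids = np.array([[1, 2]])
print(compute_edge_weights(demo_edge_ids, np.array([0.8]), 0.5))   # ~[-1.386]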
def test_skewed_chi2_sampler():
    """test that SkewedChi2Sampler approximates kernel on random data"""
    # compute exact kernel
    c = 0.03
    # abbreviations for an easier formula
    X_c = (X + c)[:, np.newaxis, :]
    Y_c = (Y + c)[np.newaxis, :, :]

    # we do it in log-space in the hope that it's more stable
    # this array is n_samples_x x n_samples_y big x n_features
    log_kernel = ((np.log(X_c) / 2.) + (np.log(Y_c) / 2.) + np.log(2.) -
                  np.log(X_c + Y_c))
    # reduce to n_samples_x x n_samples_y by summing over features in log-space
    kernel = np.exp(log_kernel.sum(axis=2))

    # approximate kernel mapping
    transform = SkewedChi2Sampler(skewedness=c, n_components=1000,
                                  random_state=42)
    X_trans = transform.fit_transform(X)
    Y_trans = transform.transform(Y)

    kernel_approx = np.dot(X_trans, Y_trans.T)
    assert_array_almost_equal(kernel, kernel_approx, 1)

    # test error is raised on negative input
    Y_neg = Y.copy()
    Y_neg[0, 0] = -1
    assert_raises(ValueError, transform.transform, Y_neg)
def B(x, XW, n1, n2, kernel, logproductExpectations=None):
    r"""Computes B(x) = \int \Sigma_{0}(x, w, XW[0:n1], XW[n1:n1+n2]) dp(w).

    Args:
        x: Vector of points where B is evaluated
        XW: Point (x, w)
        n1: Dimension of x
        n2: Dimension of w
        kernel
        logproductExpectations: Vector with the logarithm of the product of
            the expectations of np.exp(-alpha2[j]*((z-W[i,j])**2))
            where W[i,:] is a point in the history.
    """
    x = np.array(x).reshape((x.shape[0], n1))
    results = np.zeros(x.shape[0])
    # parameterLamb = parameterSetsPoisson
    X = XW[0:n1]
    inda = n1 + n2
    W = XW[n1:inda]
    alpha2 = 0.5 * ((kernel.alpha[n1:n1 + n2])**2) / scaleAlpha[n1:n1 + n2]**2
    alpha1 = 0.5 * ((kernel.alpha[0:n1])**2) / scaleAlpha[0:n1]**2
    variance0 = kernel.variance
    if logproductExpectations is None:
        logproductExpectations = 0.0
        for j in xrange(n2):
            temp = expectation(W[j], alpha2[j])
            logproductExpectations += np.log(temp)
    for i in xrange(x.shape[0]):
        results[i] = logproductExpectations + np.log(variance0) - \
            np.sum(alpha1 * ((x[i, :] - X)**2))
    return np.exp(results)
def normalize_input(params):
    if pc_id == 0:
        print 'normalize_input'
    dt = params['dt_rate']  # [ms] time step for the non-homogenous Poisson process
    L_input = np.zeros((params['n_exc'], params['t_stimulus'] / dt))

    v_max = params['v_max']
    if params['log_scale'] == 1:
        v_rho = np.linspace(v_max / params['N_V'], v_max,
                            num=params['N_V'], endpoint=True)
    else:
        v_rho = np.logspace(np.log(v_max / params['N_V']) / np.log(params['log_scale']),
                            np.log(v_max) / np.log(params['log_scale']),
                            num=params['N_V'], endpoint=True,
                            base=params['log_scale'])
    v_theta = np.linspace(0, 2 * np.pi, params['N_theta'], endpoint=False)

    index = 0
    for i_RF in xrange(params['N_RF_X'] * params['N_RF_Y']):
        index_start = index
        for i_v_rho, rho in enumerate(v_rho):
            for i_theta, theta in enumerate(v_theta):
                fn = params['input_rate_fn_base'] + str(index) + '.dat'
                L_input[index, :] = np.loadtxt(fn)
                print 'debug', fn
                index += 1

        index_stop = index
        print 'before', i_RF, L_input[index_start:index_stop, :].sum()
        if (L_input[index_start:index_stop, :].sum() > 1):
            L_input[index_start:index_stop, :] /= L_input[index_start:index_stop, :].sum()
        print 'after', i_RF, L_input[index_start:index_stop, :].sum()

    for i in xrange(params['n_exc']):
        output_fn = params['input_rate_fn_base'] + str(i) + '.dat'
        print 'output_fn:', output_fn
        np.savetxt(output_fn, L_input[i, :])

    if comm != None:
        comm.barrier()
def gm_assign_to_cluster(X, center_list, cov_list, p_k):
    """Assigns each sample to one of the Gaussian clusters given.

    Returns an array with numbers, 0 corresponding to the first cluster
    in the cluster list.
    """
    # Reused code from E-step, should be unified somehow:
    samples = X.shape[0]
    K = len(center_list)
    log_p_Xn_mat = np.zeros((samples, K))
    for k in range(K):
        log_p_Xn_mat[:, k] = logmulnormpdf(X, center_list[k], cov_list[k]) + np.log(p_k[k])
    pmax = np.max(log_p_Xn_mat, axis=1)
    log_p_Xn = pmax + np.log(np.sum(np.exp(log_p_Xn_mat.T - pmax), axis=0).T)
    logL = np.sum(log_p_Xn)

    log_p_nk = np.zeros((samples, K))
    for k in range(K):
        # log_p_nk[:,k] = logmulnormpdf(X, center_list[k], cov_list[k]) + np.log(p_k[k]) - log_p_Xn
        log_p_nk[:, k] = log_p_Xn_mat[:, k] - log_p_Xn
    print log_p_nk

    # Assign to cluster:
    maxP_k = np.c_[np.max(log_p_nk, axis=1)] == log_p_nk
    # print np.max(log_p_nk, axis=1)
    maxP_k = maxP_k * (np.array(range(K)) + 1)
    return np.sum(maxP_k, axis=1) - 1
def all_GL(self, q, maxpiv=None):
    """return (piv, f_binodal_gas, f_binodal_liquid, f_spinodal_gas,
    f_spinodal_liquid) at insertion works piv sampled between the critical
    point and maxpiv (default to 2.2*critical pressure)"""
    fc, pivc = self.critical_point(q)
    Fc = np.log(fc)
    # start sensibly above the critical point
    startp = pivc * 1.1
    fm = fminbound(self.mu, fc, self.maxf(), args=(startp, q))
    fM = fminbound(lambda f: -self.pv(f, startp, q), 0, fc)
    initial_guess = np.log([0.5 * fM, 0.5 * (fm + self.maxf())])
    # construct the top of the GL binodal
    if maxpiv is None:
        maxpiv = startp * 2
    topp = 1. / np.linspace(1. / startp, 1. / maxpiv)
    topGL = [initial_guess]
    for piv in topp:
        topGL.append(self.binodalGL(piv, q, topGL[-1]))
    # construct the GL binodal between the starting piv and the critical point
    botp = np.linspace(startp, pivc)[:-1]
    botGL = [initial_guess]
    for piv in botp:
        botGL.append(self.binodalGL(piv, q, botGL[-1]))
    # join the two results and convert back from log
    binodal = np.vstack((
        [[pivc, fc, fc]],
        np.column_stack((botp, np.exp(botGL[1:])))[::-1],
        np.column_stack((topp, np.exp(topGL[1:])))[1:]
    ))
    # spinodal at the same pivs
    spinodal = self.spinodalGL(q, binodal[:, 0])
    # join everything
    return np.column_stack((binodal, spinodal[:, 1:]))
def loglike(self, endog, mu, scale=1.):
    """
    Loglikelihood function for Gamma exponential family distribution.

    Parameters
    ----------
    endog : array-like
        Endogenous response variable
    mu : array-like
        Fitted mean response variable
    scale : float, optional
        The default is 1.

    Returns
    -------
    llf : float
        The value of the loglikelihood function evaluated at
        (endog, mu, scale) as defined below.

    Notes
    -----
    llf = -1/scale * sum(endog/mu + log(mu) + (scale-1)*log(endog) +
                         log(scale) + scale*gammaln(1/scale))

    where gammaln is the log gamma function.
    """
    return -1. / scale * np.sum(endog / mu + np.log(mu) +
                                (scale - 1) * np.log(endog) +
                                np.log(scale) +
                                scale * special.gammaln(1. / scale))
def nie_all(xi1, xi2, xc1, xc2, b, s, q, rot, ys1, ys2):
    x1, x2 = xy_rotate(xi1, xi2, xc1, xc2, rot)

    wx = np.sqrt(q * q * (x1 * x1 + s * s) + x2 * x2)

    al1 = b / np.sqrt(1 - q * q) * np.arctan(x1 * np.sqrt(1 - q * q) / (wx + s))
    al2 = b / np.sqrt(1 - q * q) * np.arctanh(x2 * np.sqrt(1 - q * q) / (wx + q * q * s))

    kappa = b / (2.0 * wx)
    hx = np.sqrt((wx + s) ** 2.0 + (1 - q * q) * x1 * x1)
    phi = x1 * al1 + x2 * al2 - b * s * np.log(hx) + b * q * s * np.log((1 + q) * s)

    Kc = 1.0
    # Kc = (1.0+zl)/c*(Dl*Ds/Dls)
    td = Kc * (0.5 * ((al1) ** 2.0 + (al2) ** 2.0) - phi)
    # td = Kc*(0.5*((x1-ys1)**2.0+(x2-ys2)**2.0)-phi)

    y1 = x1 - al1
    y2 = x2 - al2

    y1, y2 = xy_rotate(y1, y2, xc1, xc2, -rot)

    # ------------------------------------------------------------------
    demon1 = ((wx + s) ** 2 + (1.0 - q * q) * x1 * x1) * wx
    demon2 = ((wx + q * q * s) ** 2 - (1.0 - q * q) * x2 * x2) * wx
    y11 = 1 - b * (wx * (wx + s) - q * q * x1 * x1) / demon1
    y22 = 1 - b * (wx * (wx + q * q * s) - x2 * x2) / demon2
    y12 = -b * x1 * x2 / demon1
    y21 = -b * x1 * x2 * q * q / demon2

    mu = 1.0 / (y11 * y22 - y12 * y21)

    return phi, td, al1, al2, kappa, mu, y1, y2
def loglike(self, endog, mu, scale=1.):
    """
    Loglikelihood function for inverse Gaussian distribution.

    Parameters
    ----------
    endog : array-like
        Endogenous response variable
    mu : array-like
        Fitted mean response variable
    scale : float, optional
        The default is 1.

    Returns
    -------
    llf : float
        The value of the loglikelihood function evaluated at
        (endog, mu, scale) as defined below.

    Notes
    -----
    `llf` = -(1/2.)*sum((endog-mu)**2/(endog*mu**2*scale)
                        + log(scale*endog**3) + log(2*pi))
    """
    return -.5 * np.sum((endog - mu)**2 / (endog * mu**2 * scale) +
                        np.log(scale * endog**3) + np.log(2 * np.pi))
def _ComputeSampledLogitsNP(self, true_w, true_b, sampled_w, sampled_b,
                            hidden_acts, num_true=1,
                            true_expected=None, sampled_expected=None):
    batch_size, dim = hidden_acts.shape
    true_logits = np.sum(
        hidden_acts.reshape((batch_size, 1, dim)) *
        true_w.reshape((batch_size, num_true, dim)),
        axis=2)
    true_b = true_b.reshape((batch_size, num_true))
    true_logits += true_b
    sampled_logits = np.dot(hidden_acts, sampled_w.T) + sampled_b

    if true_expected is not None:
        true_logits -= np.log(true_expected)
    if sampled_expected is not None:
        sampled_logits -= np.log(sampled_expected[np.newaxis, :])

    out_logits = np.concatenate([true_logits, sampled_logits], axis=1)
    out_labels = np.hstack((np.ones_like(true_logits) / num_true,
                            np.zeros_like(sampled_logits)))

    return out_logits, out_labels
def _SigmoidCrossEntropyWithLogits(logits, targets):
    # logits, targets: float arrays of the same shape.
    assert logits.shape == targets.shape
    pred = 1. / (1. + np.exp(-logits))
    eps = 0.0001
    pred = np.minimum(np.maximum(pred, eps), 1 - eps)
    return -targets * np.log(pred) - (1. - targets) * np.log(1. - pred)
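# Sanity check: at logits = 0 the prediction is exactly 0.5, so the loss is
# log(2) regardless of the target value.
demo_logits = np.zeros((2, 3))
demo_targets = np.array([[0., 1., 0.], [1., 0., 1.]])
assert np.allclose(_SigmoidCrossEntropyWithLogits(demo_logits, demo_targets),
                   np.log(2.0))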
def __init__(self, ps=None, sigma_v=0.0, redshift=0.0, **kwargs):
    if ps == None:
        from os.path import join, dirname
        # psfile = join(dirname(__file__), "data/ps_z1.5.dat")
        # psfile = join(dirname(__file__), "data/wigglez_halofit_z1.5.dat")
        psfile = join(dirname(__file__), "data/wigglez_halofit_z0.8.dat")
        print "loading matter power file: " + psfile

        redshift = 0.8

        # pk_interp = cs.LogInterpolater.fromfile(psfile)
        pwrspec_data = np.genfromtxt(psfile)

        (log_k, log_pk) = (np.log(pwrspec_data[:, 0]),
                           np.log(pwrspec_data[:, 1]))

        logpk_interp = interpolate.interp1d(log_k, log_pk,
                                            bounds_error=False,
                                            fill_value=np.min(log_pk))

        pk_interp = lambda k: np.exp(logpk_interp(np.log(k)))

        kstar = 7.0
        ps = lambda k: np.exp(-0.5 * k**2 / kstar**2) * pk_interp(k)

    self._sigma_v = sigma_v

    RedshiftCorrelation.__init__(self, ps_vv=ps, redshift=redshift)
def test_anisotropic_power():
    for n_coeffs in [6, 15, 28, 45, 66, 91]:
        for norm_factor in [0.0005, 0.00001]:
            # Create some really simple cases:
            coeffs = np.ones((3, n_coeffs))
            max_order = calculate_max_order(coeffs.shape[-1])
            # For the case where all coeffs == 1, the ap is simply log of the
            # number of even orders up to the maximal order:
            analytic = (np.log(len(range(2, max_order + 2, 2))) -
                        np.log(norm_factor))

            answers = [analytic] * 3
            apvals = anisotropic_power(coeffs, norm_factor=norm_factor)
            assert_array_almost_equal(apvals, answers)
            # Test that this works for single voxel arrays as well:
            assert_array_almost_equal(
                anisotropic_power(coeffs[1], norm_factor=norm_factor),
                answers[1])

    # Test that even when we look at an all-zeros voxel, this
    # avoids a log-of-zero warning:
    with warnings.catch_warnings(record=True) as w:
        assert_equal(anisotropic_power(np.zeros(6)), 0)
        assert len(w) == 0
def test_der_log():
    x = np.linspace(0.001, 5, 6)
    h = 1e-15
    der1 = np.log(bicomplex(x + h * 1j, 0)).imag1 / h
    np.testing.assert_allclose(der1, 1. / x)
    der2 = np.log(bicomplex(x + h * 1j, h)).imag12 / h**2
    np.testing.assert_allclose(der2, -1. / x**2)
e1 = np.zeros((len(n)))
e2 = np.zeros((len(n)))
e3 = np.zeros((len(n)))

for i in range(len(n)):
    I1[i], I2[i], I3[i] = SimpsonIntegrate(func1, 0, 1, n[i])

for i in range(len(n)):
    e1[i] = np.abs(I1[i] - solu)
    e2[i] = np.abs(I2[i] - solu)
    e3[i] = np.abs(I3[i] - solu)

xmesh = np.arange(0, 1, .001)
y = np.zeros(len(xmesh))
for i in range(len(xmesh)):
    y[i] = func1(xmesh[i])

plt.plot(xmesh, y, label='Given Integrand')
plt.legend()
plt.grid(True)
plt.show()

plt.plot(np.log(n), np.log(e1), label='Trapezoidal')
plt.plot(np.log(n), np.log(e2), label='Simpsons 1/3')
plt.plot(np.log(n), np.log(e3), label='Simpsons 3/8')
plt.grid(True)
plt.xlabel('log of n')
plt.ylabel('log of error')
plt.legend()
plt.show()
from __future__ import division
from __future__ import absolute_import
from future import standard_library
standard_library.install_aliases()
from rlpy.Domains import PST
from rlpy.Agents import Greedy_GQ
from rlpy.Representations import *
from rlpy.Policies import eGreedy
from rlpy.Experiments import Experiment
import numpy as np
from hyperopt import hp

param_space = {
    # 'discretization': hp.quniform("discretization", 5, 50, 1),
    'boyan_N0': hp.loguniform("boyan_N0", np.log(1e1), np.log(1e5)),
    'initial_learn_rate': hp.loguniform("initial_learn_rate", np.log(5e-2), np.log(1))
}


def make_experiment(exp_id=1,
                    path="./Results/Temp/{domain}/{agent}/{representation}/",
                    lambda_=0.,
                    boyan_N0=3019.313,
                    initial_learn_rate=0.965830):
    opt = {}
    opt["path"] = path
    opt["exp_id"] = exp_id
    opt["max_steps"] = 500000
    opt["num_policy_checks"] = 30
def train(epoch, model):
    model = parallelize(model)
    model.train()

    total = 0
    correct = 0

    end = time.time()

    for i, (x, y) in enumerate(train_loader):
        global_itr = epoch * len(train_loader) + i
        update_lr(optimizer, global_itr)

        # Training procedure:
        # for each sample x:
        #   compute z = f(x)
        #   maximize log p(x) = log p(z) - log |det df/dx|

        x = x.to(device)

        beta = min(1, global_itr / args.annealing_iters) if args.annealing_iters > 0 else 1.
        bpd, logits, logpz, neg_delta_logp = compute_loss(x, model, beta=beta)

        if args.task in ['density', 'hybrid']:
            firmom, secmom = estimator_moments(model)

            bpd_meter.update(bpd.item())
            logpz_meter.update(logpz.item())
            deltalogp_meter.update(neg_delta_logp.item())
            firmom_meter.update(firmom)
            secmom_meter.update(secmom)

        if args.task in ['classification', 'hybrid']:
            y = y.to(device)
            crossent = criterion(logits, y)
            ce_meter.update(crossent.item())

            # Compute accuracy.
            _, predicted = logits.max(1)
            total += y.size(0)
            correct += predicted.eq(y).sum().item()

        # compute gradient and do SGD step
        if args.task == 'density':
            loss = bpd
        elif args.task == 'classification':
            loss = crossent
        else:
            if not args.scale_dim:
                bpd = bpd * (args.imagesize * args.imagesize * im_dim)
            loss = bpd + crossent / np.log(2)  # Change cross entropy from nats to bits.

        loss.backward()

        if global_itr % args.update_freq == args.update_freq - 1:
            if args.update_freq > 1:
                with torch.no_grad():
                    for p in model.parameters():
                        if p.grad is not None:
                            p.grad /= args.update_freq

            grad_norm = torch.nn.utils.clip_grad.clip_grad_norm_(model.parameters(), 1.)

            if args.learn_p:
                compute_p_grads(model)

            optimizer.step()
            optimizer.zero_grad()
            update_lipschitz(model)
            ema.apply()

            gnorm_meter.update(grad_norm)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            s = ('Epoch: [{0}][{1}/{2}] | Time {batch_time.val:.3f} | '
                 'GradNorm {gnorm_meter.avg:.2f}'.format(
                     epoch, i, len(train_loader), batch_time=batch_time,
                     gnorm_meter=gnorm_meter))

            if args.task in ['density', 'hybrid']:
                s += (' | Bits/dim {bpd_meter.val:.4f}({bpd_meter.avg:.4f}) | '
                      'Logpz {logpz_meter.avg:.0f} | '
                      '-DeltaLogp {deltalogp_meter.avg:.0f} | '
                      'EstMoment ({firmom_meter.avg:.0f},{secmom_meter.avg:.0f})'.format(
                          bpd_meter=bpd_meter, logpz_meter=logpz_meter,
                          deltalogp_meter=deltalogp_meter,
                          firmom_meter=firmom_meter, secmom_meter=secmom_meter))

            if args.task in ['classification', 'hybrid']:
                s += ' | CE {ce_meter.avg:.4f} | Acc {0:.4f}'.format(
                    100 * correct / total, ce_meter=ce_meter)

            logger.info(s)

        if i % args.vis_freq == 0:
            visualize(epoch, model, i, x)

        del x
        torch.cuda.empty_cache()
        gc.collect()
for i in range(len(names)):
    aa = y_pres[i] - array_train[:, 0]
    loss = np.dot(weights, np.multiply(aa, aa))
    losses.append(loss)
min_index = np.argmin(losses)

print('****************Update procedure****************')
iteration = 1000
w = []
index = [min_index]
for iters in range(iteration):
    epsilon = np.dot(weights,
                     np.abs(y_pres[index[-1]] - array_train[:, 0]) / 2)
    w.append(0.5 * np.log(1 / epsilon - 1) + 0.3)
    if w[-1] < 0.01:
        break
    temp = np.multiply(
        weights,
        np.exp(-w[-1] * np.multiply(y_pres[index[-1]], array_train[:, 0])))
    weights = temp / np.sum(temp)
    losses = []
    for i in range(len(names)):
        aa = y_pres[i] - array_train[:, 0]
        loss = np.dot(weights, np.multiply(aa, aa))
        losses.append(loss)
    index.append(np.argmin(losses))
    print('Iteration = ', iters + 1, ' Index = ', index[-2],
def main():
    # gt = sys.argv[1]
    lmin = -100  # int(sys.argv[2])
    lmax = 200   # int(sys.argv[3])
    fpaths = sys.argv[1:-1]
    out_pdf = sys.argv[-1]

    fig = plt.figure()
    ax1 = fig.add_subplot(211)
    ax2 = fig.add_subplot(212)  # ax1.twinx()

    tools = ['MALVA', 'GATK', 'BCFtools', 'discoSnp++']
    colors = ['red', 'green', 'blue', 'orange']
    i = 0
    for fpath in fpaths:
        tps = {}
        fps = {}
        tots = {}
        for line in open(fpath):
            if line[0:4] == 'chr,':
                continue
            chrom, _gt, l, tp, fp, tot = line.strip('\n').split(',')
            if tot == "0":
                # These are FPs, we don't need them
                continue
            # if _gt != gt:
            #     continue
            l = int(l)
            if lmin <= l <= lmax:
                tps[l] = tps[l] + int(tp) if l in tps else int(tp)
                fps[l] = fps[l] + int(fp) if l in fps else int(fp)
                tots[l] = tots[l] + int(tot) if l in tots else int(tot)
        tots_mod = {l: tots[l] + 1 for l in tots}

        ax1.scatter(sorted(tps.keys()),
                    [tps[l] / tots[l] for l in sorted(tps.keys())],
                    color=colors[i], label=tools[i],
                    linewidths=0.0001, alpha=0.75, s=23)
        if i == 0:
            ax2.bar(sorted(tps.keys()),
                    [np.log(tots_mod[l]) for l in sorted(tps.keys())],
                    color="grey")
        i += 1

    # plt.xticks(np.arange(min(Xs), max(Xs)+1, 25))
    ax1.legend(loc=4, bbox_to_anchor=(1, -0.17), ncol=4)
    # ax1.set_title("Recall on {} indels".format(gt))
    ax1.get_xaxis().set_visible(False)
    ax1.set_ylabel("Recall")
    ax2.set_xlabel("Indel length (#bp)")
    ax2.set_ylabel("#indels (log scale)")
    ax2.set_ylim(0, 12)

    # xlabel('Item (s)')
    # ylabel('Value')
    # title('Python Line Chart: Plotting numbers')
    # grid(True)
    plt.subplots_adjust(top=0.99, bottom=0.09, right=0.99, left=0.07)
    DPI = fig.get_dpi()
    fig.set_size_inches(1366.0 / float(DPI), 768.0 / float(DPI))
    fig.savefig(out_pdf, dpi=DPI)
from sklearn.model_selection import train_test_split
import numpy as np
import scipy as sp
import h5py

fname = "C:/Users/AlessioB/Desktop/REFTEP ANN/sub-1_band-mu_iplv.mat"
mat1 = h5py.File(fname)
fname = "C:/Users/AlessioB/Desktop/REFTEP ANN/sub-1_band-betalow_iplv.mat"
mat2 = h5py.File(fname)
fname = "C:/Users/AlessioB/Desktop/REFTEP ANN/sub-1_band-betahigh_iplv.mat"
mat3 = h5py.File(fname)

X = np.hstack((mat1['iPLV'].value[:, ::20],
               mat2['iPLV'].value[:, ::20],
               mat3['iPLV'].value[:, ::20]))

Y = mat1['AmpsMclean'].value
Y = np.log(Y.T)
# Y = sp.stats.zscore(Y)
# plt.hist(Y)
Y = Y[:, 0]
threshold = np.median(Y)
Y[Y < threshold] = 0
Y[Y >= threshold] = 1

X = X[:, np.std(X, 0) > 0]
X = np.log(np.abs(X) / (1 - np.abs(X)))
# X = sp.stats.zscore(X)

# pca = PCA(n_components=2)
# pca.fit(X.T)
# Xred = pca.components_.T
def func1(x):
    if (x == 0):
        return 1
    else:
        return ((np.log(1 + x)) / x)
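# The x == 0 branch exists because log(1 + x) / x has a removable singularity
# at zero with limit 1:
print(func1(0))       # 1
print(func1(1e-8))    # ~0.999999995
print(func1(1.0))     # log(2) ~ 0.6931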
    output = nb_test(testMatrix, state)
    return evaluate(output, testCategory)


trainMatrix, tokenlist, trainCategory = readMatrix('MATRIX.TRAIN')
testMatrix, tokenlist, testCategory = readMatrix('MATRIX.TEST')

state = nb_train(trainMatrix, trainCategory)
output = nb_test(testMatrix, state)

evaluate(output, testCategory)

# problem b
b = []
for i in range(1448):
    b.append((i, np.log(state[i][1]) - np.log(state[i][0])))
b.sort(key=lambda i: i[-1], reverse=True)
key = b[:5]
word = []
for i in key:
    word.append(tokenlist[i[0]])
print(word)

# problem c
size = ['.50', '.100', '.200', '.400', '.800', '.1400']
size1 = [50, 100, 200, 400, 800, 1400]
train = "MATRIX.TRAIN"
error = []
def limLog(self, x):
    # Floor the argument before taking the log to avoid log(0).
    # Note: 1e-1000 underflows to 0.0 in double precision, so this floor
    # is effectively zero as written.
    MINLOG = 1e-1000
    return np.log(np.maximum(x, MINLOG))
def LiftedCondensationLevelTemp(init_temp_k, dew_init_temp_k):
    if (init_temp_k < 100.):
        init_temp_k = init_temp_k + 273.15
    if (dew_init_temp_k < 100.):
        dew_init_temp_k = dew_init_temp_k + 273.15
    return (1. / (1. / (dew_init_temp_k - 56) +
                  log(init_temp_k / dew_init_temp_k) / 800.)) + 56
def build_rpn_targets(image_shape, anchors, gt_class_ids, gt_boxes, config):
    """Given the anchors and GT boxes, compute overlaps and identify positive
    anchors and deltas to refine them to match their corresponding GT boxes.

    anchors: [num_anchors, (y1, x1, y2, x2)]
    gt_class_ids: [num_gt_boxes] Integer class IDs.
    gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)]

    Returns:
    rpn_match: [N] (int32) matches between anchors and GT boxes.
               1 = positive anchor, -1 = negative anchor, 0 = neutral
    rpn_bbox: [N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
    """
    ## RPN Match: 1 = positive anchor, -1 = negative anchor, 0 = neutral
    rpn_match = np.zeros([anchors.shape[0]], dtype=np.int32)
    ## RPN bounding boxes: [max anchors per image, (dy, dx, log(dh), log(dw))]
    rpn_bbox = np.zeros((config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4))

    ## Handle COCO crowds
    ## A crowd box in COCO is a bounding box around several instances. Exclude
    ## them from training. A crowd box is given a negative class ID.
    no_crowd_bool = np.ones([anchors.shape[0]], dtype=bool)

    ## Compute overlaps [num_anchors, num_gt_boxes]
    overlaps = compute_overlaps(anchors, gt_boxes)

    ## Match anchors to GT Boxes
    ## If an anchor overlaps a GT box with IoU >= 0.7 then it's positive.
    ## If an anchor overlaps a GT box with IoU < 0.3 then it's negative.
    ## Neutral anchors are those that don't match the conditions above,
    ## and they don't influence the loss function.
    ## However, don't keep any GT box unmatched (rare, but happens). Instead,
    ## match it to the closest anchor (even if its max IoU is < 0.3).
    #
    ## 1. Set negative anchors first. They get overwritten below if a GT box is
    ## matched to them. Skip boxes in crowd areas.
    anchor_iou_argmax = np.argmax(overlaps, axis=1)
    anchor_iou_max = overlaps[np.arange(overlaps.shape[0]), anchor_iou_argmax]
    rpn_match[(anchor_iou_max < 0.3) & (no_crowd_bool)] = -1
    ## 2. Set an anchor for each GT box (regardless of IoU value).
    ## TODO: If multiple anchors have the same IoU match all of them
    gt_iou_argmax = np.argmax(overlaps, axis=0)
    rpn_match[gt_iou_argmax] = 1
    ## 3. Set anchors with high overlap as positive.
    rpn_match[anchor_iou_max >= 0.7] = 1

    ## Subsample to balance positive and negative anchors
    ## Don't let positives be more than half the anchors
    ids = np.where(rpn_match == 1)[0]
    extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE // 2)
    if extra > 0:
        ## Reset the extra ones to neutral
        ids = np.random.choice(ids, extra, replace=False)
        rpn_match[ids] = 0
    ## Same for negative proposals
    ids = np.where(rpn_match == -1)[0]
    extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE -
                        np.sum(rpn_match == 1))
    if extra > 0:
        ## Reset the extra ones to neutral
        ids = np.random.choice(ids, extra, replace=False)
        rpn_match[ids] = 0

    ## For positive anchors, compute shift and scale needed to transform them
    ## to match the corresponding GT boxes.
    ids = np.where(rpn_match == 1)[0]
    ix = 0  ## index into rpn_bbox
    ## TODO: use box_refinement() rather than duplicating the code here
    for i, a in zip(ids, anchors[ids]):
        ## Closest gt box (it might have IoU < 0.7)
        gt = gt_boxes[anchor_iou_argmax[i]]

        ## Convert coordinates to center plus width/height.
        ## GT Box
        gt_h = gt[2] - gt[0]
        gt_w = gt[3] - gt[1]
        gt_center_y = gt[0] + 0.5 * gt_h
        gt_center_x = gt[1] + 0.5 * gt_w
        ## Anchor
        a_h = a[2] - a[0]
        a_w = a[3] - a[1]
        a_center_y = a[0] + 0.5 * a_h
        a_center_x = a[1] + 0.5 * a_w

        ## Compute the bbox refinement that the RPN should predict.
        rpn_bbox[ix] = [
            (gt_center_y - a_center_y) / a_h,
            (gt_center_x - a_center_x) / a_w,
            np.log(gt_h / a_h),
            np.log(gt_w / a_w),
        ]
        ## Normalize
        rpn_bbox[ix] /= config.RPN_BBOX_STD_DEV
        ix += 1

    return rpn_match, rpn_bbox
            step_all, probabilities = random_neighbour_avoiding(random_walks)
        else:
            step_all, probabilities = random_neighbour(random_walks[step])
        random_walks.append(step_all)
        weights.append(
            generate_weights(random_walks, weights[step], probabilities))
        if do_resample:
            random_walks, weights[-1] = resample(random_walks, weights[-1])
        print("-", end="")
    return random_walks, weights


walks, weights = init_walk(10, 20, 10**2, self_avoiding=True, do_resample=True)
print("")

cn = np.mean(weights, axis=1)
length = len(cn)
y = np.log(cn).T
X = np.hstack((np.ones(length),
               np.arange(length) + 1,
               np.log(np.arange(length) + 1))).reshape(3, length).T
theta = np.linalg.inv(X.T @ X) @ X.T @ y
print(np.e**theta)

# %%
# Here we confirm that all of the transactions in `train_identity`
print(np.sum(train_transaction.index.isin(train_identity.index.unique())))
print(np.sum(test_transaction.index.isin(test_identity.index.unique())))

train_transaction['TransactionDT'].head()

train_transaction['TransactionDT'].shape[0], train_transaction['TransactionDT'].nunique()

train_transaction['TransactionDT'].value_counts().head(10)

fig, ax = plt.subplots(1, 2, figsize=(18, 4))

time_val = train_transaction['TransactionDT'].values

sns.distplot(time_val, ax=ax[0], color='r')
ax[0].set_title('Distribution of TransactionDT', fontsize=14)
ax[1].set_xlim([min(time_val), max(time_val)])

sns.distplot(np.log(time_val), ax=ax[1], color='b')
ax[1].set_title('Distribution of LOG TransactionDT', fontsize=14)
ax[1].set_xlim([min(np.log(time_val)), max(np.log(time_val))])

plt.show()

fig, ax = plt.subplots(1, 2, figsize=(18, 4))

time_val = train_transaction.loc[train_transaction['isFraud'] == 1]['TransactionDT'].values

sns.distplot(np.log(time_val), ax=ax[0], color='r')
ax[0].set_title('Distribution of LOG TransactionDT, isFraud=1', fontsize=14)
ax[1].set_xlim([min(np.log(time_val)), max(np.log(time_val))])

time_val = train_transaction.loc[train_transaction['isFraud'] == 0]['TransactionDT'].values
def num_steps(self):
    if self._num_steps is None:
        return 2 * int(np.round(16.0 / np.log(np.abs(self.step_ratio)))) + 1
    return self._num_steps
            sent_vec.append(1)
        else:
            sent_vec.append(0)
    sentence_vectors.append(sent_vec)
sentence_vectors = np.asarray(sentence_vectors)

# tf-idf; N = total number of documents
N = 1000
feture = np.zeros([1000, 200])
df = np.sum(sentence_vectors, axis=0)
for i in range(1000):
    for j in range(200):
        # print(np.log(N/df[j]))
        feture[i, j] = (float)(sentence_vectors[i, j] * np.log(N / df[j]))


def pca(data, k):
    cov_data = np.cov(np.transpose(data))
    eig_val, eig_vector = np.linalg.eig(cov_data)

    def eigen_sort(value, vector):
        idx = value.argsort()[::-1]
        eigenValues = value[idx]
        eigenVectors = vector[:, idx]
        return (eigenValues, eigenVectors)

    eig_vals, eig_vectors = eigen_sort(eig_val, eig_vector)

    def final_projection(eigen_matrix, x, k):
def cross_entropy_loss(self, y, y_pred, l2_penalty):
    cross_entropy = (np.sum(np.square(self.ip_to_hl.w)) +
                     np.sum(np.square(self.hl_to_op.w))) * (l2_penalty / (2 * len(y))) + \
                    (np.dot(np.transpose(y), np.log(y_pred + 1e-12)) +
                     np.dot((1 - np.transpose(y)), np.log(1 - y_pred + 1e-12)))
    # print(cumulative)
    avg_batch_cross_entropy_loss = (-1.0) * np.sum(cross_entropy) / len(y)
    return avg_batch_cross_entropy_loss
def __load_dataset__(self, genome_scores, genome_tags, tags_applies, movies_df):
    tags_applies[TAG] = tags_applies[TAG].str.lower()
    values_with_pop = []
    for tid, t in genome_tags.as_matrix():
        tags_a = tags_applies.loc[tags_applies[TAG] == t.lower()]
        tags_total = np.array(tags_a[MOVIE_ID])
        values_with_pop.append([tid, t, len(tags_total)])
        self.logger.info(
            "Tag popularity for tag {} is: {}".format(t, len(tags_total))
        )
        if len(tags_total) == 0:
            self.logger.info((tid, t))
            self.logger.info(
                "Tag popularity for tag {} is: {}".format(t, len(tags_total))
            )
            values_with_pop.append([tid, t, 2])
    cols = list(genome_tags.columns)
    cols.append(TAG_POPULARITY)
    genome_tags = pd.DataFrame(values_with_pop, columns=cols)

    doc_freq = []
    for i, tag in enumerate(genome_tags.values):
        df = genome_scores.loc[genome_scores["tagId"] == tag[0]]
        freq = np.sum(np.array(df["relevance"]) > 0.5)
        if freq == 0 or freq == 1:
            freq = 2
            self.logger.info("Document frequency for tag {} is: zero".format(tag))
        doc_freq.append(freq)
    genome_tags[DOC_FREQUENCY] = doc_freq
    genome_tags.to_csv(self.tags_info_file, index=False)
    self.logger.info(
        "Done loading the tag popularity and doc frequency for the tags"
    )
    self.weights = np.log(np.array(genome_tags[TAG_POPULARITY])) / np.log(
        np.array(genome_tags[DOC_FREQUENCY])
    )

    objects = []
    tags_rel = genome_scores.as_matrix()
    movie_ids = np.unique(np.array(genome_scores[MOVIE_ID]))
    for i, movie_id in enumerate(movie_ids):
        a = i * self.n_features
        b = (i + 1) * self.n_features
        objects.append(tags_rel[a:b, 2])
    objects = np.array(objects)
    movies_df = movies_df[movies_df[MOVIE_ID].isin(movie_ids)]
    for i, tag in enumerate(genome_tags.values):
        movies_df[tag[1]] = objects[:, i]
    movies_df.to_csv(self.movies_file, index=False)
    self.logger.info("Done loading the features for the movies")

    num_of_movies = movie_ids.shape[0]
    combinations_list = np.array(list(combinations(range(num_of_movies), 2)))
    similarity_matrix = dict()
    features = movies_df.as_matrix()[:, 3:]
    for i, j in combinations_list:
        similarity_matrix[get_key_for_indices(i, j)] = weighted_cosine_similarity(
            self.weights
        )(features[i], features[j])
        self.logger.info(
            "Calculating similarity {},{}, {}".format(
                i, j, similarity_matrix[get_key_for_indices(i, j)]
            )
        )
    for i in range(num_of_movies):
        similarity_matrix[get_key_for_indices(i, i)] = 1.0
    series = pd.Series(similarity_matrix)
    matrix_df = pd.DataFrame(
        {"col_major_index": series.index, "similarity": series.values}
    )
    matrix_df.to_csv(self.similarity_matrix_file, index=False)
    self.logger.info(
        "Done calculating the similarity matrix stored at: {}".format(
            self.similarity_matrix_file
        )
    )
def multimeter_error(value, scale, multimeter_type, measure_type,
                     ignore_gain=False, ignore_digit=False):
    """
    value is the value measured by the multimeter
    scale is the "end of scale" given by the multimeter, or the v/div on the oscilloscope
    multimeter_type is:
        'a' for agilent
        'm' for metrix
        'o' for oscilloscope
    measure_type is:
        'a' for current
        'v' for tension
        'ohm' for resistance
        'c' for capacity
        's' for time
    Returns the error of the measure.
    """
    if (multimeter_type == 'a'):
        if (measure_type == 'a'):
            scale_array = np.array([6 * 10**(-5), 6 * 10**(-4), 6, 10])
            resolution_array = float(10)**(np.array([-8, -7, -3, -2]))
            percent_accuracy = np.array([1, 1, 1, 1])
            digit_accuracy = np.array([2, 2, 3, 3])
        elif (measure_type == 'v'):
            scale_array = 6 * np.array([10**(-1), 1, 10, 100])
            resolution_array = float(10)**(np.array([-4, -3, -2, -1]))
            percent_accuracy = 0.5 * np.array([1, 1, 1, 1])
            digit_accuracy = 2 * np.array([1, 1, 1, 1])
        elif (measure_type == 'ohm'):
            scale_array = 6 * float(10)**(np.array([2, 3, 4, 5, 6, 7]))
            resolution_array = float(10)**(np.array([2, 3, 4, 5, 6, 7]) - 3)
            percent_accuracy = np.array([0.9, 0.9, 0.9, 0.9, 0.9, 1.5])
            digit_accuracy = 3 * np.array([1, 1, 1, 1, 1, 1])
        else:
            print(f'{measure_type} is not a valid measure type')
            return (None)
    elif (multimeter_type == 'm'):
        if (measure_type == 'v'):
            scale_array = np.array([1, 10, 100, 1000])
            resolution_array = float(10)**(np.array([-5, -4, -3, -2]))
            percent_accuracy = np.array([0.05, 0.03, 0.03, 0.035])
            digit_accuracy = 8 * np.array([1, 1, 1, 1])
        elif (measure_type == 'a'):
            scale_array = float(10)**(np.array([-3, -2, -1, 0, 1]))
            resolution_array = float(10)**(np.array([-3, -2, -1, 0, 1]) - 5)
            percent_accuracy = np.array([0.1, 0.08, 0.08, 0.15, 0.5])
            digit_accuracy = np.array([15, 8, 8, 8, 15])
        elif (measure_type == 'ohm'):
            scale_array = float(10)**(np.array([3, 4, 5, 6, 7, 8]))
            resolution_array = float(10)**(np.array([3, 4, 5, 6, 7, 8]) - 5)
            percent_accuracy = np.array([0.1, 0.07, 0.07, 0.07, 1, 3])
            digit_accuracy = np.array([8, 8, 8, 8, 80, 80])
        elif (measure_type == 'c'):
            scale_array = float(10)**(np.array([-9, -8, -7, -6, -5, -4, -3, -2]))
            resolution_array = float(10)**(np.array([-9, -8, -7, -6, -5, -4, -3, -2]) - 3)
            percent_accuracy = np.array([2.5, 1, 1, 1, 1, 1, 1, 1.5])
            digit_accuracy = np.array([15, 8, 8, 10, 10, 10, 15, 15])
        else:
            print(f'{measure_type} is not a valid measure type')
            return (None)
    elif (multimeter_type == 'o'):
        if (measure_type == 'v'):
            mag_ord = float(10)**(np.arange(-9, 2))
            scale_array = np.concatenate((mag_ord, 2 * mag_ord, 5 * mag_ord))
            resolution_array = scale_array / 10
            percent_accuracy = 0.01 * np.ones(len(scale_array))
            digit_accuracy = np.ones(len(scale_array))
        elif (measure_type == 's'):
            scale_array = float(10)**(np.arange(-9, 2))
            resolution_array = scale_array / 10
            percent_accuracy = 3 * np.ones(len(scale_array))
            digit_accuracy = np.ones(len(scale_array))
        else:
            print(f'{measure_type} is not a valid measure type')
            return (None)
    else:
        print(f'{multimeter_type} is not a valid multimeter type')
        return (None)

    if (not (len(scale_array) == len(resolution_array) ==
             len(percent_accuracy) == len(digit_accuracy))):
        print('Error in hardcoded values')
        return (None)

    tolerance = 0.01
    index = -1
    for i in range(len(scale_array)):
        if (np.abs(np.log(scale_array[i]) - np.log(scale)) < tolerance):
            index = i
    if (index == -1):
        print(f'{scale} is an invalid scale')
        return (None)

    distribution_factor = 1 / np.sqrt(3)
    if (ignore_gain == False and ignore_digit == False):
        error = np.sqrt((percent_accuracy[index] * value / 100)**2 +
                        (digit_accuracy[index] * resolution_array[index])**2) * distribution_factor
    elif (ignore_gain == True and ignore_digit == False):
        error = digit_accuracy[index] * resolution_array[index] * distribution_factor
    elif (ignore_gain == False and ignore_digit == True):
        error = percent_accuracy[index] * value / 100 * distribution_factor
    else:
        print('Cannot ignore both errors')
        return (None)
    return (error)
def interpolated_broadening(sigma=None, points=None, bins=None,
                            center=None, weights=1, is_hist=False, limit=3,
                            function='gaussian', spacing='sqrt2'):
    """Return a fast estimate of frequency-dependent broadening

    Consider a spectrum of two peaks, in the case where (as in
    indirect-geometry INS) the peak width increases with frequency.

       *        |
    -----------------

    In the traditional scheme we broaden each peak individually and combine:

       *        |
       *  *     |      +      |     *      =      *  *
      *  *      |             |    *  *          *  *  *  *
    -----------------    -----------------    -----------------

    Instead of summing over broadening kernels evaluated at each peak, the
    approximation obtains a spectrum corresponding to the following scheme:

    - For each sigma value, the entire spectrum is convolved with an
      appropriate-width broadening function
    - At each frequency, the final spectrum value is drawn from the spectrum
      broadened with corresponding sigma.

    Compared to a summation over broadened peaks, this method introduces an
    asymmetry to the spectrum about each peak.

        *                  *                   *                  *
       * *                * *        -->      **                 **
      * * *              * * *               *  *               *  *
    ----------------- , ----------------- , ----------------- -----------------

    This asymmetry should be tolerable as long as the broadening function
    varies slowly in frequency relative to its width.

    The benefit of this scheme is that we do not need to evaluate the
    convolution at every sigma value; nearby spectra can be interpolated.
    Trial-and-error finds that with optimal mixing the error of a Gaussian
    approximated by mixing a wider and narrower Gaussian is ~ 5% when the
    sigma range is factor of 2, and ~ 1% when the sigma range is a factor of
    sqrt(2). A pre-optimised transfer function can be used for a fixed ratio
    between the reference functions.

    :param sigma: widths of broadening functions (passed to "sigma" argument of function)
    :type sigma: float or Nx1 array
    :param bins: sample bins for function evaluation. This _must_ be evenly-spaced.
    :type bins: 1-D array
    :param points: regular grid of points for which function should be evaluated.
    :type points: 1-D array
    :param center: centers of broadening functions
    :type center: float or Nx1 array
    :param weights: weights of peaks for summation
    :type weights: float or array corresponding to "center"
    :param is_hist:
        If "weights" is already a histogram corresponding to evenly-spaced
        frequencies, set this to True to avoid a redundant binning operation.
    :type is_hist: bool
    :param function: broadening function; currently only 'gaussian' is accepted
    :type function: str
    :param limit: range (as multiple of sigma) for cutoff
    :type limit: float
    :param spacing:
        Spacing factor between Gaussian samples on log scale. This is not a
        free parameter as a pre-computed curve is used for interpolation.
        Allowed values: '2', 'sqrt2', with error ~5% and ~1% respectively.
    :type spacing: str

    :returns: (points, spectrum)
    :returntype: (1D array, 1D array)
    """
    mix_functions = {'gaussian': {'2': {'lower': [-0.1873, 1.464, -4.079, 3.803],
                                        'upper': [0.2638, -1.968, 5.057, -3.353]},
                                  'sqrt2': {'lower': [-0.6079, 4.101, -9.632, 7.139],
                                            'upper': [0.7533, -4.882, 10.87, -6.746]}}}
    log_bases = {'2': 2, 'sqrt2': np.sqrt(2)}
    log_base = log_bases[spacing]

    # Sample on appropriate log scale: log_b(x) = log(x) / log(b)
    n_kernels = int(np.ceil(np.log(max(sigma) / min(sigma)) / np.log(log_base))) + 1

    if n_kernels == 1:  # Special case: same width everywhere, only need one kernel
        sigma_samples = np.array([min(sigma)])
    else:
        sigma_samples = log_base**np.arange(n_kernels) * min(sigma)

    bin_width = bins[1] - bins[0]

    # Get set of convolved spectra for interpolation
    if is_hist:
        hist = weights
    else:
        hist, _ = np.histogram(center, bins=bins, weights=weights, density=False)
    freq_range = 3 * max(sigma)
    kernel_npts_oneside = np.ceil(freq_range / bin_width)

    if function == 'gaussian':
        kernels = mesh_gaussian(sigma=sigma_samples[:, np.newaxis],
                                points=np.arange(-kernel_npts_oneside,
                                                 kernel_npts_oneside + 1, 1) * bin_width,
                                center=0)
    else:
        raise ValueError('"{}" kernel not supported for "interpolate" broadening method.'.format(function))

    spectra = np.array([convolve(hist, kernel, mode='same') for kernel in kernels])

    # Interpolate with parametrised relationship
    sigma_locations = np.searchsorted(sigma_samples, sigma)  # locations in sampled values of points from sigma
    spectrum = np.zeros_like(points)
    # Samples with sigma == min(sigma) are a special case: copy directly from spectrum
    spectrum[sigma_locations == 0] = spectra[0, sigma_locations == 0]

    for i in range(1, len(sigma_samples)):
        masked_block = (sigma_locations == i)
        sigma_factors = sigma[masked_block] / sigma_samples[i - 1]
        lower_mix = np.polyval(mix_functions[function][spacing]['lower'], sigma_factors)
        upper_mix = np.polyval(mix_functions[function][spacing]['upper'], sigma_factors)

        spectrum[masked_block] = (lower_mix * spectra[i - 1, masked_block]
                                  + upper_mix * spectra[i, masked_block])

    return points, spectrum
def get_network_numpool(patch_size, maxpool_cap=999, min_feature_map_size=4):
    network_numpool_per_axis = np.floor(
        [np.log(i / min_feature_map_size) / np.log(2) for i in patch_size]).astype(int)
    network_numpool_per_axis = [min(i, maxpool_cap) for i in network_numpool_per_axis]
    return network_numpool_per_axis
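# Worked example: a 128x128x32 patch with the default min_feature_map_size=4
# allows floor(log2(128/4)) = 5 poolings in-plane and floor(log2(32/4)) = 3
# along the last axis (maxpool_cap leaves these values untouched).
print(get_network_numpool([128, 128, 32]))   # [5, 5, 3]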
def bubble_plot(df, x, y, ordered_x_values=None, ordered_y_values=None,
                bins_x=10, bins_y=10, fontsize=16, figsize=(15, 10),
                maximal_bubble_size=5000, normalization_by_all=False, log=False):
    """
    :param df: dataframe
    :param x: name of first numerical/categorical field (string) (for x-axis)
    :param y: name of second numerical/categorical field (string) (for y-axis)
    :param ordered_x_values: the values we would like to map from the x categorical
        variable according to the order we would like to present them
    :param ordered_y_values: the values we would like to map from the y categorical
        variable according to the order we would like to present them
    :param bins_x: the bins for x values if x is numeric
    :param bins_y: the bins for y values if y is numeric
    :param normalization_by_all: True - shows joint distribution p(x,y),
        False - shows conditional distribution p(y|x)
    :param maximal_bubble_size: if the bubbles are too big or too small this is
        the parameter you should change!
    :param log: whether to apply log on the count (influences the size of the bubbles)
    :return: nice bubble plot :)
    """
    plt.figure(figsize=figsize)
    x_is_numeric = df[x].dtype in (float, int)
    y_is_numeric = df[y].dtype in (float, int)
    count_table = pd.concat([
        pd.cut(df[x], bins=bins_x) if x_is_numeric else df[x],
        pd.cut(df[y], bins=bins_y) if y_is_numeric else df[y]
    ], axis=1)
    count_table = count_table.groupby(x)[y].value_counts().unstack().fillna(0)
    ordered_x_values = count_table.index.values if ordered_x_values is None else ordered_x_values
    ordered_y_values = count_table.columns if ordered_y_values is None else ordered_y_values
    if normalization_by_all:
        count_table /= count_table.sum().sum()
    else:
        for col in count_table.columns:
            count_table[col] /= count_table[col].sum()
    if log:
        count_table = np.log(count_table)
        maximal_bubble_size /= 2
    size_factor = maximal_bubble_size / count_table.max().max()
    count_table_long = pd.melt(count_table.reset_index(), id_vars=x)
    x_values_dict = {x: i for i, x in enumerate(ordered_x_values)} \
        if not x_is_numeric else {xx: get_point(xx) for xx in ordered_x_values}
    y_values_dict = {x: i for i, x in enumerate(ordered_y_values)} \
        if not y_is_numeric else {xx: get_point(xx) for xx in ordered_y_values}
    count_table_long[x] = count_table_long[x].map(x_values_dict)
    count_table_long[y] = count_table_long[y].map(y_values_dict)
    xticks = np.arange(count_table.shape[0]) if not x_is_numeric else [
        get_point(xx) for xx in ordered_x_values
    ]
    yticks = np.arange(count_table.shape[1]) if not y_is_numeric else [
        get_point(xx) for xx in ordered_y_values
    ]
    xticklabels = ordered_x_values if not x_is_numeric else [
        get_point(xx) for xx in ordered_x_values
    ]
    yticklabels = ordered_y_values if not y_is_numeric else [
        get_point(xx) for xx in ordered_y_values
    ]
    plt.scatter(count_table_long[x], count_table_long[y],
                s=size_factor * count_table_long['value'],
                c=count_table_long['value'], cmap='cool')
    plt.xticks(xticks, xticklabels, fontsize=fontsize)
    plt.yticks(yticks, yticklabels, fontsize=fontsize)
    plt.xlabel(x, fontsize=fontsize)
    plt.ylabel(y, fontsize=fontsize)
    plt.title("{} vs {} ".format(y, x), fontsize=fontsize + 4)
def trading(stock_1, stock_2, train=60, trade=1, delta=1/252, interest=0.02):
    ts_1 = np.asarray(test_df.iloc[stock_1, :])
    ts_2 = np.asarray(test_df.iloc[stock_2, :])
    price_1 = np.asarray(test_price_df.iloc[stock_1, :])
    price_2 = np.asarray(test_price_df.iloc[stock_2, :])
    t = train
    initial_wealth = 1.
    duration = len(ts_1) - train + 1
    q_stock_1 = np.zeros(duration)
    q_stock_2 = np.zeros(duration)
    wealth = np.full(duration, initial_wealth)
    bank = wealth
    # print(type(bank[0]))
    cash = initial_wealth
    kappa = None
    while (t + trade < len(ts_1)):
        train_ts_1 = ts_1[t - train:t]
        train_ts_2 = ts_2[t - train:t]
        LR_model = LinearRegression().fit(train_ts_2.reshape(-1, 1), train_ts_1)
        beta = LR_model.coef_
        train_res = train_ts_1 - (beta) * train_ts_2
        model = statsmodels.tsa.api.ARMA(train_res, order=(1, 0)).fit(disp=False)
        a, b = model.params
        xi = model.resid
        # kappa = (-b+1)/delta
        # mean = a/(kappa*delta)
        # sigmaeq = np.sqrt(np.var(xi)/delta)
        previous_kappa = kappa
        if b > 0:
            kappa = -np.log(b) / delta
        else:
            if previous_kappa is None:
                kappa = 10e5
        # print(kappa)
        mean = a / (1 - b)
        sigmasq = np.var(xi) * 2 * kappa / (1 - b**2)
        sigmaeq = np.sqrt(sigmasq / (2 * kappa))
        for i in range(trade):
            if t + i > train:
                q_stock_1[t + i - train] = q_stock_1[t + i - 1 - train]
                q_stock_2[t + i - train] = q_stock_2[t + i - 1 - train]
                cash = bank[t + i - 1 - train] * ((1 + 0.02 / 252)**(1 / 252))
            signal = ((ts_1[t + i] - (beta) * ts_2[t + i]) - mean) / sigmaeq
            if signal > ssen:
                if q_stock_1[t + i - train] == 0:
                    q_stock_1[t + i - train] -= 1
                    q_stock_2[t + i - train] += beta
                    cash = cash + price_1[t + i] - beta * price_2[t + i]
            elif (signal < ssex and signal > slex):
                cash = cash + q_stock_1[t + i - train] * price_1[t + i] + \
                    q_stock_2[t + i - train] * price_2[t + i]
                q_stock_1[t + i - train] = 0
                q_stock_2[t + i - train] = 0
            elif (signal < slen):
                if q_stock_1[t + i - train] == 0:
                    q_stock_1[t + i - train] += 1
                    q_stock_2[t + i - train] -= beta
                    cash = cash - price_1[t + i] + beta * price_2[t + i]
            bank[t + i - train] = cash
            wealth[t + i - train] = cash + q_stock_1[t + i - train] * price_1[t + i] + \
                q_stock_2[t + i - train] * price_2[t + i]
        t = t + trade
    return wealth, q_stock_1, train_res
def compute_entropy(self, x):
    H = 0
    for i in range(len(x)):
        H += (x[i] * np.log(x[i]))
    return H
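# Note: as written this returns sum(x * log(x)), i.e. the *negative* of the
# Shannon entropy in nats. For a uniform distribution over two outcomes,
#   self.compute_entropy([0.5, 0.5]) = 2 * 0.5 * log(0.5) = -log(2) ~ -0.693,
# so callers presumably negate the result where the usual sign is needed.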
# plt.show()

# fetch scattering kernel
sigma = getKernel(__screenfile__)

# sample positions
t1 = np.tile(np.arange(m.nx), m.nx)
t0 = np.repeat(np.arange(m.nx), m.nx, axis=0)
t = np.hstack([t0[:, np.newaxis], t1[:, np.newaxis]])

# fit
w = m.source.sum() / np.sum(noise**2 * m.source)
tic = time.time()
# initial = np.array([np.log(1./w), np.log(0.5), ftot, fwhm])
initial = np.array([np.log(1. / w), np.log(2), ftot, fwhm])
res = minimize(lnprob, initial,
               args=(m, mNoisy, w, sigma, t),
               method='Nelder-Mead',
               options={'disp': True, 'maxiter': int(1000)})
print 'optimization took %0.2fs' % (time.time() - tic)
print 'result:', res.x

# tic = time.time()
# args = (m,mNoisy,w,sigma,t)
# f = lnprob(initial,*args)
# print '1 exec took %0.2f' % (time.time() - tic)

# best fit noise model
# p = initial
def interpolate(self, S_0, S_1, S_2, interpolation='gaussian'):
    """
    Use interpolation to refine an FFT frequency estimate.

    .. image:: /_static/interpolation_diagram.png
      :align: center
      :alt: Interpolation diagram

    For an FFT bin spacing of :math:`\delta f`, the input frequency is
    estimated as:

    .. math:: f_{in} \\approx \delta f (k + \Delta)

    Where :math:`k` is the FFT bin with the maximum magnitude and
    :math:`\Delta \in [-\\frac{1}{2}, \\frac{1}{2}]` is a correction found by
    interpolation.

    **Parabolic interpolation:**

    .. math:: \Delta = \\frac{1}{2} \\frac{S[k+1] - S[k-1]}{2S[k] - S[k-1] - S[k+1]}

    Where :math:`S[n]` is the magnitude of FFT bin :math:`n`.

    **Gaussian interpolation:**

    .. math:: \Delta = \\frac{1}{2} \\frac{\ln(S[k+1]) - \ln(S[k-1])}{2\ln(S[k]) - \ln(S[k-1]) - \ln(S[k+1])}

    The Gaussian interpolation method gives better results, especially when
    used with a Gaussian window function, at the expense of computational
    complexity. See [1]_ for detailed comparison.

    Parameters
    ----------
    S_0 : float
      :math:`S[k-1]`, i.e. the magnitude of FFT bin one before the maxima.
    S_1 : float
      :math:`S[k]` i.e. the magnitude of the maximum FFT.
    S_2 : float
      :math:`S[k+1]`, i.e. the magnitude of FFT bin one after the maxima.

    Returns
    -------
    out : float
      The fractional number of FFT bins :math:`\Delta` that the interpolated
      maximum is from the maximum point :math:`S[k]`.

    References
    ----------
    .. [1] Gasior, M. et al., "Improving FFT frequency measurement resolution
       by parabolic and Gaussian spectrum interpolation" AIP Conf.Proc. 732
       (2004) 276-285 `CERN-AB-2004-023-BDI <http://cdsweb.cern.ch/record/738182>`_

    """
    if interpolation == 'parabolic':
        # Parabolic interpolation.
        return 0.5 * (S_2 - S_0) / (2 * S_1 - S_0 - S_2)
    elif interpolation == 'gaussian':
        # Gaussian interpolation.
        ln_S_0 = np.log(S_0)
        ln_S_1 = np.log(S_1)
        ln_S_2 = np.log(S_2)
        return 0.5 * (ln_S_2 - ln_S_0) / (2 * ln_S_1 - ln_S_0 - ln_S_2)
    elif interpolation == 'none':
        return 0
    else:
        raise ValueError("Unknown interpolation mode '%s'", interpolation)
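# Usage sketch (`estimator` stands for a hypothetical instance of the class
# defining this method): with a peak slightly above bin k the correction Delta
# is small and positive under both schemes.
#   S_0, S_1, S_2 = 0.4, 1.0, 0.5
#   estimator.interpolate(S_0, S_1, S_2, interpolation='parabolic')  # ~0.045
#   estimator.interpolate(S_0, S_1, S_2, interpolation='gaussian')   # ~0.069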
# ### Score features
#
# There are several ways to score features:
# - Count how many of the recorded null importances fall below the actual importance, i.e. how far the actual importance sits in the tail of the null distribution (a sketch of this score follows this cell).
# - Compute ratios like Actual / Null Max, Actual / Null Mean, Actual Mean / Null Max
#
# As a first step I will use the log of the actual feature importance divided by the 75th percentile of the null distribution.

# In[13]:

feature_scores = []
for _f in actual_imp_df['feature'].unique():
    f_null_imps_gain = null_imp_df.loc[null_imp_df['feature'] == _f, 'importance_gain'].values
    f_act_imps_gain = actual_imp_df.loc[actual_imp_df['feature'] == _f, 'importance_gain'].mean()
    gain_score = np.log(1e-10 + f_act_imps_gain / (1 + np.percentile(f_null_imps_gain, 75)))  # Avoid divide by zero
    f_null_imps_split = null_imp_df.loc[null_imp_df['feature'] == _f, 'importance_split'].values
    f_act_imps_split = actual_imp_df.loc[actual_imp_df['feature'] == _f, 'importance_split'].mean()
    split_score = np.log(1e-10 + f_act_imps_split / (1 + np.percentile(f_null_imps_split, 75)))  # Avoid divide by zero
    feature_scores.append((_f, split_score, gain_score))

scores_df = pd.DataFrame(feature_scores, columns=['feature', 'split_score', 'gain_score'])

plt.figure(figsize=(16, 16))
gs = gridspec.GridSpec(1, 2)
# Plot Split importances
ax = plt.subplot(gs[0, 0])
sns.barplot(x='split_score', y='feature', data=scores_df.sort_values('split_score', ascending=False).iloc[0:70], ax=ax)
ax.set_title('Feature scores wrt split importances', fontweight='bold', fontsize=14)
# Plot Gain importances
ax = plt.subplot(gs[0, 1])
sns.barplot(x='gain_score', y='feature', data=scores_df.sort_values('gain_score', ascending=False).iloc[0:70], ax=ax)
ax.set_title('Feature scores wrt gain importances', fontweight='bold', fontsize=14)
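# Illustrative sketch of the count-based score from the first bullet above
# (not implemented in the notebook at this point); it re-uses the
# actual_imp_df / null_imp_df frames built earlier.
count_scores = []
for _f in actual_imp_df['feature'].unique():
    f_null = null_imp_df.loc[null_imp_df['feature'] == _f, 'importance_gain'].values
    f_act = actual_imp_df.loc[actual_imp_df['feature'] == _f, 'importance_gain'].mean()
    # Percentage of null importances strictly below the actual importance:
    # 100 means the actual importance beats every permutation run.
    count_scores.append((_f, 100.0 * (f_null < f_act).sum() / len(f_null)))
count_scores_df = pd.DataFrame(count_scores, columns=['feature', 'pct_null_below_actual'])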
fig = plt.figure(figsize=(width, height)) xlabels = size**2; plt.loglog(xlabels, timing_LowFreq, label='Solve', color='b', linewidth=2, linestyle='--', marker='.', markersize=8.0, zorder=2) #plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0)) # plt.plt.loglog(xlabels, timing_LowFreq, label='LowFrequency', color='b', linewidth=2, linestyle='--', marker='.', markersize=8.0, zorder=2) plt.loglog(size**2, timing_LowFact, label='Setup', color='g', linewidth=2, linestyle='--', marker='o', markersize=8.0, zorder=2) # plt.loglog(size**2, timing_gauss, label='Gaussian bumps', color='g', linewidth=2, linestyle='--', marker='.', markersize=8.0, zorder=2) plt.loglog(xlabels, (xlabels*np.log(xlabels)**4/(xlabels[0]*np.log(xlabels[0])**4))*timing_LowFreq[0]*1.05, label=r'$\mathcal{O}(N \log^3{N})$', color='k', linewidth=2, linestyle='solid', markersize=8.0, zorder=2) # #plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0)) plt.loglog(xlabels, (xlabels*np.log(xlabels)/(xlabels[0]*np.log(xlabels[0])))*timing_LowFact[0]*1.05, label=r'$\mathcal{O}(N \log{N})$', color='r', linewidth=2, linestyle='solid', markersize=8.0, zorder=2) # # plt.loglog(N_x**2, N_x**2 / 4.0e4, label=r' ', color='white', linewidth=0.0) plt.legend(loc=2, ncol=1, frameon=False, fontsize=14.85) # plt.title('Normalized run-time for inner loop') plt.xlabel(r'$N=n^2$', fontsize=18) plt.ylabel('Time [s]', fontsize=18) plt.gca().tick_params(labelsize=14) plt.autoscale(True, 'both', True)
def _log_sigmoid(self, x): return np.log(self._sigmoid(x))
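# Numerically stable alternative (standalone sketch; the function name is
# mine): log(sigmoid(x)) = -log(1 + exp(-x)) = -logaddexp(0, -x), which avoids
# the -inf that np.log(self._sigmoid(x)) returns once the sigmoid underflows
# to 0 for very negative x.
import numpy as np

def log_sigmoid_stable(x):
    return -np.logaddexp(0.0, -np.asarray(x, dtype=float))

print(log_sigmoid_stable(-1000.0))  # -1000.0, while log(sigmoid(-1000.0)) underflows to -inf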
def softmax_loss_naive(W, X, y, reg): """ Softmax loss function, naive implementation (with loops) Inputs have dimension D, there are C classes, and we operate on minibatches of N examples. Inputs: - W: A numpy array of shape (D, C) containing weights. - X: A numpy array of shape (N, D) containing a minibatch of data. - y: A numpy array of shape (N,) containing training labels; y[i] = c means that X[i] has label c, where 0 <= c < C. - reg: (float) regularization strength Returns a tuple of: - loss as single float - gradient with respect to weights W; an array of same shape as W """ # Initialize the loss and gradient to zero. loss = 0.0 dW = np.zeros_like(W) N = X.shape[0] C = W.shape[1] y_pred = X.dot(W) ############################################################################# # TODO: Compute the softmax loss and its gradient using explicit loops. # # Store the loss in loss and the gradient in dW. If you are not careful # # here, it is easy to run into numeric instability. Don't forget the # # regularization! # ############################################################################# for i in range(N): current_scores = y_pred[i, :] # Fix for numerical stability by subtracting max from score vector. shift_scores = current_scores - np.max(current_scores) # Calculate loss for this example. loss_ii = -shift_scores[y[i]] + np.log(np.sum(np.exp(shift_scores))) loss += loss_ii for j in range(C): softmax_score = np.exp(shift_scores[j]) / np.sum( np.exp(shift_scores)) # Gradient calculation. if j == y[i]: dW[:, j] += (-1 + softmax_score) * X[i] else: dW[:, j] += softmax_score * X[i] # Average over the batch and add our regularization term. loss /= N loss += reg * np.sum(W * W) # Average over the batch and add derivative of regularization term. dW /= N dW += 2 * reg * W ############################################################################# # END OF YOUR CODE # ############################################################################# return loss, dW
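# Vectorized sketch of the same loss and gradient (illustrative only, not the
# assignment's reference solution; it assumes the same W, X, y, reg
# conventions as softmax_loss_naive above).
import numpy as np

def softmax_loss_vectorized(W, X, y, reg):
    N = X.shape[0]
    scores = X.dot(W)
    scores -= scores.max(axis=1, keepdims=True)      # shift for numerical stability
    exp_scores = np.exp(scores)
    probs = exp_scores / exp_scores.sum(axis=1, keepdims=True)
    loss = -np.log(probs[np.arange(N), y]).mean() + reg * np.sum(W * W)
    dscores = probs.copy()
    dscores[np.arange(N), y] -= 1                    # d(loss_i)/d(scores_i)
    dW = X.T.dot(dscores) / N + 2 * reg * W
    return loss, dW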
def identify(self, genome, actual_node, segment_detector): id_map = self._population.id_mapping length_classifier = self._length_classifier # TODO: Eliminated shared_list and use shared_dict everywhere shared_list = [] anchors = set(length_classifier._labeled_nodes) - self.exclude_anchors sorted_labeled = sorted(anchors) np_sorted_labeled = np.array(sorted_labeled, dtype=np.uint32) sorted_shared = [] for labeled_node_id in sorted_labeled: labeled_node = id_map[labeled_node_id] s = segment_detector.shared_segment_length( genome, labeled_node.suspected_genome) shared_list.append((labeled_node_id, s)) sorted_shared.append(s) write_log("positive ibd count", sum(0.0 < x for x in sorted_shared)) #write_log("shared", sorted_shared) shared_dict = dict(shared_list) sorted_shared = np.array(sorted_shared, dtype=np.float64) labeled_nodes_cryptic, all_lengths = list(zip(*shared_dict.items())) np_cryptic = np.log( length_classifier.get_batch_smoothing_gamma(sorted_shared)) node_data = [] batch_shape = [] batch_scale = [] batch_zero_prob = [] batch_lengths = [] # Keep for logging purposes # batch_cryptic_lengths = [] nodes = self._to_search(shared_list, actual_node.sex) if len(nodes) == 0: # We have no idea which node it is return RawIdentified(set(), float("-inf"), None) for node in nodes: node_start_i = len(batch_shape) node_id = node._id #node_cryptic_log_probs[node] = 0 if node_id in length_classifier._distributions: labeled_ids, shape, scale, zero_prob = length_classifier._distributions[ node_id] else: labeled_ids = np.array([], dtype=np.uint32) shape = scale = zero_prob = np.array([], dtype=np.float64) calc_data = calculate_probabilities(labeled_ids, shape, scale, zero_prob, sorted_shared, np_sorted_labeled, np_cryptic, node_id) cur_lengths, cur_shapes, cur_scales, cur_zero_prob, cur_cryptic = calc_data batch_lengths.extend(cur_lengths) batch_shape.extend(cur_shapes) batch_scale.extend(cur_scales) batch_zero_prob.extend(cur_zero_prob) node_stop_i = len(batch_shape) node_data.append( ProbabilityData(node, node_start_i, node_stop_i, cur_cryptic)) assert len(node_data) > 0 if len(batch_lengths) > 0: pdf_vals = length_classifier.batch_pdf_distributions( batch_lengths, batch_shape, batch_scale, batch_zero_prob) calc_prob, zero_replace = pdf_vals else: calc_prob = [] log_calc_prob_cum = np.cumsum(np.log(calc_prob)) del calc_prob log_calc_prob_cum = np.concatenate(([0.0], log_calc_prob_cum)) node_probabilities = dict() for node, start_i, stop_i, cryptic_prob in node_data: log_prob = (log_calc_prob_cum[stop_i] - log_calc_prob_cum[start_i]) + cryptic_prob node_probabilities[node] = log_prob assert len(node_probabilities) > 0 if self.probability_logging: write_log( "identify", { "node": actual_node._id, "probs": { node._id: prob for node, prob in node_probabilities.items() } }) if len(node_probabilities) == 0: return RawIdentified(set(), -INF, None) # The value 8 is somewhat arbitrary. We are always able to # generate our confidence value with the top 8, as sibships # tend to be small. This number may need to be larger for # populations with large sibships. 
potential_nodes = nlargest(8, node_probabilities.items(), key=lambda x: x[1]) top, top_log_prob = potential_nodes[0] sibling_group = get_suspected_sibling_group(top) for node, log_prob in potential_nodes[1:]: if node in sibling_group: continue next_node = node next_log_prob = log_prob break else: if len(potential_nodes) > 1: next_node, next_log_prob = potential_nodes[1] if len(potential_nodes) > 1: log_ratio = top_log_prob - next_log_prob else: log_ratio = -INF return RawIdentified(get_sibling_group(top), log_ratio, top)