def euclidean_distances(X, Y, squared=False, inverse=True):
    """
    Pairwise Euclidean distance (or derived similarity) between rows.

    Every row of X is compared with every row of Y.  By default the
    distances are mapped to a similarity ``1 / (1 + d)``, so identical
    rows score 1.0 and the score decreases towards 0 as rows move apart.

    Parameters
    ----------
    X: array of shape (n_samples_1, n_features)

    Y: array of shape (n_samples_2, n_features)

    squared: boolean, optional
        Return the squared Euclidean distances.  When set, ``inverse`` is
        ignored and the raw squared distances are returned.

    inverse: boolean, optional
        Return ``1 / (1 + distance)`` instead of the plain distance.

    Returns
    -------
    distances: array of shape (n_samples_1, n_samples_2)

    Raises
    ------
    ValueError
        If X and Y do not have the same number of columns.

    Examples
    --------
    >>> X = [[1.0, 0.0],[1.0,1.0]]
    >>> euclidean_distances(X, [[0.0, 0.0]])
    array([[ 0.5       ],
           [ 0.41421356]])
    """
    # Convert only once when both arguments are the very same object.
    shared = X is Y
    X = np.asanyarray(X)
    Y = X if shared else np.asanyarray(Y)

    if X.shape[1] != Y.shape[1]:
        raise ValueError("Incompatible dimension for X and Y matrices")

    if squared:
        return ssd.cdist(X, Y, 'sqeuclidean')

    XY = ssd.cdist(X, Y)
    if inverse:
        return np.divide(1.0, (1.0 + XY))
    return XY
def compute_matrices_for_gradient_totalcverr(self, train_x, train_y, train_z):
    """Precompute the per-fold kernel and distance matrices.

    Returns a list of 8 lists, each holding one matrix per fold:
    0: Kx(test, train)   1: Kx(train, train)   2: Kx(test, test)
    3: Ky(test, train)   4: Ky(train, train)   5: Ky(test, test)
    6: squared Euclidean distances Z(test, train)
    7: squared Euclidean distances Z(train, train)

    When the corresponding ``*_use_median`` flag is set, the kernel width
    is first reset from the median heuristic on the full training data.
    """
    if self.kernelX_use_median:
        self.kernelX.set_width(
            float(self.kernelX.get_sigma_median_heuristic(train_x)))
    if self.kernelY_use_median:
        self.kernelY.set_width(
            float(self.kernelY.get_sigma_median_heuristic(train_y)))

    kf = KFold(n_splits=self.K_folds)
    # Build every inner list separately so that no two slots alias.
    matrix_results = [[[None] for _ in range(self.K_folds)]
                      for _ in range(8)]

    # Only the number of rows matters to the splitter, hence the dummy array.
    dummy = np.ones((self.num_samples, 1))
    for fold, (train_index, test_index) in enumerate(kf.split(dummy)):
        X_tr, X_tst = train_x[train_index], train_x[test_index]
        Y_tr, Y_tst = train_y[train_index], train_y[test_index]
        Z_tr, Z_tst = train_z[train_index], train_z[test_index]

        matrix_results[0][fold] = self.kernelX.kernel(X_tst, X_tr)
        matrix_results[1][fold] = self.kernelX.kernel(X_tr, X_tr)
        matrix_results[2][fold] = self.kernelX.kernel(X_tst, X_tst)
        matrix_results[3][fold] = self.kernelY.kernel(Y_tst, Y_tr)
        matrix_results[4][fold] = self.kernelY.kernel(Y_tr, Y_tr)
        matrix_results[5][fold] = self.kernelY.kernel(Y_tst, Y_tst)
        matrix_results[6][fold] = cdist(Z_tst, Z_tr, 'sqeuclidean')
        matrix_results[7][fold] = cdist(Z_tr, Z_tr, 'sqeuclidean')

    return matrix_results
def online_k_means(k, b, t, X_in):
    """Mini-batch (online) k-means.

    Parameters
    ----------
    k : int
        Number of clusters.
    b : int
        Mini-batch size drawn (with replacement) at every iteration.
    t : int
        Number of mini-batch iterations; also forwarded to
        ``mykmeansplusplus`` for the initialisation.
    X_in : ndarray, shape (n_samples, n_features)
        Data to cluster.

    Returns
    -------
    c : ndarray
        Cluster centres after ``t`` mini-batch updates.
    l : ndarray, shape (n_samples,)
        Index of the nearest centre for every row of ``X_in``.
    """
    seed = 11232015

    # Initialise the centres from a random subsample of 300 rows.
    init_idx = np.random.randint(X_in.shape[0], size=300)
    init_idx = init_idx[np.random.RandomState(seed).permutation(len(init_idx))]
    c, l = mykmeansplusplus(X_in[init_idx], k, t)

    # v[j] counts how many points have been assigned to centre j so far;
    # its reciprocal is the per-centre learning rate.
    v = np.zeros((k))
    for _ in range(t):
        batch_idx = np.random.randint(X_in.shape[0], size=b)
        batch_idx = batch_idx[
            np.random.RandomState(seed).permutation(len(batch_idx))]
        M = X_in[batch_idx]

        # The Euclidean metric takes no extra parameters, so none are passed.
        nearest = np.argmin(cdist(M, c, metric='euclidean'), axis=1)
        for point, c_in in zip(M, nearest):
            v[c_in] += 1
            ita = 1 / v[c_in]
            c[c_in] = (1 - ita) * c[c_in] + ita * point

    l = np.argmin(cdist(X_in, c, metric='euclidean'), axis=1)
    return c, l
def test_init_corr(self, other, T = 5e-3, outlierprior=1e-1, outlierfrac=1e-2, outliercutoff=1e-2, ):
    """Compare the GPU-initialised correspondence matrix with a CPU reference.

    Runs ``init_prob_nm`` on the device, then recomputes
    ``exp(-(d1 + d2) / (2 * T))`` on the host from the first point set of
    ``self`` and ``other``.  On the first entry that differs by more than
    1e-7 the mismatch is printed, an interactive shell is opened and the
    process exits with status 1.
    """
    import scipy.spatial.distance as ssd
    import sys
    self.transform_points()
    other.transform_points()
    init_prob_nm(self.pt_ptrs, other.pt_ptrs,
                 self.pt_w_ptrs, other.pt_w_ptrs,
                 self.dims_gpu, other.dims_gpu,
                 self.N, outlierprior, outlierfrac, T,
                 self.corr_cm_ptrs, self.corr_rm_ptrs)

    n_self, n_other = self.dims[0], other.dims[0]
    # The device buffer may be padded; keep only the (n+1) x (m+1) block.
    gpu_corr_rm = self.corr_rm[0].get()
    gpu_corr_rm = gpu_corr_rm.flatten()[:(n_self + 1) * (n_other + 1)].reshape(n_self + 1, n_other + 1)

    s_pt_w = self.pts_w[0].get()
    s_pt = self.pts[0].get()
    o_pt_w = other.pts_w[0].get()
    o_pt = other.pts[0].get()

    d1 = ssd.cdist(s_pt_w, o_pt, 'euclidean')
    d2 = ssd.cdist(s_pt, o_pt_w, 'euclidean')

    p_nm = np.exp( -(d1 + d2) / (2 * T))

    for i in range(n_self):
        for j in range(n_other):
            if abs(p_nm[i, j] - gpu_corr_rm[i, j]) > 1e-7:
                # Single-argument print() renders identically on Python 2 and 3.
                print("INIT CORR MATRICES DIFFERENT")
                print("%s %s %s %s" % (i, j, p_nm[i, j], gpu_corr_rm[i, j]))
                ipy.embed()
                sys.exit(1)
def ch(X, cIDX, distance="euclidean"): Nclusters = cIDX.max() + 1 Npoints = len(X) n = np.ndarray(shape=(Nclusters), dtype=float) j = 0 for i in range(cIDX.min(), cIDX.max() + 1): aux = np.asarray([float(b) for b in (cIDX == i)]) n[j] = aux.sum() j = j + 1 # Clusters A = np.array([X[np.where(cIDX == i)] for i in range(Nclusters)]) # Centroids v = np.array([np.sum(Ai, axis=0) / float(Ai.shape[0]) for Ai in A]) ssb = 0 for i in range(Nclusters): ssb = n[i] * (cdist([v[i]], [np.mean(X, axis=0)], metric=distance)[0][0] ** 2) + ssb z = np.ndarray(shape=(Nclusters), dtype=float) for i in range(cIDX.min(), cIDX.max() + 1): aux = np.array([(cdist([x], [v[i]], metric=distance)[0][0] ** 2) for x in X[cIDX == i]]) z[i] = aux.sum() ssw = z.sum() return (ssb / (Nclusters - 1)) / (ssw / (Npoints - Nclusters))
def _compute_nearest(xhs, rr, use_balltree=True, return_dists=False): """Find nearest neighbors Note: The rows in xhs and rr must all be unit-length vectors, otherwise the result will be incorrect. Parameters ---------- xhs : array, shape=(n_samples, n_dim) Points of data set. rr : array, shape=(n_query, n_dim) Points to find nearest neighbors for. use_balltree : bool Use fast BallTree based search from scikit-learn. If scikit-learn is not installed it will fall back to the slow brute force search. return_dists : bool If True, return associated distances. Returns ------- nearest : array, shape=(n_query,) Index of nearest neighbor in xhs for every point in rr. distances : array, shape=(n_query,) The distances. Only returned if return_dists is True. """ if use_balltree: try: from sklearn.neighbors import BallTree except ImportError: logger.info('Nearest-neighbor searches will be significantly ' 'faster if scikit-learn is installed.') use_balltree = False if xhs.size == 0 or rr.size == 0: if return_dists: return np.array([], int), np.array([]) return np.array([], int) if use_balltree is True: ball_tree = BallTree(xhs) if return_dists: out = ball_tree.query(rr, k=1, return_distance=True) return out[1][:, 0], out[0][:, 0] else: nearest = ball_tree.query(rr, k=1, return_distance=False)[:, 0] return nearest else: from scipy.spatial.distance import cdist if return_dists: nearest = list() dists = list() for r in rr: d = cdist(r[np.newaxis, :], xhs) idx = np.argmin(d) nearest.append(idx) dists.append(d[0, idx]) return (np.array(nearest), np.array(dists)) else: nearest = np.array([np.argmin(cdist(r[np.newaxis, :], xhs)) for r in rr]) return nearest
def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function, y_is_x):
    """Sequential and parallel pairwise distances must agree when the
    metric carries parameters derived from the data ('V' or 'VI')."""
    # A tiny working memory forces the computation to use several chunks.
    with config_context(working_memory=1):
        rng = np.random.RandomState(0)
        X = rng.random_sample((1000, 10))

        if y_is_x:
            Y = X
            pooled = X
            expected_dist_default_params = squareform(pdist(X, metric=metric))
        else:
            Y = rng.random_sample((1000, 10))
            pooled = np.vstack([X, Y])
            expected_dist_default_params = cdist(X, Y, metric=metric)

        # The data-derived parameter is estimated from all rows involved.
        if metric == "seuclidean":
            params = {'V': np.var(pooled, axis=0, ddof=1)}
        else:
            params = {'VI': np.linalg.inv(np.cov(pooled.T)).T}

        expected_dist_explicit_params = cdist(X, Y, metric=metric, **params)
        chunks = tuple(dist_function(X, Y, metric=metric, n_jobs=n_jobs))
        dist = np.vstack(chunks)

        assert_allclose(dist, expected_dist_explicit_params)
        assert_allclose(dist, expected_dist_default_params)
def __init__(self, gifts, nb_neighbors=50, metric=None):
    """Start with every gift as its own single-node subtree.

    Parameters
    ----------
    gifts : DataFrame-like
        Must provide 'Latitude' and 'Longitude' columns and a ``Weight``
        attribute.
    nb_neighbors : int
        Stored on the instance as ``nb_neighbors``.
    metric : object or None
        ``None`` uses the chord distance (Euclidean distance between the
        Cartesian coordinates returned by ``to_cartesian``).
    """
    self.gifts = gifts
    self.X = gifts[['Latitude', 'Longitude']].values
    self.N = len(self.X)
    self.wgt = gifts.Weight.values
    # root of subtree -> list of nodes in this subtree
    self.subtrees = {i: [i] for i in range(self.N)}
    # node -> root of subtree; a real list so that it stays indexable and
    # assignable on Python 3 as well
    self.Xto = list(range(self.N))
    # weight of subtrees
    self.subtree_weights = {i: self.wgt[i] for i in range(self.N)}
    # cartesian coordinates (ignoring earth radius)
    self.Z = np.apply_along_axis(self.to_cartesian, 1, self.X)
    # distance from north pole to root points
    to_pole = cdist(np.atleast_2d(self.to_cartesian(north_pole)), self.Z)
    if metric is None:
        self.gates = to_pole[0].tolist()
    elif isinstance(metric, Thin_Metric):
        self.gates = (AVG_EARTH_RADIUS * to_pole[0]).tolist()
    else:
        # ndarray exposes tolist(); there is no to_list() method.
        self.gates = cdist(np.atleast_2d(north_pole), self.X)[0].tolist()
    self.subtree_costs = {i: self.gates[i] for i in range(self.N)}
    self.total_cost = sum(self.subtree_costs.values())
    self.nb_neighbors = nb_neighbors
    import sklearn.neighbors
    self.kdtree = sklearn.neighbors.KDTree(self.Z)
    self.metric = metric
def cdist_sparse(X, Y, **kwargs):
    """-> |X| x |Y| cdist array, any cdist metric.

    X or Y may be sparse -- best csr.  Sparse operands are densified one
    row at a time, which is very slow when both are sparse.

    Parameters
    ----------
    X, Y : dense array or scipy sparse matrix
        Row collections with the same number of columns.
    **kwargs
        Forwarded unchanged to ``scipy.spatial.distance.cdist``
        (e.g. ``metric="euclidean"``).

    Returns
    -------
    ndarray, shape (X.shape[0], Y.shape[0])
        The pairwise distances.  For ``metric="cosine"`` the result is
        ``1 - distance`` (i.e. the cosine similarity), as in every branch of
        the original implementation.
    """
    def dense_row(row):
        # One sparse row as a 2-D ndarray that cdist accepts.
        return np.atleast_2d(row.toarray())

    sxy = 2 * issparse(X) + issparse(Y)
    if sxy == 0:
        d = cdist(X, Y, **kwargs)
    else:
        d = np.empty((X.shape[0], Y.shape[0]), np.float64)
        if sxy == 2:
            # only X is sparse: fill one row of the result per row of X
            for j, x in enumerate(X):
                d[j] = cdist(dense_row(x), Y, **kwargs)[0]
        elif sxy == 1:
            # only Y is sparse: cdist gives an (|X|, 1) column per row of Y
            for k, y in enumerate(Y):
                d[:, k] = cdist(X, dense_row(y), **kwargs)[:, 0]
        else:
            for j, x in enumerate(X):
                x_dense = dense_row(x)
                for k, y in enumerate(Y):
                    d[j, k] = cdist(x_dense, dense_row(y), **kwargs)[0, 0]

    if kwargs.get("metric") == "cosine":
        return 1 - d
    return d
def eccentricity(data, exponent=1., metricpar={}, callback=None):
    """Eccentricity of every data point.

    For each point the distances d to all points are combined as
    ``(mean(d ** exponent)) ** (1 / exponent)``; an infinite exponent gives
    the maximum distance instead.

    Parameters
    ----------
    data : ndarray
        Either a 1-D condensed dissimilarity vector (as accepted by
        ``squareform``) or a 2-D array with one observation per row.
    exponent : float or one of ``np.inf``, ``'Inf'``, ``'inf'``
        Exponent of the mean.
    metricpar : dict
        Keyword arguments forwarded to ``cdist``; must be empty for a
        dissimilarity vector.  The default dict is never modified.
    callback : callable, optional
        Passed to ``progressreporter``; only used for 2-D input.

    Returns
    -------
    ndarray, shape (n_points,)
    """
    use_max = exponent in (np.inf, 'Inf', 'inf')

    if data.ndim == 1:
        assert metricpar == {}, 'No optional parameter is allowed for a dissimilarity matrix.'
        ds = squareform(data, force='tomatrix')
        n_points = len(ds)
        if use_max:
            return ds.max(axis=0)
        if exponent == 1.:
            return ds.sum(axis=0) / float(n_points)
        ds = np.power(ds, exponent)
        return np.power(ds.sum(axis=0) / float(n_points), 1. / exponent)

    progress = progressreporter(callback)
    N = len(data)
    ecc = np.empty(N)
    for i in range(N):
        # distances from point i to every point, one row at a time to keep
        # memory use linear in N
        row = cdist(data[(i,), :], data, **metricpar)
        if use_max:
            ecc[i] = row.max()
        elif exponent == 1.:
            ecc[i] = row.sum() / float(N)
        else:
            dsum = np.power(row, exponent).sum()
            ecc[i] = np.power(dsum / float(N), 1. / exponent)
        progress((i + 1) * 100 // N)
    return ecc
def bandwidth(self, X):
    """
    Estimate bandwidth

    The estimate is ``((V * N / Ncl) / N) ** (1 / D)`` where ``V`` is a
    Monte-Carlo style volume estimate of the region occupied by ``X``.

    TODO Replace this with a method which treats the data like a distorted
    doughnut by estimating limit cycle, and integrating an ellipsoid swept
    along the trajectory of the limit cycle with appropriate lengths
    """
    N, D = X.shape[0], X.shape[1]
    # Wanted number of points in a cluster
    points_in_cluster = N / float(self.Ncl)
    debugLog(self, "Estimating bandwidth")

    # Number of random samples used for the numerical integration
    S = 10 * (6 ** D)

    # Subsample of the data (drawn with replacement)
    idx = randint(0, N, (S))
    v = X[idx]

    # Random probe points, scaled by the overall data range
    lowest = X.min()
    extent = X.max() - lowest
    y = extent * randn(S, D) + lowest

    # Typical spacing: spread of the nearest-neighbour distance from each
    # subsample point to the rows that were not drawn
    not_drawn = X[list(set(arange(N)).difference(idx))]
    w = std(cdist(not_drawn, v).min(0))

    # Count probe points lying approximately this close to the data
    c = sum(cdist(X, y).min(0) < self.sf * w)

    # Volume from length, with the n-sphere prefactor
    V = nSphereVolume(D) * ((c / float(S)) * extent ** D)

    return ((V * points_in_cluster) / N) ** (1.0 / D)
def learn(self, learndataset, pipp_normalise=True):
    """learn the tree structure required to perform evaluation

    :param learndataset: learning instances
    :type learndataset: :class:`~classifip.dataset.arff.ArffFile`
    :param pipp_normalise: normalise the input features or not
    :type pipp_normalise: boolean

    The last element of every data row is taken as the ranking, all others
    as input features.  ``self.normal`` receives either ``True`` followed by
    the per-feature span and minimum, or a single ``False``.

    .. note::

        learndataset should come from a xarff file tailored for label
        ranking
    """
    self.labels = learndataset.attribute_data['L'][:]
    features = [row[:-1] for row in learndataset.data]
    data_array = np.array(features).astype(float)

    if pipp_normalise == True:
        # min-max scaling of every feature column
        lowest = data_array.min(axis=0)
        span = data_array.max(axis=0) - lowest
        self.normal.append(True)
        self.normal.append(span)
        self.normal.append(lowest)
        data_array = (data_array - lowest) / span
    else:
        self.normal.append(False)

    # Initial radius from the pairwise distances of the learning instances,
    # estimated on a random subset of 1000 rows when there are more.
    if len(data_array) > 1000:
        sample = np.random.permutation(data_array)[0:1000]
    else:
        sample = data_array
    distances = distance.cdist(sample, sample)
    n_rows = len(distances)
    self.radius = distances.sum() / (2 * (n_rows ** 2 - n_rows))

    self.tree = kdtree.KDTree(data_array)
    self.truerankings = [ranking_matrices(row[-1], self.labels)
                         for row in learndataset.data]
def covSEisoU(hyp=None, x=None, z=None, der=None):
    """Squared Exponential covariance with isotropic distance measure and
    unit magnitude.

    The covariance function is parameterized as

        k(x^p,x^q) = exp( -(x^p - x^q)' * inv(P) * (x^p - x^q) / 2 )

    where the P matrix is ell^2 times the unit matrix.

    Parameters
    ----------
    hyp : sequence or None
        ``[log(ell)]``.  When None, the number of hyperparameters is
        reported instead.
    x : ndarray, shape (n, D)
        Training inputs.
    z : ndarray of shape (m, D), the string 'diag', or None
        None gives the covariance of x with itself, 'diag' the (n, 1)
        self covariances, an array the cross covariance between x and z.
    der : int or None
        None returns the covariance; 0 returns the derivative with respect
        to ``log(ell)``.

    Returns
    -------
    list or ndarray
        ``[1]`` when ``hyp`` is None, otherwise the requested matrix.

    Raises
    ------
    Exception
        If ``der`` is neither None nor 0.
    """
    if hyp is None:
        # report number of parameters
        return [1]

    ell = np.exp(hyp[0])  # characteristic length scale
    n, _ = x.shape

    # z may be an ndarray, so it is tested by identity/type and never with
    # ``==`` (which would compare element-wise).
    if isinstance(z, str) and z == 'diag':
        A = np.zeros((n, 1))
    elif z is None:
        scaled = x / ell
        A = spdist.cdist(scaled, scaled, 'sqeuclidean')
    else:
        A = spdist.cdist(x / ell, z / ell, 'sqeuclidean')

    if der is None:
        return np.exp(-0.5 * A)
    if der == 0:
        return np.exp(-0.5 * A) * A
    raise Exception("Wrong derivative index in covSEisoU")
def lloyd2(data, init_cent, metric='e', verbose=False):
    """Lloyd's k-means iteration starting from the given centroids.

    Parameters
    ----------
    data : ndarray, shape (n_samples, n_features)
    init_cent : ndarray, shape (k, n_features)
        Initial centroids; not modified.
    metric : str or callable
        Metric forwarded to ``cdist`` for the assignment step.
    verbose : bool
        Print the iteration count on convergence.

    Returns
    -------
    cent : ndarray, shape (k, n_features)
        Final centroids.
    labels : ndarray, shape (n_samples,)
        Index of the closest centroid for every sample.

    Notes
    -----
    Stops when neither the centroids (``np.allclose``) nor the labels
    change, or after 1000 iterations (a message is printed in that case).
    A cluster that loses all its members keeps its previous centroid.
    """
    k = init_cent.shape[0]
    cent = np.copy(init_cent)
    labels = spdist.cdist(data, cent, metric).argmin(axis=1)
    converged = False
    t, tmax = 0, 1000
    while not converged and t < tmax:
        t += 1
        new_rows = []
        for l in range(k):
            members = data[labels == l]
            # an empty cluster would average to NaN; keep the old centroid
            new_rows.append(np.mean(members, axis=0) if len(members) else cent[l])
        cent_ = np.array(new_rows)
        labels_ = spdist.cdist(data, cent_, metric).argmin(axis=1)
        converged = bool(np.allclose(cent_, cent)
                         and np.array_equal(labels, labels_))
        labels = labels_
        cent = cent_
    if not converged:
        # raise UserWarning("did not converge after {} iterations".format(t))
        print("did not converge after {} iterations".format(t))
    elif verbose:
        print("Converged after {} iterations".format(t))
    return cent, labels
def sim_compute(word, dis_type, topk=100):
    """Rank every other vocabulary entry by similarity to ``word``.

    Parameters
    ----------
    word : str
        Query word; looked up in the module-level ``word2idx``.
    dis_type : str
        ``cdist`` metric name.  For 'cosine' and 'correlation' the
        similarity is ``1 - distance``; for any other metric it is the
        negated distance.
    topk : int
        Currently unused: the full ranking is returned and callers slice it
        themselves.

    Returns
    -------
    list of (int, float) or None
        ``(row index in myembed, similarity)`` pairs sorted by decreasing
        similarity, with the query word itself removed.  ``None`` (after
        printing a message) when the word is not in the vocabulary.
    """
    # Get the index of word and the corresponding vector
    try:
        index = word2idx[word]
        wordvec = myembed[index, :].reshape(1, -1)
    except KeyError as err:
        print("Word %s is not present in Vocablury" % err)
        return

    # For cosine and correlation the similarity is 1 - distance
    # Else for others just inverse the distance by multiplying with -1
    if dis_type in ('cosine', 'correlation'):
        sim = 1 - cdist(wordvec, myembed, dis_type)
    else:
        sim = -1 * cdist(wordvec, myembed, dis_type)

    # sim has shape (1, N); pair each score with its row index and drop the
    # query word.  A real list is built so that it can be sorted on
    # Python 3, where zip() returns an iterator.
    ranked = [(i, score) for i, score in enumerate(sim[0]) if i != index]
    ranked.sort(key=lambda t: t[1], reverse=True)
    return ranked
def proceed(self, x=None, z=None, der=None):
    """Rational quadratic covariance with one length scale per dimension.

    With ``D2`` the squared distance between inputs scaled by their length
    scales, the covariance is ``sf2 * (1 + D2 / (2 * alpha)) ** (-alpha)``.
    ``self.hyp`` holds ``[log(ell_1), ..., log(ell_D), log(sf), log(alpha)]``.

    Parameters
    ----------
    x : ndarray, shape (n, D)
    z : ndarray of shape (m, D), the string 'diag', or None
        None: covariance of x with itself; 'diag': (n, 1) self covariances;
        array: cross covariance between x and z.
    der : int or None
        None returns the covariance, otherwise the derivative with respect
        to ``self.hyp[der]``.

    Raises
    ------
    Exception
        If ``der`` is larger than ``D + 1``.
    """
    n, D = x.shape
    ell = 1. / np.exp(self.hyp[0:D])   # INVERSE characteristic length scales
    sf2 = np.exp(2. * self.hyp[D])     # signal variance
    alpha = np.exp(self.hyp[D + 1])

    # z may be an ndarray: never compare it with ``==``.
    diag = isinstance(z, str) and z == 'diag'
    if diag:
        D2 = np.zeros((n, 1))
    elif z is None:
        scaled = x * ell
        D2 = spdist.cdist(scaled, scaled, 'sqeuclidean')
    else:
        D2 = spdist.cdist(x * ell, z * ell, 'sqeuclidean')

    K = 1.0 + 0.5 * D2 / alpha
    if der is None:
        # covariance matrix
        return sf2 * K ** (-alpha)

    if der < D:
        # derivative wrt log length scale of dimension ``der``
        if diag:
            return D2 * 0
        # column vectors, scaled exactly like the inputs of D2
        x_col = x[:, [der]] * ell[der]
        z_col = x_col if z is None else z[:, [der]] * ell[der]
        return sf2 * K ** (-alpha - 1) * spdist.cdist(x_col, z_col, 'sqeuclidean')
    if der == D:
        # derivative wrt log signal standard deviation
        return 2. * sf2 * K ** (-alpha)
    if der == (D + 1):
        # derivative wrt log(alpha)
        return sf2 * K ** (-alpha) * (0.5 * D2 / K - alpha * np.log(K))
    raise Exception("Wrong derivative index in covRQard")
def remove_redundant_mot(mot, rev_mot, max_d):
    """Flag redundant PWMs based on correlation distance.

    Args:
    - mot: 2-D array with one flattened motif per row.
    - rev_mot: array of the same shape as ``mot``; presumably the reverse
      complement of each motif (confirm with the caller).
    - max_d: Maximum distance for motifs to be considered redundant.

    The p-values are read from the module-level ``pvals`` sequence, which
    is indexed like the rows of ``mot``.

    Return value:
    A boolean numpy array is_good, that indicates whether each motif should
    be kept or not.
    """
    nmot = mot.shape[0]
    assert(rev_mot.shape[0] == nmot)
    assert(rev_mot.shape[1] == mot.shape[1])
    # builtin bool: the np.bool alias no longer exists in current NumPy
    is_good = np.ones((nmot, ), dtype=bool)
    for i in range(nmot - 1):
        if not is_good[i]:
            continue
        # Get all the indices that are higher than i and have not been
        # removed already.
        others = np.argwhere(np.logical_and(is_good, np.arange(0, nmot) > i)).flatten()
        current = np.reshape(mot[i, :], (1, mot.shape[1]))
        d = cdist(current, mot[others, :], metric='correlation').flatten()
        rev_d = cdist(current, rev_mot[others, :], metric='correlation').flatten()
        # Get minimum distance so maximum correlation.
        d = np.minimum(d, rev_d)
        similar = d < max_d
        # PWMs that are similar to the i-th one and have worse pvalue get
        # marked as redundant
        bad = others[np.logical_and(similar, pvals[i] < pvals[others])]
        is_good[bad] = False
        # ... and the i-th one goes if a similar PWM has a better pvalue
        if np.any(np.logical_and(similar, pvals[i] > pvals[others])):
            is_good[i] = False
    return is_good
def proceed(self, x=None, z=None, der=None):
    """Periodic covariance function.

    ``k(x, z) = sf2 * exp(-2 * sin(pi * |x - z| / p) ** 2 / ell ** 2)`` with
    ``self.hyp = [log(ell), log(p), log(sf)]``.

    Parameters
    ----------
    x : ndarray, shape (n, D)
    z : ndarray of shape (m, D), the string 'diag', or None
        None: covariance of x with itself; 'diag': (n, 1) self covariances;
        array: cross covariance between x and z.
    der : int or None
        None returns the covariance, 0/1/2 the derivative with respect to
        the corresponding entry of ``self.hyp``.

    Raises
    ------
    Exception
        If ``der`` is not None, 0, 1 or 2.
    """
    ell = np.exp(self.hyp[0])        # characteristic length scale
    p = np.exp(self.hyp[1])          # period
    sf2 = np.exp(2. * self.hyp[2])   # signal variance
    n, _ = x.shape

    # z may be an ndarray: never compare it with ``==``.
    if isinstance(z, str) and z == 'diag':
        A = np.zeros((n, 1))
    elif z is None:
        A = np.sqrt(spdist.cdist(x, x, 'sqeuclidean'))
    else:
        A = np.sqrt(spdist.cdist(x, z, 'sqeuclidean'))

    A = np.pi * A / p                # phase argument
    R = np.sin(A) / ell
    R2 = R * R
    if der is None:
        return sf2 * np.exp(-2. * R2)
    if der == 0:
        # derivative wrt log(ell)
        return 4. * sf2 * np.exp(-2. * R2) * R2
    if der == 1:
        # derivative wrt log(p)
        return 4 * sf2 / ell * np.exp(-2. * R2) * R * np.cos(A) * A
    if der == 2:
        # derivative wrt log(sf)
        return 2. * sf2 * np.exp(-2. * R2)
    raise Exception("Wrong derivative index in covPeriodic")
def proceed(self, x=None, z=None, der=None):
    """Isotropic rational quadratic covariance function.

    ``k = sf2 * (1 + D2 / (2 * alpha)) ** (-alpha)`` where ``D2`` is the
    squared Euclidean distance between inputs divided by ``ell``, and
    ``self.hyp = [log(ell), log(sf), log(alpha)]``.

    Parameters
    ----------
    x : ndarray, shape (n, D)
    z : ndarray of shape (m, D), the string 'diag', or None
        None: covariance of x with itself; 'diag': (n, 1) self covariances;
        array: cross covariance between x and z.
    der : int or None
        None returns the covariance, 0/1/2 the derivative with respect to
        the corresponding entry of ``self.hyp``.

    Raises
    ------
    Exception
        If ``der`` is not None, 0, 1 or 2.
    """
    ell = np.exp(self.hyp[0])        # characteristic length scale
    sf2 = np.exp(2. * self.hyp[1])   # signal variance
    alpha = np.exp(self.hyp[2])
    n, _ = x.shape

    # z may be an ndarray: never compare it with ``==``.
    if isinstance(z, str) and z == 'diag':
        D2 = np.zeros((n, 1))
    elif z is None:
        scaled = x / ell
        D2 = spdist.cdist(scaled, scaled, 'sqeuclidean')
    else:
        D2 = spdist.cdist(x / ell, z / ell, 'sqeuclidean')

    K = 1.0 + 0.5 * D2 / alpha
    if der is None:
        return sf2 * K ** (-alpha)
    if der == 0:
        # derivative wrt log(ell)
        return sf2 * K ** (-alpha - 1) * D2
    if der == 1:
        # derivative wrt log(sf)
        return 2. * sf2 * K ** (-alpha)
    if der == 2:
        # derivative wrt log(alpha)
        return sf2 * K ** (-alpha) * (0.5 * D2 / K - alpha * np.log(K))
    raise Exception("Wrong derivative index in covRQ")
def proceed(self, x=None, z=None, der=None):
    """Squared exponential covariance with one length scale per dimension.

    ``k = sf2 * exp(-D2 / 2)`` where ``D2`` is the squared distance between
    inputs scaled by their length scales, and
    ``self.hyp = [log(ell_1), ..., log(ell_D), log(sf)]``.

    Parameters
    ----------
    x : ndarray, shape (n, D)
    z : ndarray of shape (m, D), the string 'diag', or None
        None: covariance of x with itself; 'diag': (n, 1) self covariances;
        array: cross covariance between x and z.
    der : int or None
        None returns the covariance, otherwise the derivative with respect
        to ``self.hyp[der]`` (``der == 0`` is a valid index).

    Raises
    ------
    Exception
        If ``der`` is larger than ``D``.
    """
    n, D = x.shape
    ell = 1. / np.exp(self.hyp[0:D])   # INVERSE characteristic length scales
    sf2 = np.exp(2. * self.hyp[D])     # signal variance

    # z may be an ndarray: never compare it with ``==``.
    diag = isinstance(z, str) and z == 'diag'
    if diag:
        A = np.zeros((n, 1))
    elif z is None:
        scaled = x * ell
        A = spdist.cdist(scaled, scaled, 'sqeuclidean')
    else:
        A = spdist.cdist(x * ell, z * ell, 'sqeuclidean')

    A = sf2 * np.exp(-0.5 * A)
    if der is None:
        return A

    if der < D:
        # derivative wrt log length scale of dimension ``der``
        if diag:
            return A * 0
        # column vectors, scaled exactly like the inputs of the covariance
        x_col = x[:, [der]] * ell[der]
        z_col = x_col if z is None else z[:, [der]] * ell[der]
        return A * spdist.cdist(x_col, z_col, 'sqeuclidean')
    if der == D:
        # derivative wrt log signal standard deviation
        return 2. * A
    raise Exception("Wrong derivative index in RDFard")
def proceed(self, x=None, z=None, der=None):
    """Matern covariance function.

    The scaled distance ``sqrt(d) * |x - z| / ell`` is handed to
    ``self.mfunc`` (covariance) or ``self.dmfunc`` (length-scale
    derivative).  ``self.hyp = [log(ell), log(sf)]`` and
    ``self.para = [d]`` with ``d`` equal to 2 times nu.

    Parameters
    ----------
    x : ndarray, shape (n, D)
    z : ndarray of shape (m, D), the string 'diag', or None
        None: covariance of x with itself; 'diag': (n, 1) self covariances;
        array: cross covariance between x and z.
    der : int or None
        None returns the covariance; 0 and 1 the derivative with respect to
        ``log(ell)`` and ``log(sf)``; 2 returns zeros because ``d`` is not
        learned.

    Notes
    -----
    If ``d`` is not 1, 3 or 5 a warning is printed and ``d = 3`` is used.
    The caller's ``x`` and ``z`` arrays are not modified.

    Raises
    ------
    Exception
        If ``der`` is not None, 0, 1 or 2.
    """
    ell = np.exp(self.hyp[0])         # characteristic length scale
    sf2 = np.exp(2. * self.hyp[1])    # signal variance
    d = self.para[0]                  # 2 times nu
    if np.abs(d - np.round(d)) < 1e-8:
        # remove numerical error from format of parameter
        d = int(round(d))
    d = int(d)
    # explicit check (an assert would be stripped under ``python -O``)
    if d not in (1, 3, 5):
        print("Warning: You specified d to be neither 1,3 nor 5. We set d=3. ")
        d = 3

    # z may be an ndarray: never compare it with ``==``.
    if isinstance(z, str) and z == 'diag':
        A = np.zeros((x.shape[0], 1))
    elif z is None:
        x = np.sqrt(d) * x / ell
        A = np.sqrt(spdist.cdist(x, x, 'sqeuclidean'))
    else:
        x = np.sqrt(d) * x / ell
        z = np.sqrt(d) * z / ell
        A = np.sqrt(spdist.cdist(x, z, 'sqeuclidean'))

    if der is None:
        return sf2 * self.mfunc(d, A)
    if der == 0:
        # derivative wrt log(ell)
        return sf2 * self.dmfunc(d, A)
    if der == 1:
        # derivative wrt log(sf)
        return 2 * sf2 * self.mfunc(d, A)
    if der == 2:
        # no derivative wrt 3rd parameter (d is not learned)
        return np.zeros_like(A)
    raise Exception("Wrong derivative value in Matern")
def evaluate(self, individual):
    """Fitness of ``individual`` as a one-element tuple.

    The fitness is the equally weighted sum of
    (a) the Euclidean distance between ``individual`` and ``self.target``
    and (b) the Euclidean distance between the model output for
    ``individual`` and a 10-element one-hot vector that is 1 at
    ``self.target_output``.
    """
    input_gap = cdist(np.atleast_2d(individual), np.atleast_2d(self.target))

    # The "CNN" model receives a single 28x28x1 input, any other model a flat row.
    if (self.model_name == "CNN"):
        batch = np.array([np.array(individual).reshape(28, 28, 1)])
    else:
        batch = np.array([individual])

    wants_proba = (self.model_name.startswith("SVM")
                   or self.model_name.startswith("DT"))
    if wants_proba:
        prediction = self.model.predict_proba(batch)
    else:
        prediction = self.model.predict(batch)

    one_hot = np.zeros(10)
    one_hot[self.target_output] = 1.0
    output_gap = cdist(np.atleast_2d(prediction), np.atleast_2d(one_hot))

    fit = input_gap * 0.5 + 0.5 * output_gap
    return fit,
def K_SE(xs, ys=None, l=1, deriv=False, wrt='l'):
    """Squared exponential kernel ``exp(-d / 2)`` and optional gradients.

    ``d`` is the squared Euclidean distance between the inputs divided by
    the length scale(s) ``l``.

    Parameters
    ----------
    xs, ys : array-like
        Inputs, passed through ``ascolumn``.  With ``ys=None`` the kernel of
        ``xs`` with itself is computed.
    l : scalar or array-like
        Length scale(s).  The gradient branch indexes ``l[i]`` per input
        dimension, so it needs one entry per dimension.
    deriv : bool
        When False only the kernel matrix is returned.
    wrt : {'l', 'y'}
        Gradient target when ``deriv`` is True.

    Returns
    -------
    ndarray, or tuple
        The kernel matrix; with ``deriv=True`` a pair ``(kernel, grads)``
        for ``wrt='l'`` (one gradient matrix per input dimension) or
        ``(kernel, jac)`` for ``wrt='y'``.  Any other ``wrt`` returns None.
    """
    l = asarray(l)
    sig = 1  # signal variance is fixed to 1
    xs = ascolumn(xs)
    if ys is None:
        d = squareform(pdist(xs/l, 'sqeuclidean'))
    else:
        ys = ascolumn(ys)
        d = cdist(xs/l, ys/l, 'sqeuclidean')
    cov = exp(-d/2)
    if not deriv:
        return sig * cov

    if wrt == 'l':
        grads = []
        # range() exists on both Python 2 and 3 (xrange does not)
        for i in range(shape(xs)[1]):
            if ys is None:
                grad = sig * cov * squareform(pdist(ascolumn(xs[:,i]), 'sqeuclidean'))
            else:
                grad = sig * cov * cdist(ascolumn(xs[:,i]), ascolumn(ys[:,i]), 'sqeuclidean')
            grad /= l[i] ** 3
            grads.append(grad)
        return sig * cov, grads
    elif wrt == 'y':
        if shape(xs)[0] != 1:
            print('*** x not a row vector ***')
        jac = sig * cov * ((ys - xs) / l**2).T
        return sig * cov, jac
def fine_tune_transform(feature1, feature2, init_pair_idx):
    """Refine a homography between two point sets by repeated resampling.

    Up to 9 times: pick 20 random locations inside the extent of
    ``feature1["pts"]``, snap them to the nearest initially paired point,
    fit a RANSAC homography on those pairs and count how many transformed
    points of feature1 land within 5 units of a point of feature2.  The
    attempt with the most matches is kept; the loop ends early once at
    least 60% of the smaller point set is matched.

    Parameters
    ----------
    feature1, feature2 : dict
        Each holds the point coordinates under the key ``"pts"``.
    init_pair_idx : ndarray, shape (n_pairs, 2)
        Initial matches: column 0 indexes feature1, column 1 feature2.

    Returns
    -------
    tf : tuple
        Result of ``cv2.findHomography`` fitted on the final pairs.
    pair_idx : ndarray, shape (n_matches, 2)
        Matched indices (feature1, feature2) of the best attempt.

    Raises
    ------
    ValueError
        If no attempt produced a single match.
    """
    pts1 = feature1["pts"]
    pts2 = feature2["pts"]
    n_smaller = min(len(pts1), len(pts2))
    lower = np.amin(pts1, axis=0)
    extent = np.amax(pts1, axis=0) - lower
    seeds = pts1[init_pair_idx[:, 0]]
    pts1_cv = np.array([[p] for p in pts1], dtype="float32")

    ind = []
    best_nearest = None
    k = 1
    while len(ind) < 0.6 * n_smaller and k < 10:
        # Step 1. Randomly choose 20 points distributed over the image
        rand_pts = np.random.rand(20, 2) * extent * np.array([1, 0.8]) + lower
        # Step 2. Find nearest points from feature1
        tmp_ind = np.argmin(spd.cdist(rand_pts, seeds), axis=1)
        # Step 3. Use these points to find a homography
        tf = cv2.findHomography(pts1[init_pair_idx[tmp_ind, 0]],
                                pts2[init_pair_idx[tmp_ind, 1]],
                                method=cv2.RANSAC, ransacReprojThreshold=5)

        # Then use the transform to find more matched points
        pts12 = cv2.perspectiveTransform(pts1_cv, tf[0])[:, 0, :]
        dist_mat = spd.cdist(pts12, pts2)
        nearest = np.argmin(dist_mat, axis=1)
        matched = np.argwhere(dist_mat[np.arange(len(nearest)), nearest] < 5)
        if len(matched) > len(ind):
            # Keep the neighbour table of THIS attempt together with its
            # matches; a later, worse attempt must not overwrite it.
            ind = matched
            best_nearest = nearest
        logging.debug("len(ind) = %d, len(feature) = %d", len(ind), n_smaller)
        k += 1

    if best_nearest is None:
        raise ValueError("fine_tune_transform: no matching points found")

    pair_idx = np.hstack((ind, best_nearest[ind]))
    tf = cv2.findHomography(pts1[pair_idx[:, 0]], pts2[pair_idx[:, 1]],
                            method=cv2.RANSAC, ransacReprojThreshold=5)
    return tf, pair_idx
def covMatrix(X, Y, theta, symmetric = True, kernel = lambda u, theta: theta[0]*theta[0]*np.exp(-0.5*u*u/(theta[1]*theta[1])), \
    dist_f=None):
    """Covariance matrix obtained by applying ``kernel`` to pairwise distances.

    Parameters
    ----------
    X, Y : array-like
        Inputs; 1-D inputs are treated as a column of scalar observations.
    theta : sequence
        Kernel parameters, passed through to ``kernel``.
    symmetric : bool
        If True the covariance of X with itself is returned and Y is
        ignored; otherwise the cross covariance between X and Y.
    kernel : callable
        ``kernel(distances, theta)``; the default is
        ``theta[0]**2 * exp(-0.5 * u**2 / theta[1]**2)``.
    dist_f : str or callable, optional
        Metric handed to ``pdist`` / ``cdist``; their default metric is
        used when None.

    Returns
    -------
    ndarray
        Shape (len(X), len(X)) when symmetric, else (len(X), len(Y)).
    """
    def as_rows(values):
        # a 1-D input becomes a single column, anything else is kept as is
        arr = np.array(values)
        return np.array([values]).T if len(arr.shape) == 1 else arr

    _X = as_rows(X)
    _Y = as_rows(Y)
    metric_args = () if dist_f is None else (dist_f,)

    if symmetric:
        distances = squareform(pdist(_X, *metric_args))
    else:
        distances = cdist(_X, _Y, *metric_args)
    return kernel(distances, theta)
def __init__(self, data, bandwidth=None, fixed=True, k=None,
             function='triangular', eps=1.0000001, ids=None, truncate=True,
             points=None):
    """Build the kernel weights from pairwise distances.

    Parameters
    ----------
    data : array-like or scipy.spatial.KDTree
        Observation coordinates; for a KDTree its ``data`` attribute is used.
    bandwidth : scalar, sequence or ndarray, optional
        A scalar is repeated for every observation, a sequence is reshaped
        to a column.  When omitted (or falsy) ``self._set_bw()`` is called
        to determine the bandwidth.
    fixed : bool
        Stored as ``self.fixed``.
    k : int, optional
        Stored as ``int(k) + 1`` when given.
    function : str
        Kernel function name, stored lower-cased.
    eps : float
        Stored as ``self.eps``.
    ids : object, optional
        Accepted for interface compatibility; not used here.
    truncate : bool
        If True, kernel values at distances greater than or equal to the
        bandwidth are set to 0.
    points : array-like, optional
        If given, distances are measured from ``points`` to ``data`` instead
        of from ``data`` to itself.
    """
    if isinstance(data, scipy.spatial.KDTree):
        self.data = data.data
        data = self.data
    else:
        self.data = data
    if k is not None:
        self.k = int(k) + 1
    else:
        self.k = k
    if points is None:
        self.dmat = cdist(self.data, self.data)
    else:
        self.points = points
        self.dmat = cdist(self.points, self.data)
    self.function = function.lower()
    self.fixed = fixed
    self.eps = eps
    self.trunc = truncate

    # The truth value of a multi-element ndarray is ambiguous, so arrays
    # are tested by size; everything else keeps plain truthiness.
    if isinstance(bandwidth, np.ndarray):
        has_bandwidth = bandwidth.size > 0
    else:
        has_bandwidth = bool(bandwidth)

    if has_bandwidth:
        try:
            bandwidth = np.array(bandwidth)
            bandwidth.shape = (len(bandwidth), 1)
        except (TypeError, ValueError):
            # scalar (no len()) or a shape that cannot become a column
            bandwidth = np.ones((len(data), 1), 'float') * bandwidth
        self.bandwidth = bandwidth
    else:
        self._set_bw()
    self.kernel = self._kernel_funcs(self.dmat/self.bandwidth)
    if self.trunc:
        mask = np.repeat(self.bandwidth, len(self.data), axis=1)
        self.kernel[(self.dmat >= mask)] = 0
def pairwise_between_groups(fullsplit,i_own,split_lens,CDR3_similarity_cutoff):
    """Cluster candidates between group ``i_own`` and all later groups.

    Rows are compared with the Hamming distance.  Positions that differ
    only because exactly one side is infinite are discounted, and the
    result is normalised by the smaller count of finite entries of the two
    rows.  Pairs whose normalised distance is below
    ``1 - CDR3_similarity_cutoff`` are grouped.

    Returns a list with one set per row of group ``i_own``; each set holds
    the global index of that row plus the global indices of its matches.
    """
    own = fullsplit[i_own]
    last = len(split_lens) - 1
    if i_own != last:
        # not the last group: stack all groups being compared against
        fso = np.vstack(fullsplit[i_own + 1:last + 1])
    else:
        fso = fullsplit[last]

    mismatch = cdist(own, fso, 'hamming').flatten()
    inf_mismatch = cdist(np.isinf(own), np.isinf(fso), 'hamming').flatten()

    # smaller number of finite entries for every (own row, other row) pair,
    # in the same row-major order as the flattened distance matrices
    finite_own = np.isfinite(own).sum(axis=1)
    finite_others = np.isfinite(fso).sum(axis=1)
    shortest = np.minimum.outer(finite_own, finite_others).ravel().astype(float)

    normalised = (mismatch - inf_mismatch) / shortest * fullsplit[0].shape[1]
    is_close = (normalised < (1 - CDR3_similarity_cutoff)).reshape(
        own.shape[0], fso.shape[0])

    # given the boolean array, collect cluster members row-wise
    first_own = sum(split_lens[:i_own])
    col_offset = first_own + own.shape[0]
    sets = []
    for cnt, row in enumerate(is_close):
        members = set(np.add(np.nonzero(row)[0], col_offset))
        members.add(cnt + first_own)
        sets.append(members)
    return sets
def computeScipySimilarity(Xs1, Xs2, sparse=False):
    """Row-by-row distances between two equally shaped feature matrices.

    Row i of ``Xs1`` is compared with row i of ``Xs2`` under the cosine,
    cityblock, hamming and euclidean metrics.

    Parameters
    ----------
    Xs1, Xs2 : ndarray or scipy sparse matrix, shape (n_rows, n_features)
    sparse : bool
        Set when the inputs are sparse; they are then densified first (and
        their shapes are printed).

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns 'cosine', 'cityblock',
        'hamming' and 'euclidean'; undefined distances (NaN) are replaced
        by 0.0.

    Notes
    -----
    Prints the NaN count before and after the replacement and the Spearman
    correlation between the four distance columns.
    """
    metrics = ['cosine', 'cityblock', 'hamming', 'euclidean']
    Xall_new = np.zeros((Xs1.shape[0], len(metrics)))
    if sparse:
        print(Xs1.shape)
        print(Xs2.shape)
        Xs1 = np.asarray(Xs1.todense())
        Xs2 = np.asarray(Xs2.todense())

    for i, (a, b) in enumerate(zip(Xs1, Xs2)):
        # cdist wants 2-D input: treat each row as a single observation
        a = a.reshape(-1, a.shape[0])
        b = b.reshape(-1, b.shape[0])
        for col, metric in enumerate(metrics):
            # cdist returns a 1x1 matrix; store its only element
            Xall_new[i, col] = cdist(a, b, metric)[0, 0]

    Xall_new = pd.DataFrame(Xall_new, columns=metrics)
    # single-argument print() renders identically on Python 2 and 3
    print("NA: %s" % Xall_new.isnull().values.sum())
    Xall_new = Xall_new.fillna(0.0)
    print("NA: %s" % Xall_new.isnull().values.sum())
    print(Xall_new.corr(method='spearman'))
    return Xall_new
def compute_bic(kmeans, X):
    """
    Computes the BIC metric for a given clustering

    Parameters:
    -----------------------------------------
    kmeans:  fitted clustering object exposing ``cluster_centers_``,
             ``labels_`` and ``n_clusters`` (e.g. scikit-learn KMeans)
    X     :  two-dimensional np array of data points, one row per point

    Returns:
    -----------------------------------------
    BIC value (all logarithms are base 10)

    Notes:
    -----------------------------------------
    The per-cluster variance is the summed squared Euclidean distance to
    the centre divided by ``n_i - m`` (cluster size minus number of
    clusters); when that difference is zero the sum is multiplied by 1e20
    instead.
    """
    centers = kmeans.cluster_centers_
    labels = kmeans.labels_
    # number of clusters
    m = kmeans.n_clusters
    # size of the clusters; minlength keeps one entry per cluster even when
    # the highest-numbered clusters are empty
    n = np.bincount(labels, minlength=m)
    # size of data set
    N, d = X.shape

    # compute variance for all clusters beforehand
    cl_var = []
    for i in range(m):
        sq_sum = (distance.cdist(X[np.where(labels == i)], [centers[i]],
                                 'euclidean') ** 2).sum()
        if n[i] - m != 0:
            cl_var.append((1.0 / (n[i] - m)) * sq_sum)
        else:
            cl_var.append(float(10**20) * sq_sum)

    const_term = 0.5 * m * np.log10(N)

    # explicit float division so that the result does not depend on the
    # Python version's integer-division rules
    BIC = np.sum([n[i] * np.log10(n[i]) -
                  n[i] * np.log10(N) -
                  ((n[i] * d) / 2.0) * np.log10(2 * np.pi) -
                  (n[i] / 2.0) * np.log10(cl_var[i]) -
                  ((n[i] - m) / 2.0) for i in range(m)]) - const_term

    return(BIC)
def proceed(self, x=None, z=None, der=None):
    """Piecewise polynomial covariance function.

    The distance between inputs, divided by ``ell``, is handed to
    ``self.pp`` (covariance) or ``self.dpp`` (length-scale derivative)
    together with ``j = floor(D / 2) + v + 1``.
    ``self.hyp = [log(ell), log(sf)]`` and ``self.para = [v]``.

    Parameters
    ----------
    x : ndarray, shape (n, D)
    z : ndarray of shape (m, D), the string 'diag', or None
        None: covariance of x with itself; 'diag': (n, 1) self covariances;
        array: cross covariance between x and z.
    der : int or None
        None returns the covariance; 0 and 1 the derivative with respect to
        ``log(ell)`` and ``log(sf)``; 2 returns zeros (no derivative with
        respect to the degree).

    Raises
    ------
    AssertionError
        If the degree ``v`` is not 0, 1, 2 or 3.
    Exception
        If ``der`` is not None, 0, 1 or 2.
    """
    ell = np.exp(self.hyp[0])        # characteristic length scale
    sf2 = np.exp(2. * self.hyp[1])   # signal variance
    v = self.para[0]                 # degree (v = 0,1,2 or 3 only)
    if np.abs(v - np.round(v)) < 1e-8:
        # remove numerical error from format of parameter
        v = int(round(v))
    assert(int(v) in range(4))       # Only allowed degrees: 0,1,2 or 3
    v = int(v)

    n, D = x.shape
    j = np.floor(0.5 * D) + v + 1

    # z may be an ndarray: never compare it with ``==``.
    if isinstance(z, str) and z == 'diag':
        A = np.zeros((n, 1))
    elif z is None:
        scaled = x / ell
        A = np.sqrt(spdist.cdist(scaled, scaled, 'sqeuclidean'))
    else:
        # cross covariances between data sets x and z
        A = np.sqrt(spdist.cdist(x / ell, z / ell, 'sqeuclidean'))

    if der is None:
        return sf2 * self.pp(A, j, v, self.func)
    if der == 0:
        # derivative wrt log(ell)
        return sf2 * self.dpp(A, j, v, self.func, self.dfunc)
    if der == 1:
        # derivative wrt log(sf)
        return 2. * sf2 * self.pp(A, j, v, self.func)
    if der == 2:
        # derivative wrt the order is not available
        return np.zeros_like(A)
    raise Exception("Wrong derivative entry in PiecePoly")
def radial_basis(x_l, x, epsilon, norm="euclidean"):
    """
    Radial basis evaluated on the pairwise distances between ``x_l`` and ``x``.

    Both inputs are wrapped in an outer array and transposed before the
    distances are computed with the ``norm`` metric; the distance matrix is
    then handed to ``_radial_basis`` together with ``epsilon``.
    """
    centres = np.array([x_l]).T
    points = np.array([x]).T
    r = cdist(centres, points, norm)
    return _radial_basis(r, epsilon)
def dataset3(K):
    """K-means on the rows of the module-level ``df`` labelled 6 or 7.

    The points are the (dim1, dim2) pairs of those rows.  Centroids start at
    K randomly drawn points and are updated for at most 50 iterations, or
    until they stop changing.

    Parameters
    ----------
    K : int
        Number of clusters.

    Returns
    -------
    WC_SSD : float
        Sum of squared Euclidean distances of the points to their centroid.
    SC : float
        ``(B - A) / max(A, B)`` where ``A`` is the mean over clusters of the
        average within-cluster pairwise distance and ``B`` the mean over all
        ordered cluster pairs (including a cluster with itself) of the
        average pairwise distance.

    Raises
    ------
    IndexError
        If a cluster ends up without any point (for instance when the same
        starting point is drawn twice).
    """
    df_3 = df[(df['label'] == 6) | (df['label'] == 7)].reset_index(drop=True)
    vertex = list(zip(df_3['dim1'], df_3['dim2']))
    centroid_index = np.random.randint(0, len(df_3), size=K)
    centroid = [vertex[x] for x in centroid_index]

    for iteration in range(50):
        centroid_old = centroid
        # assignment step: nearest centroid for every point
        sq_dist = np.square(distance.cdist(vertex, centroid))
        cluster = np.argmin(sq_dist, axis=1)
        clust_points = {num: [vertex[i] for i in np.where(cluster == num)[0]]
                        for num in range(K)}

        # update step: per-coordinate mean of the members
        centroid = []
        for num in range(K):
            members = clust_points[num]
            coords = list(zip(*members))
            centroid.append((sum(coords[0]) / len(members),
                             sum(coords[1]) / len(members)))

        if centroid == centroid_old:
            break

    # within-cluster sum of squared distances
    WC_SSD = sum(np.square(distance.euclidean(centroid[num], point))
                 for num in range(K)
                 for point in clust_points[num])

    # average pairwise distance over every ordered pair of clusters
    B = np.mean([np.mean(distance.cdist(clust_points[i], clust_points[j], 'euclidean'))
                 for i in range(K) for j in range(K)])
    # average pairwise distance inside each cluster
    A = np.mean([np.mean(distance.cdist(clust_points[i], clust_points[i], 'euclidean'))
                 for i in range(K)])
    SC = (B - A) / max(A, B)
    return WC_SSD, SC
def get_gaussian_kernel(X, Y, sigma):
    """Return the kernel matrix ``exp(-sigma * ||x - y||**2)``.

    Entry ``[i, j]`` is computed from the Euclidean distance between row
    ``i`` of ``X`` and row ``j`` of ``Y``.
    """
    pairwise = cdist(X, Y, 'euclidean')
    return np.exp(-sigma * np.square(pairwise))
def time_cdist(self, num_points, metric):
    """Benchmark body: run ``distance.cdist`` of ``self.points`` with itself.

    ``metric`` is forwarded to ``cdist``; ``num_points`` is accepted but not
    read here. The distance matrix is discarded and nothing is returned.
    """
    pts = self.points
    distance.cdist(pts, pts, metric)
def main(argv):
    """Cluster descriptors with cosine k-means and print quality statistics.

    Steps: parse the command line, load descriptors and ids, reduce the
    descriptors with PCA, cluster them with NLTK's ``KMeansClusterer`` using
    cosine distance, print the cosine distortion and silhouette score, list
    the five nearest medoids of every medoid, and finally print summary
    statistics of a per-cluster label-purity metric.

    Args:
        argv: Accepted for the usual ``main(argv)`` shape.
            NOTE(review): it is not forwarded -- ``parse_args()`` reads
            ``sys.argv``; confirm this is intended.

    Returns:
        0 on completion.
    """
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', help='input file', required=True)
    parser.add_argument('-ids', '--ids_file', help='ids file', required=True)
    parser.add_argument('-n_components', '--n_components',
                        help='number of components in pca', required=True)
    parser.add_argument('-k', '--k', help='k of kmeans', required=True)
    ARGS = parser.parse_args()
    n_clusters = int(ARGS.k)

    descriptors = load_dataset(ARGS.input_file)
    ids_list, news_groups = get_hash_ids(ARGS.ids_file)

    print("PCA")
    pca = PCA(n_components=int(ARGS.n_components))
    descriptors = pca.fit_transform(descriptors)

    print("Kmeans")
    kclusterer = KMeansClusterer(n_clusters,
                                 distance=nltk.cluster.util.cosine_distance)
    predictions = np.array(
        kclusterer.cluster(descriptors, assign_clusters=True))
    cluster_centers_ = np.array(kclusterer.means())

    print("Distortions")
    # Mean cosine distance from each descriptor to its nearest centre.
    distortion_cos = sum(
        np.min(distance.cdist(descriptors, cluster_centers_, 'cosine'),
               axis=1)) / descriptors.shape[0]

    print("Silhouettes")
    silhouette_score_cos = metrics.silhouette_score(descriptors, predictions,
                                                    metric='cosine')
    print("COS K:", ARGS.k, "distortion:", distortion_cos,
          "silhouette score:", silhouette_score_cos)

    # Medoid of a cluster = the descriptor closest to its centre.
    closest, _ = pairwise_distances_argmin_min(cluster_centers_, descriptors)
    medoids_ids = ids_list[closest]
    medoids = descriptors[closest]
    dist = distance.cdist(medoids, medoids, metric='cosine')
    # Five nearest medoids: take the six smallest distances per row and
    # drop the first column (the medoid itself, at distance 0).
    knns = dist.argsort(axis=1)[:, :6][:, 1:]
    for id_, knn in zip(medoids_ids, knns):
        print("\nMedoid id:", id_, "label:", news_groups[id_])
        print("Cercanos:")
        for nn in knn:
            print("\t id:", medoids_ids[nn], "labels:",
                  news_groups[medoids_ids[nn]])

    metric = []
    # Iterate over the requested number of clusters instead of a fixed 225.
    for i in range(n_clusters):
        ids_l = ids_list[np.where(predictions == i)]
        clusters_labels = []
        for id_l in ids_l:
            clusters_labels.extend(news_groups[id_l])
        if not clusters_labels:
            # An empty cluster has no dominant label; skip it rather than
            # crash on ``con[0]`` below.
            continue
        uni, con = np.unique(np.array(clusters_labels), return_counts=True)
        # Sort labels by decreasing frequency.
        ind = np.argsort(con)[::-1]
        uni = uni[ind]
        con = con[ind]
        maxim = con.sum()
        cont = con[0]
        label = uni[0]
        uni = uni[1:]
        con = con[1:]
        # Add the counts of every other label that contains one of the
        # dot-separated parts of the dominant label (each label once).
        marker = np.zeros(uni.shape)
        for s in label.split('.'):
            for j in range(uni.shape[0]):
                if marker[j] == 0 and s in uni[j]:
                    cont += con[j]
                    marker[j] = 1
        metric.append(cont / maxim)

    # ``np.float`` was removed from NumPy; the builtin ``float`` is the
    # same dtype (float64).
    metric = np.array(metric, dtype=float)
    print("mean:", metric.mean())
    print("std:", metric.std())
    print("median:", np.median(metric))
    print("Min:", np.min(metric))
    print("Max:", np.max(metric))
    return 0
def graph():
    """Show, for every valid question, the 25 nearest test images.

    Loads question texts, conditioned posebytes, validation/test annotations
    and embeddings from disk, computes the distance from every conditional
    embedding to every test embedding, and for each conditional embedding
    whose question index is in ``valid_bits`` draws a 5x5 grid of the nearest
    images. Each image gets a green frame when its annotated bit equals the
    conditioned answer and a red frame otherwise.
    """
    #
    # Pb Filter
    #
    # Bit indices are parsed from the second '_'-separated field of each
    # mask file name.
    mask_filenames = glob(PathManager.path_valid_masks + 'mask*.npy')
    valid_bits = np.sort(
        np.unique([
            int(name.replace('\\', '/').split('/')[-1].split('_')[1])
            for name in mask_filenames
        ]))
    #
    # Questions
    #
    reldist_filter = np.load(
        PathManager.path_questions_hamming_reldistance_keep_bit_idxs)
    questions = np.concatenate([
        np.load(PathManager.path_questions_hamming_angles),
        np.load(PathManager.path_questions_hamming_distances),
        np.load(
            PathManager.path_questions_hamming_reldistances)[reldist_filter]
    ])
    #
    # Posebyte
    #
    posebyte_conditional = np.load('../posebytes/posebyte_conditioned.npy')
    angles_val = np.load(PathManager.path_annotations_hamming_valtest_angle)
    distances_val = np.load(
        PathManager.path_annotations_hamming_valtest_distance)
    reldistances_val = np.load(
        PathManager.path_annotations_hamming_valtest_reldistance)
    # The first 1919 rows are dropped here, from the test embeddings and
    # from the image list below, so the three stay aligned.
    # NOTE(review): presumably 1919 is the size of the validation split --
    # confirm.
    posebyte_valtest = np.concatenate((
        angles_val,
        distances_val,
        reldistances_val,
    ), axis=1)[1919:]
    #
    # Embeddings
    #
    embedding_conditional = np.load('../embeddings/embeddings_conditional.npy')
    embedding_test = np.load(
        '../../image/hamming/embeddings/embeddings_valtest_0.npy')[1919:]
    #
    # Distances
    #
    distances = cdist(embedding_conditional, embedding_test)
    # Per conditional embedding: test indices ordered by increasing distance.
    nearest_indices = np.argsort(distances, axis=1)
    #
    # Display
    #
    output_path = 'predictions/'
    root_img_dir = PathManager.path_image_root
    sequence_file = PathManager.path_dataset_valtest_txt
    with open(sequence_file, 'r') as in_file:
        label_lines = in_file.readlines()
    image_list = [x.strip() for x in label_lines]
    # Each entry becomes [directory + '/', <last 16 space-separated fields>].
    image_list = [[' '.join(x.strip().split(' ')[:-16]) + '/'] +
                  x.strip().split(' ')[-16:] for x in image_list]
    image_list = image_list[1919:]
    for anno_idx, anno in enumerate(embedding_conditional):
        # Two consecutive conditional rows share one question index.
        question_idx = int(anno_idx / 2)
        answer = posebyte_conditional[anno_idx, question_idx]
        if question_idx in valid_bits:
            pass
        else:
            continue
        answer = bool(answer)
        question = str(question_idx) + ': ' + str(questions[question_idx])
        question = question.replace('angle:', 'is bent:')
        question = question.replace('distance:', 'is near:')
        question = question.replace('beyond:', 'is beyond:')
        question = question + '? ' + str(answer)
        # NOTE(review): output_file_name is built but never used below.
        output_file_name = output_path + question + '.png'
        nearest = nearest_indices[anno_idx]
        fig = plt.figure()
        fig.set_size_inches(8.0, 8.0)
        for frame_idx in range(25):
            near_idx = nearest[frame_idx]
            image_name = root_img_dir + image_list[near_idx][0] + image_list[
                near_idx][1].split('_')[1] + '.png'
            axes = fig.add_subplot(5, 5, frame_idx + 1)
            # Green frame: the image's bit agrees with the answer; red: not.
            if posebyte_valtest[near_idx, question_idx] == answer:
                for spine in axes.spines.values():
                    spine.set_edgecolor('green')
                    spine.set_linewidth(8)
            else:
                for spine in axes.spines.values():
                    spine.set_edgecolor('red')
                    spine.set_linewidth(8)
            image_to_show = imread(image_name)
            plt.suptitle(question, fontsize=16)
            plt.imshow(imresize(image_to_show, (288, 288)))
            plt.setp(axes.get_xticklabels(), visible=False)
            plt.setp(axes.get_yticklabels(), visible=False)
        plt.show()
def update(self, rects):
    """Match this frame's bounding boxes to the tracked objects.

    ``rects`` is a sequence of ``(startX, startY, endX, endY)`` boxes. Each
    box is reduced to its integer centre. Existing objects are paired with
    the nearest centre (closest pairs first); unmatched objects have their
    ``disappeared`` counter incremented and are deregistered once it exceeds
    ``self.maxDisappeared``; unmatched centres are registered as new objects
    when there are more centres than objects.

    Returns ``self.objects``.
    """
    # No detections at all: age every tracked object and return early.
    if len(rects) == 0:
        for objectID in list(self.disappeared.keys()):
            self.disappeared[objectID] += 1
            if self.disappeared[objectID] > self.maxDisappeared:
                self.deregister(objectID)
        return self.objects

    # One integer (x, y) centre per bounding box.
    inputCentroids = np.zeros((len(rects), 2), dtype="int")
    for idx, box in enumerate(rects):
        (startX, startY, endX, endY) = box
        cX = int((startX + endX) / 2.0)
        cY = int((startY + endY) / 2.0)
        print("cX : " + str(cX) + ", cY : " + str(cY))
        inputCentroids[idx] = (cX, cY)

    # Nothing tracked yet: every centre becomes a new object.
    if len(self.objects) == 0:
        for centroid in inputCentroids:
            self.register(centroid)
        return self.objects

    objectIDs = list(self.objects.keys())
    objectCentroids = list(self.objects.values())

    # D[r, c] = distance between tracked object r and input centre c.
    D = dist.cdist(np.array(objectCentroids), inputCentroids)

    # Visit objects in order of their best available distance, each paired
    # with the input centre closest to it.
    row_order = D.min(axis=1).argsort()
    col_order = D.argmin(axis=1)[row_order]

    usedRows = set()
    usedCols = set()
    for row, col in zip(row_order, col_order):
        # Each object and each centre may take part in one match only.
        if row in usedRows or col in usedCols:
            continue
        matched_id = objectIDs[row]
        self.objects[matched_id] = inputCentroids[col]
        self.disappeared[matched_id] = 0
        usedRows.add(row)
        usedCols.add(col)

    unusedRows = set(range(0, D.shape[0])).difference(usedRows)
    unusedCols = set(range(0, D.shape[1])).difference(usedCols)

    if D.shape[0] >= D.shape[1]:
        # At least as many objects as centres: the unmatched objects may
        # have disappeared.
        for row in unusedRows:
            missing_id = objectIDs[row]
            self.disappeared[missing_id] += 1
            if self.disappeared[missing_id] > self.maxDisappeared:
                self.deregister(missing_id)
    else:
        # More centres than objects: the leftovers are new objects.
        for col in unusedCols:
            self.register(inputCentroids[col])

    return self.objects
def average_linkage(self, cluster1, cluster2):
    """Return the mean Euclidean distance over all cross-cluster pairs."""
    return cdist(cluster1, cluster2, 'euclidean').mean()
def single_linkage(self, cluster1, cluster2):
    """Return the smallest Euclidean distance over all cross-cluster pairs."""
    return cdist(cluster1, cluster2, 'euclidean').min()
def test_live(message):
    """Handle one camera frame: detect faces and index the emotion record.

    ``message['data']`` is a base64-encoded image. Faces wider than 100
    pixels are embedded and compared (cosine distance) against ``embedded``;
    only when the nearest entry of ``data_Y`` is 0 are emotion, head
    orientation, eye and mouth ratios computed, stored in Elasticsearch and
    emitted as a ``camera_update`` event. Any exception raised during
    processing is printed and swallowed.
    """
    # The payload goes through app.queue and is read straight back.
    app.queue.put(message['data'])
    img_bytes = base64.b64decode(app.queue.get())
    img = np.array(Image.open(io.BytesIO(img_bytes)))
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    try:
        rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        boxes_c, landmarks = mtcnn_detector.detect(img)
        for i in range(boxes_c.shape[0]):
            bbox = boxes_c[i, :4]
            # we dont want too small
            if (int(bbox[2]) - int(bbox[0])) > 100:
                # only detect husein
                cropped = rgb[int(bbox[1]):int(bbox[3]),
                              int(bbox[0]):int(bbox[2])]
                predicted = face_sess.run(face_model.logits,
                                          feed_dict={face_model.X:
                                                     cropped})[0]
                # Label of the stored embedding nearest to this face.
                person = data_Y[np.argmin(
                    cdist(embedded, [predicted], 'cosine')[:, 0])]
                if person == 0:
                    cropped = gray[int(bbox[1]):int(bbox[3]),
                                   int(bbox[0]):int(bbox[2])]
                    predicted = emotion_sess.run(emotion_model.logits,
                                                 feed_dict={
                                                     emotion_model.X:
                                                     np.expand_dims(
                                                         cropped, 2)
                                                 })[0]
                    emotion = np.argmax(predicted)
                    # All four box coordinates are shifted by -10 before
                    # landmark prediction.
                    shape = predictor(
                        gray,
                        dlib.rectangle(
                            int(bbox[0]) - 10,
                            int(bbox[1]) - 10,
                            int(bbox[2]) - 10,
                            int(bbox[3]) - 10))
                    shape = shape_to_np(shape)
                    roll, pitch, yaw = face_orientation(
                        img, shape[[33, 8, 36, 45, 48, 54]])
                    left_eye = shape[42:48]
                    right_eye = shape[36:42]
                    left_EAR = eye_aspect_ratio(left_eye)
                    right_EAR = eye_aspect_ratio(right_eye)
                    ear = (left_EAR + right_EAR) / 2.0
                    # Rescale with the (low, high) bounds held in ``eyes``
                    # and, below, ``mouths``.
                    ear = (ear - eyes[0]) / (eyes[1] - eyes[0])
                    MOUTH = mouth_aspect_ratio(shape[[61, 67, 63, 65, 60, 64]])
                    MOUTH = (MOUTH - mouths[0]) / (mouths[1] - mouths[0])
                    # Local time minus eight hours.
                    # NOTE(review): presumably a conversion to UTC -- confirm.
                    current_time = (
                        datetime.now() -
                        timedelta(hours=8)).strftime("%Y-%m-%d %H:%M:%S")
                    doc = {
                        'emotion': labels_emotion[emotion],
                        'roll': roll,
                        'pitch': pitch,
                        'yaw': yaw,
                        'mouth': MOUTH,
                        'eyes': ear,
                        'datetime': current_time.replace(' ', 'T')
                    }
                    # The timestamp string doubles as the document id.
                    es.index(index="huseinhouse_emotion",
                             doc_type='face',
                             id=current_time,
                             body=doc)
                    emit('camera_update', {'data': doc}, broadcast=True)
    except Exception as e:
        print(e)
        pass
f"document in {matched_preprint_published_pairs.document.tolist()}" ).set_index("document").reindex( matched_preprint_published_pairs.document.tolist()).fillna(0)) biorxiv_documents.head() pmc_documents = (pmc_embed_df.query( f"document in {matched_preprint_published_pairs.pmcid.tolist()}" ).set_index("document").reindex( matched_preprint_published_pairs.pmcid.tolist()).drop("journal", axis=1).fillna(0)) pmc_documents.head() # + published_date_distances = (matched_preprint_published_pairs.assign( doc_distances=np.diag( cdist(biorxiv_documents.values, pmc_documents.values, "euclidean"))).replace( 0, np.nan).dropna().query("doc_distances.notnull()")) published_date_distances.to_csv( "output/preprint_published_distances_rerun.tsv", sep="\t", index=False) print(published_date_distances.shape) published_date_distances.head() # - # # Construct Scatter Plot of Date vs Version Count # Preprints are delayed on an average of 51 days for each new version posted onto bioRxiv. This section regresses preprint's version counts against the time it takes to have a preprint published. A scatter and square bin plot are generated below. # + # Get smoothed linear regression line
def complete_linkage(self, cluster1, cluster2):
    """Return the largest Euclidean distance over all cross-cluster pairs."""
    return cdist(cluster1, cluster2, 'euclidean').max()
import time

import numpy as np
from scipy.spatial import distance
from scipy.spatial import distance_matrix
from scipy.spatial.distance import euclidean

# Load the images and flatten each one into a single feature vector.
data = np.load('data/digit2.npy')
sample_size = data.shape[0]
image_size = data.shape[1]
data = data.reshape(sample_size, image_size * image_size)
print(data.shape)
print(data[0].shape)


def Euclidean_distance(x, y, p=2):
    """Return the Minkowski-``p`` distance between ``x`` and ``y``.

    Both inputs are reshaped to a single row and passed to
    ``scipy.spatial.distance_matrix``, so the result is a 1x1 array.
    """
    x = x.reshape(1, -1)
    y = y.reshape(1, -1)
    return distance_matrix(x, y, p=p)


# Time the full pairwise Euclidean distance computation (the print of the
# matrix is inside the timed region).
start = time.time()
# Stored under its own name so the imported ``distance_matrix`` function
# used by ``Euclidean_distance`` is not overwritten by the result array.
pairwise_distances = distance.cdist(data, data, metric='euclidean')
print(pairwise_distances)
end = time.time()
print(end - start)
np.save('distance_matrix.npy', pairwise_distances)
id.append(int(row[0])) metadata.append([float(i) for i in row[1:]]) data[int(row[0])] = [] data[int(row[0])] = ([float(i) for i in row[1:]]) f.close() with open(data_filename) as f: position = [] for line in f: row = line.strip('\n').split(',') position.append([float(i) for i in row]) f.close() heap = [] dis = cdist(metadata, metadata, 'euclidean') for i in range(len(dis)): for j in range(i + 1, len(dis[0])): heapq.heappush(heap, [dis[i][j], (id[i], id[j])]) # hierarchical centroid = [] results = hierarchical(data, k, id, heap) clusters = [] for i in range(len(results)): centroid.append(data[results[i]]) cur_class = sorted([ int(j) for j in str(results[i]).replace('(', '').replace( ')', '').split(',') ]) clusters.append(cur_class) print clusters
import numpy as np
from scipy.spatial.distance import cdist
from time import time
import mrpt

# Generate synthetic test data
# Both sets are products of random (n x 5) and (5 x 100) matrices cast to
# float32: 1e5 data points and n_queries query points with 100 columns each.
k = 10; n_queries = 100
data = np.dot(np.random.rand(int(1e5),5), np.random.rand(5,100)).astype('float32')
queries = np.dot(np.random.rand(n_queries,5), np.random.rand(5,100)).astype('float32')

# Solve exact nearest neighbors with standard methods from scipy and numpy for reference
# Each *_time variable first holds the start timestamp and is then
# overwritten with the elapsed seconds.
exact_search_time = time()
exact_neighbors = np.zeros((n_queries, k))
for i in range(n_queries):
    # Sort all distances from query i to the data and keep the first k indices.
    exact_neighbors[i] = np.argsort(cdist([queries[i]], data))[0,:k]
exact_search_time = time() - exact_search_time

# Offline phase: Indexing the data. This might take some time.
indexing_time = time()
index = mrpt.MRPTIndex(data, depth=5, n_trees=100)
index.build()
indexing_time = time() - indexing_time

# Online phase: Finding nearest neighbors stupendously fast.
approximate_search_time = time()
approximate_neighbors = np.zeros((n_queries, k))
for i in range(n_queries):
    approximate_neighbors[i] = index.ann(queries[i], k, votes_required=4)
approximate_search_time = time() - approximate_search_time

# Print some stats
# Read the data from the CSV file using pandas. A raw string is used so the
# backslash in the Windows path is not parsed as an escape sequence.
dataIn = pd.read_csv(r'C:\Crime_Analysis.csv')
x1 = dataIn['Y']  # latitudes
y1 = dataIn['X']  # longitudes
data = np.array(list(zip(x1, y1))).reshape(len(x1), 2)

# K-means for an elbow plot: for each k, the distortion is the mean distance
# from every point to its nearest cluster centre.
distortions = []
# clusters in steps of 10
clusters = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
for k in clusters:
    # Fit once per k; ``fit`` returns the fitted estimator.
    kmean = KMeans(n_clusters=k).fit(data)
    nearest = np.min(cdist(data, kmean.cluster_centers_, 'euclidean'), axis=1)
    distortions.append(nearest.sum() / data.shape[0])

# Plotting elbow graph
plt.plot(clusters, distortions, 'bx-')
# setting the label for x-axis
plt.xlabel('clusters')
# setting the label for y-axis
plt.ylabel('distortion')
# setting the title
plt.title('plot to select the number of clusters')
plt.show()
def get_distances_between_keypoints(keypoints):
    """Return the matrix of pairwise ``cdist`` distances between keypoints.

    Entry ``[i, j]`` is the distance (default ``cdist`` metric) between
    keypoint ``i`` and keypoint ``j``.
    """
    pairwise = cdist(keypoints, keypoints)
    return pairwise
def closest_node(node, nodes):
    """Return the element of ``nodes`` nearest to ``node``.

    Distances are computed with ``distance.cdist`` (default metric); on ties
    the first minimum wins.
    """
    gaps = distance.cdist([node], nodes)
    return nodes[gaps.argmin()]
def profile_axis(profile, rasterize_factor, min_length=10):
    """Extract the central axis of a closed 2D profile and its thickness.

    The profile polygon is rasterized at ``rasterize_factor`` pixels per
    unit and skeletonized. The longest shortest-path between two skeleton
    nodes becomes the initial axis, further paths are appended to its ends
    while they stay away from the outline, and both ends are then trimmed
    unless fewer than ``min_length`` points would remain.

    Returns:
        (axis, thickness): the smoothed axis coordinates in the input
        coordinate system, and a thickness estimate in input units.
    """
    # Integer bounding box of the profile, padded by one unit on each side.
    [x0, y0], [x1, y1] = profile.min(axis=0), profile.max(axis=0)
    x0, y0, x1, y1 = int(x0) - 1, int(y0) - 1, int(x1) + 1, int(y1) + 1
    w, h = x1 - x0, y1 - y0
    img = Image.new("1", (w * rasterize_factor, h * rasterize_factor))
    draw = ImageDraw.Draw(img)
    # From here on ``profile`` is in raster (pixel) coordinates.
    profile = ((profile - [x0, y0]) * rasterize_factor).astype(int)
    draw.polygon([(x, y) for x, y in profile], fill=0, outline=1)
    # NOTE(review): ``outline`` is not used again in this function.
    outline = np.argwhere(np.array(img, dtype=bool).T)
    draw.polygon([(x, y) for x, y in profile], fill=1, outline=0)
    profile_mask = clean_mask(np.array(img, dtype=bool).T)
    img.close()
    skelet = skeletonize(profile_mask)
    # presumably square(3) is a 3x3 window of ones, so ``connectivity``
    # counts skeleton pixels in each neighbourhood, centre included --
    # TODO confirm.
    connectivity = convolve2d(skelet,
                              square(3),
                              mode="same",
                              boundary="fill",
                              fillvalue=False)
    # Nodes: skeleton pixels whose count is above 3 or exactly 2.
    nodes = (skelet & ((connectivity > 3) | (connectivity == 2)))
    rcs = np.argwhere(skelet)
    idxs_nodes = set(np.where(nodes[rcs[:, 0], rcs[:, 1]])[0].tolist())
    # Chebyshev distance between all skeleton pixels; a value of 1 means
    # the two pixels touch and gives an edge of the graph.
    d = np.abs(
        np.dstack((rcs[:, 0, None] - rcs[None, :, 0],
                   rcs[:, 1, None] - rcs[None, :, 1]))).max(axis=2)
    G = nx.from_numpy_matrix(d == 1)
    paths = []
    for i, j in combinations(idxs_nodes, 2):
        path = nx.shortest_path(G, i, j)
        # Keep only paths that pass through no node besides their two ends.
        if len(idxs_nodes.intersection(path)) > 2:
            continue
        paths.append(rcs[path])
    # Shortest first, so ``pop()`` always yields the longest remaining path.
    paths = sorted(paths, key=lambda path: len(path))
    axis = np.array(paths.pop(), dtype=float)
    # Flip the axis when its last point is closer to (-x0, -y0) than its
    # first point.
    if ((axis[-1] / rasterize_factor - [-x0, -y0])**
            2).sum() < ((axis[0] / rasterize_factor - [-x0, -y0])**2).sum():
        axis = axis[::-1]
    # Threshold: half the mean of the larger axis-to-outline distances.
    d_min = cdist(axis, profile).min(axis=1)
    d_min = d_min[d_min > d_min.max() / 2].mean() / 2
    d_max = rasterize_factor * 2
    while paths:
        path = paths.pop()
        # Stop extending once a candidate path comes too close to the outline.
        if cdist(path, profile).min() < d_min:
            break
        # Distances between the end points of the candidate and of the axis.
        d1 = np.sqrt(((path[0] - axis[0])**2).sum())
        d2 = np.sqrt(((path[0] - axis[-1])**2).sum())
        d3 = np.sqrt(((path[-1] - axis[0])**2).sum())
        d4 = np.sqrt(((path[-1] - axis[-1])**2).sum())
        # Attach the path to whichever axis end one of its ends touches,
        # reversing it when needed to keep the point order continuous.
        if (d1 < d_max) and (d3 > d_max):
            axis = np.vstack((path[::-1], axis))
        elif (d2 < d_max) and (d4 > d_max):
            axis = np.vstack((axis, path))
        elif (d3 < d_max) and (d1 > d_max):
            axis = np.vstack((path, axis))
        elif (d4 < d_max) and (d2 > d_max):
            axis = np.vstack((axis, path[::-1]))
    # Trimming radius: mean of the larger axis-to-outline distances.
    d_min = cdist(axis, profile).min(axis=1)
    d_min = d_min[d_min > d_min.max() / 2].mean()
    # Trim the tail: drop axis points within d_min of the outline point
    # nearest to the last axis point; undo if the axis gets too short.
    axis0 = axis.copy()
    d = cdist([profile[np.argmin(cdist([axis[-1]], profile)[0])]], axis)[0]
    axis = axis[d > d_min]
    if axis.shape[0] < min_length:
        axis = axis0.copy()
    # Same trimming for the head of the axis.
    axis0 = axis.copy()
    d = cdist([profile[np.argmin(cdist([axis[0]], profile)[0])]], axis)[0]
    axis = axis[d > d_min]
    if axis.shape[0] < min_length:
        axis = axis0.copy()
    # Thickness: twice the axis-to-outline distance, averaged over the
    # larger values and converted back to input units.
    thickness = cdist(axis, profile).min(axis=1) * 2
    thickness = thickness[thickness > thickness.max() /
                          2].mean() / rasterize_factor
    # Back to input coordinates.
    axis = axis / rasterize_factor + [x0, y0]
    axis = smoothen_coords(axis)
    return axis, thickness
def _fill_with_mode(frame):
    """Return a copy of ``frame`` with NaNs replaced by each column's mode."""
    filled = frame.copy()
    for column in filled:
        filled[column] = filled[column].fillna(filled[column].mode()[0])
    return filled


def distance_matrix(data, numeric_distance="euclidean", categorical_distance="jaccard"):
    """
    Compute the pairwise distance attribute by attribute in order to account for different variables type:
    - Continuous
    - Categorical
    For ordinal values, provide a numerical representation taking the order into account.
    Categorical variables are transformed into a set of binary ones.
    If both continuous and categorical distance are provided, a Gower-like distance is computed and the numeric
    variables are all normalized in the process.
    If there are missing values, the mean is computed for numerical attributes and the mode for categorical ones.

    Missing values are filled on internal copies; the ``data`` frame passed
    in by the caller is left unchanged.

    Note: If weighted-hamming distance is chosen, the computation time increases a lot since it is not coded in C
    like other distance metrics provided by scipy.

    @params:
        - data = pandas dataframe to compute distances on.
        - numeric_distances = the metric to apply to continuous attributes.
          "euclidean" and "cityblock" available.
          Default = "euclidean"
        - categorical_distances = the metric to apply to binary attributes.
          "jaccard", "hamming", "weighted-hamming" and "euclidean" available.
          Default = "jaccard"
    @returns:
        - the distance matrix as a pandas DataFrame with NaN on the diagonal,
          or None (after printing a message) when a requested distance is
          not supported.
    """
    possible_continuous_distances = ["euclidean", "cityblock"]
    possible_binary_distances = [
        "euclidean", "jaccard", "hamming", "weighted-hamming"
    ]
    number_of_variables = data.shape[1]

    # Get the type of each attribute (Numeric or categorical): a column is
    # numeric only when every one of its values is a number.
    is_numeric = [
        all(isinstance(n, numbers.Number) for n in data.iloc[:, i])
        for i in range(number_of_variables)
    ]
    is_all_numeric = all(is_numeric)
    is_all_categorical = not any(is_numeric)
    is_mixed_type = not is_all_categorical and not is_all_numeric

    # Check the content of the distances parameter. The parenthesised form
    # of print works on both Python 2 and Python 3.
    if numeric_distance not in possible_continuous_distances:
        print("The continuous distance " + numeric_distance + " is not supported.")
        return None
    elif categorical_distance not in possible_binary_distances:
        print("The binary distance " + categorical_distance + " is not supported.")
        return None

    # Separate the data frame into categorical and numeric attributes,
    # normalize numeric data, and replace missing values with the column
    # mean (numeric) or mode (categorical).
    if is_mixed_type:
        number_of_numeric_var = sum(is_numeric)
        number_of_categorical_var = number_of_variables - number_of_numeric_var
        data_numeric = data.iloc[:, is_numeric]
        data_numeric = (data_numeric - data_numeric.mean()) / (
            data_numeric.max() - data_numeric.min())
        data_numeric = data_numeric.fillna(data_numeric.mean())
        data_categorical = _fill_with_mode(
            data.iloc[:, [not x for x in is_numeric]])
    elif is_all_numeric:
        data = data.fillna(data.mean())
    else:
        data = _fill_with_mode(data)

    # "Dummifies" categorical variables, or factorizes them for plain
    # hamming; weighted-hamming receives the categorical data as is.
    if not is_all_numeric and not (categorical_distance == 'hamming'
                                   or categorical_distance == 'weighted-hamming'):
        if is_mixed_type:
            data_categorical = pd.get_dummies(data_categorical)
        else:
            data = pd.get_dummies(data)
    elif not is_all_numeric and categorical_distance == 'hamming':
        if is_mixed_type:
            data_categorical = pd.DataFrame([
                pd.factorize(data_categorical[x])[0] for x in data_categorical
            ]).transpose()
        else:
            data = pd.DataFrame([pd.factorize(data[x])[0]
                                 for x in data]).transpose()

    if is_all_numeric:
        result_matrix = cdist(data, data, metric=numeric_distance)
    elif is_all_categorical:
        if categorical_distance == "weighted-hamming":
            result_matrix = weighted_hamming(data)
        else:
            result_matrix = cdist(data, data, metric=categorical_distance)
    else:
        result_numeric = cdist(data_numeric, data_numeric, metric=numeric_distance)
        if categorical_distance == "weighted-hamming":
            result_categorical = weighted_hamming(data_categorical)
        else:
            result_categorical = cdist(data_categorical,
                                       data_categorical,
                                       metric=categorical_distance)
        # Gower-like combination: average of the two matrices weighted by
        # the number of variables of each kind, computed on whole arrays.
        result_matrix = np.array(
            1.0 * (np.asarray(result_numeric) * number_of_numeric_var +
                   np.asarray(result_categorical) * number_of_categorical_var) /
            number_of_variables)

    # Fill the diagonal with NaN values
    np.fill_diagonal(result_matrix, np.nan)
    return pd.DataFrame(result_matrix)
def main():
    """Run the TKSA electrostatic energy workflow for one PDB file.

    Steps, all driven by the module-level ``arguments``:
      1. run the external Surfrace binary and read the per-residue SASA file;
      2. read the PDB file and collect the charged groups (N-terminus,
         charged side-chain atoms, C-terminus) into ``S``;
      3. when ``arg_e == 'TK'``, build the pairwise energy matrix ``E`` and
         write it, together with charges and pKa values, to ``E.dat``;
      4. compile and run the exact (``arg_s == 'EX'``) or Monte Carlo
         (``arg_s == 'MC'``) C program, showing a progress bar meanwhile;
      5. when ``arg_plot == 'yes'``, read ``out.dat``, draw a bar plot of the
         per-residue energies and save the figure and a text table;
      6. move the intermediate files into ``./aux``.

    Results are stored in module-level globals; nothing is returned.
    """
    global Q,E,S,Pk,e,T,pH,total_charged_residues,G,G0,indiv_data,Gqq,Q0
    file_pdb = arguments.f
    pH = np.float(arguments.arg_pH)
    T = np.float(arguments.arg_T)
    ##################################################################################################
    # runs the standalone version of ©Surfrace
    ##################################################################################################
    print 'Running SASA - ©Surfrace'
    cmd1 = 'echo 1' + arguments.f.name + ' 1.4 1| ./surfrace5_0_linux_64bit > SASA_'+os.path.splitext(arguments.f.name)[0]+'_all.trash' ## Runs the SASA program
    os.system(cmd1)
    try:
        file_sasa = open(os.path.splitext(arguments.f.name)[0] + '_residue.txt', 'r') ## Opens the file produced by the program above
    except (IOError) as errno:
        print ('I/O error - ** Check the files of SASA calculation - something went wrong **. %s' % errno)
        sys.exit()
    # One entry per residue: [residue name, raw area, area divided by the
    # reference area of that residue type].
    SASA_data=[]
    for line2 in file_sasa:
        list2 = line2.split()
        Area_norm = np.float(list2[2])/np.float(Area_residues[All_residues.index(list2[1])])
        if Area_norm >= 1.0:
            print "Warning - ** SASA greater than 1.0 **",list2[1],list2[0],list2[2],np.float(Area_residues[All_residues.index(list2[1])]),Area_norm
            print "Automatically changed to 0.75"
            Area_norm = 0.750000000001
        SASA_data.append([list2[1],list2[2],Area_norm])
    indiv_data=[]
    S=[]
    SAij=[]
    total_atoms=[]
    total_residues=[]
    total_charged_residues=[]
    for line in file_pdb: ## Reading file.pdb
        lista = line.split()
        id = lista[0]
        if id == 'ATOM':
            # Fields are taken by whitespace position, not by fixed columns.
            atom_index = np.int(lista[1])
            atom_type = lista[2]
            residue_type = lista[3]
            chain = lista[4]
            residue_index = np.int(lista[5])
            total_atoms.append([atom_index])
            if atom_type == 'CA' and chain == 'A':
                total_residues.append([residue_index])
            # N-terminus: first atom of chain A when its residue is not
            # itself a charged residue.
            if atom_index == 1 and atom_type == 'N' and chain == 'A' and residue_index == 1 and not residue_type in Charged_residues: ## Select the charged residues
                total_charged_residues.append([atom_index])
                S.append(['N_T',residue_index,np.size(total_charged_residues),lista[1],lista[2],np.float(lista[6]),np.float(lista[7]),np.float(lista[8]),PKA[Charged_residues.index('N_TER')],SASA_data[np.size(total_residues)-1][2],Charge_values[Charged_residues.index('N_TER')]])
            if residue_type in Charged_residues and atom_type in Charged_atoms: ## Select the charged residues
                total_charged_residues.append([atom_index])
                S.append([lista[3],residue_index,np.size(total_charged_residues),lista[1],lista[2],np.float(lista[6]),np.float(lista[7]),np.float(lista[8]),PKA[Charged_residues.index(residue_type)],SASA_data[np.size(total_residues)-1][2],Charge_values[Charged_residues.index(residue_type)]])
            # C-terminus: OXT atom of chain A on a non-charged residue.
            if atom_type == 'OXT' and chain == 'A' and not residue_type in Charged_residues:
                total_charged_residues.append([atom_index])
                S.append(['C_T',residue_index,np.size(total_charged_residues),lista[1],lista[2],np.float(lista[6]),np.float(lista[7]),np.float(lista[8]),PKA[Charged_residues.index('C_TER')],SASA_data[np.size(total_residues)-1][2],Charge_values[Charged_residues.index('C_TER')]])
    print "There are: %d Charged_residues" % np.size(total_charged_residues)
    # Column views of S: name, coordinates, pKa, normalized SASA, charge.
    Restype=np.asarray([i[0] for i in S])
    X=np.asarray([i[5] for i in S])
    Y=np.asarray([i[6] for i in S])
    Z=np.asarray([i[7] for i in S])
    Pk=np.asarray([i[8] for i in S])
    SA=np.asarray([i[9] for i in S])
    Q=np.asarray([i[10] for i in S])
    # Three-letter residue names to one-letter codes.
    Restype=np.char.replace(np.char.replace(np.char.replace(np.char.replace(np.char.replace(Restype, 'HIS','H'), 'ASP','D'), 'ARG','R'), 'GLU','E'), 'LYS','K')
    # Centre the coordinates on their mean.
    X = X - np.mean(X)
    Y = Y - np.mean(Y)
    Z = Z - np.mean(Z)
    XYZ = zip(X,Y,Z)
    Origin = np.zeros(np.shape(XYZ))
    dist = distance.cdist(XYZ, XYZ, 'euclidean')
    if arguments.arg_e == 'TK':
        dist_origin = distance.cdist(XYZ, Origin, 'euclidean')
        # cdist 'cosine' is 1 - cos(angle), so 1 - angle below is the cosine.
        angle = distance.cdist(XYZ, XYZ, 'cosine')
        # Two radii derived from half the largest pairwise distance.
        raio = (np.max(dist)*0.5 + 3.4+2.0, np.max(dist)*0.5 + 2.0+2.0)
        # Invalid values and divisions by zero are silenced here and zeroed
        # in E further down.
        np.seterr(invalid='ignore')
        np.seterr(divide='ignore')
        theta = np.arccos(1-angle)
        NormA = np.matrix([LA.norm(v) for v in np.array(XYZ)])
        # rirj[i, j] = |r_i| * |r_j|
        rirj = np.array(np.dot(np.transpose(NormA),NormA))
        A = np.divide(raio[1],e[0]*dist)
        # B and C are Legendre series truncated at n = 59.
        B = (np.nansum(np.array([((e[1]-e[0])/(e[1]-(n*e[0])/(n+1)))*(np.power((rirj/(raio[1]*raio[1])),n))*(eval_legendre(n, np.cos(theta))) for n in range(0,60)]),axis=0))/(e[0])
        C = (np.divide(e[2],1+e[2]) + np.power(e[2],2)*np.sum(np.array([np.divide(np.divide(2*n+1,2*n-1)*np.divide(e[1],(n+1)*e[1]+n*e[0])*(np.power((rirj/(raio[0]*raio[0])),n))*(eval_legendre(n, np.cos(theta))),np.divide(Kn(n+1,e[2]),Kn(n-1,e[2])) + np.divide(n*(e[1]-e[0]),(n+1)*e[1]+n*e[0])*np.divide(np.power(e[2],2),4.0*np.power(n,2)-1)*np.power(np.divide(raio[1],raio[0]),2*n+1)) for n in range(1,60)]),axis=0))/(e[1])
        Qe = np.divide(e[3]*e[4]*e[4]*np.power(10,7),4*np.pi*e[5])
        # Pairwise mean of the normalized solvent accessibilities.
        SAij = distance.cdist(zip(SA), zip(SA), lambda u,v: (u+v)*0.5)
        E = Qe*(np.divide(A-B,2*raio[1])-np.divide(C,2*raio[0]))*(1-SAij)
        # NOTE(review): np.where(E<0) returns index arrays, so this sums
        # indices rather than counting negative entries -- confirm intent.
        if np.sum(np.where(E<0)) > 0:
            print '###############################################################'
            print "There are: %d negatives TK energy values - Please check the radius of TK method!" % np.int(np.sum(np.where(E<0)))
            print "Sugestion - Increase b radius"
            print "Current radius ratio b/a=", np.divide(raio[1],raio[0])
            print '###############################################################'
        E[np.isinf(E)]= 0
        E[np.isnan(E)]= 0
        # E.dat layout: charges row, the E matrix rows, then the pKa row.
        E_out=np.vstack([np.vstack([Q,E]),Pk])
        np.savetxt('E.dat',E_out)
    if arguments.arg_s == 'EX':
        print u"\U0001F63A", "### TK - Exact ###", u"\U0001F63A"
        start = time.time()
        # Compile the exact solver, then run it with pH and T as arguments.
        p = subprocess.Popen([r"c++","./src/tksaex.c",'-lm','-O3','-o','tksaex.exe'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False)
        p.communicate()
        p = subprocess.Popen(["./tksaex.exe",np.str(pH),np.str(T)], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False)
        # Progress bar: i grows to 20 and shrinks back while the solver runs.
        i=0
        j=1
        while p.poll() is None:
            sys.stdout.write('\r')
            sys.stdout.write("TKSA EX is running, please wait - [%20s%-20s]" % ('='*i,'='*i))
            sys.stdout.write(u"\U0001F63A")
            sys.stdout.flush()
            if i>19:
                j=j+1
            if j%2 == 0:
                i=i-1
            if j%2 == 1:
                i=i+1
            if i == 0:
                j=1
            sys.stdout.flush()
            time.sleep(0.1)
        output,err = p.communicate()
        print output
        print err
        end = time.time()
        elapsed = end - start
        print "Ran in %f sec" % elapsed
    if arguments.arg_s == 'MC':
        print u"\U0001F63A", "### TKSA - MC ###", u"\U0001F63A"
        start = time.time()
        # Compile the Monte Carlo solver, then run it with pH and T.
        p = subprocess.Popen([r"c++","./src/tksamc.c",'-lm','-O3','-o','tksamc.exe'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False)
        p.communicate()
        p = subprocess.Popen(["./tksamc.exe",np.str(pH),np.str(T)], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False)
        i=0
        j=1
        while p.poll() is None:
            sys.stdout.write('\r')
            sys.stdout.write("TKSA MC is running, please wait - [%20s%-20s]" % ('='*i,'='*i))
            sys.stdout.write(u"\U0001F63A")
            sys.stdout.flush()
            if i>19:
                j=j+1
            if j%2 == 0:
                i=i-1
            if j%2 == 1:
                i=i+1
            if i == 0:
                j=1
            sys.stdout.flush()
            time.sleep(0.1)
        output,err = p.communicate()
        print output
        print err
        end = time.time()
        elapsed = end - start
        print "Ran in %f sec" % elapsed
    if arguments.arg_plot == 'yes' and arguments.arg_s =='EX':
        try:
            file_plot = open("out.dat", 'r')
        except (IOError) as errno:
            print ('I/O error - ** Output file with issues - out.dat **. %s' % errno)
            sys.exit()
        plot_data=[]
        for line3 in file_plot: # Plotting
            list3 = line3.split()
            plot_data.append(list3)
        # Tick labels: residue code + zero-padded index; terminal entries
        # are renamed to CTR / NTR.
        Restype=np.char.replace(np.char.replace(["%s%02d" % t for t in zip(Restype,np.asarray([i[1] for i in S]))],'C_T'+np.str(S[-1][1]),'CTR'),'N_T0'+np.str(S[0][1]),'NTR')
        # Append the energies read from out.dat as extra column(s) of S.
        S=np.hstack((S,plot_data))
        plot_data=list(map(float, np.asarray(plot_data).flatten()))
        print "Total dG Energy: ",np.sum(np.asarray(plot_data))
        x_pos = np.arange(len(total_charged_residues))
        fig = plt.figure()
        ax = fig.add_subplot(111)
        width=1.0
        # Red bars: positive energy on a group with normalized SASA > 0.5.
        colors = []
        for position, value in enumerate(plot_data):
            if value > 0 and SA[position] > 0.5:
                colors.append('r')
            else:
                colors.append('b')
        ax.bar(x_pos, plot_data,width=width,color=colors,linewidth=2)
        ax.tick_params('both', length=5, width=2, which='major',labelsize=13)
        plt.setp(ax.spines.values(), linewidth=2)
        plt.xticks(x_pos+width/2.0,Restype,rotation=90,fontsize=15)
        plt.xlim([0,np.size(x_pos)])
        plt.ylabel(r'$\Delta G_{qq}$(kJ/mol)',fontsize=20)
        fig.savefig('Fig_EX_'+ os.path.splitext(arguments.f.name)[0]+'_pH_'+str(pH)+'_T_'+str(T)+'.jpg', dpi = 300)
        header='1-Name 2-Residue-index 3-Position 4-Atom 5-Atom-type 6-X 7-Y 8-Z 9-PKA 10-SASA 11-Charge 12-dG_Energy 13-Total_dG= '+str(np.sum(np.asarray(plot_data)))+''
        np.savetxt('Output_EX_'+os.path.splitext(arguments.f.name)[0]+'_pH_'+str(pH)+'_T_'+str(T)+'.dat',S,fmt='%s', delimiter=" ",header=str(header))
    if arguments.arg_plot == 'yes' and(arguments.arg_s =='MC'):
        try:
            file_plot = open("out.dat", 'r')
        except (IOError) as errno:
            print ('I/O error - ** Output file with issues - out.dat **. %s' % errno)
            sys.exit()
        plot_data=[]
        for line3 in file_plot: ## Plotting
            list3 = line3.split()
            plot_data.append(list3)
        Restype=np.char.replace(np.char.replace(["%s%02d" % t for t in zip(Restype,np.asarray([i[1] for i in S]))],'C_T'+np.str(S[-1][1]),'CTR'),'N_T0'+np.str(S[0][1]),'NTR')
        S=np.hstack((S,plot_data))
        plot_data=list(map(float, np.asarray(plot_data).flatten()))
        print "Total dG Energy: ",np.sum(np.asarray(plot_data))
        x_pos = np.arange(len(total_charged_residues))
        fig = plt.figure()
        ax = fig.add_subplot(111)
        width=1.0
        colors = []
        for position, value in enumerate(plot_data):
            if value > 0 and SA[position] > 0.5:
                colors.append('r')
            else:
                colors.append('b')
        ax.bar(x_pos, plot_data,width=width,color=colors,linewidth=2)
        ax.tick_params('both', length=5, width=2, which='major',labelsize=13)
        plt.setp(ax.spines.values(), linewidth=2)
        # Smaller tick font when there are more charged groups.
        if np.size(total_charged_residues)>35:
            plt.xticks(x_pos+width/2.0,Restype,rotation=90,fontsize=8)
        elif np.size(total_charged_residues) >= 15 and np.size(total_charged_residues) <= 35:
            plt.xticks(x_pos+width/2.0,Restype,rotation=90,fontsize=12)
        else:
            plt.xticks(x_pos+width/2.0,Restype,rotation=90,fontsize=15)
        plt.xlim([0,np.size(x_pos)])
        plt.ylabel(r'$\Delta G_{qq}$(kJ/mol)',fontsize=20)
        plt.show()
        fig.savefig('Fig_MC_'+ os.path.splitext(arguments.f.name)[0]+'_pH_'+str(pH)+'_T_'+str(T)+'.jpg', dpi = 300)
        header='1-Name 2-Residue-index 3-Position 4-Atom 5-Atom-type 6-X 7-Y 8-Z 9-PKA 10-SASA 11-Charge 12-dG_Energy 13-Total_dG= '+str(np.sum(np.asarray(plot_data)))+''
        np.savetxt('Output_MC_'+os.path.splitext(arguments.f.name)[0]+'_pH_'+str(pH)+'_T_'+str(T)+'.dat',S,fmt='%s', delimiter=" ",header=str(header))
    # Move intermediate files out of the working directory.
    cmd2 = 'mv result.txt *.exe E.dat out.dat SASA* '+os.path.splitext(arguments.f.name)[0]+'*.txt ./aux'
    os.system(cmd2)
    print u"\U0001F63A", "### Finished ###", u"\U0001F63A"
def dice_dist(profile1, keypoints1, profile2, keypoints2, r, rasterize_factor):
    """Accumulate squared, weighted Dice distances between keypoint windows.

    Both profiles are rasterised (polygon fill) into boolean masks. For every
    keypoint of profile 1 that has at least one keypoint of profile 2 closer
    than ``2 * r``, a square window of half-width ``r * rasterize_factor``
    pixels around it is compared with windows around each candidate keypoint
    of profile 2. Profile 2 is tried under a set of rotations in
    ``[-pi/8, pi/8]`` and shifts of -4..4 (step 2) in both axes, and the
    smallest distance found is kept for that keypoint.

    Parameters
    ----------
    profile1, profile2 : ndarray of shape (n_points, 2)
        Polygon vertices; the two columns are unpacked as x and y.
    keypoints1, keypoints2 : ndarray of shape (n_keypoints, 2)
        Keypoint coordinates in the same coordinate system as the profiles.
    r : float
        Window radius in profile units; rounded to the nearest integer.
    rasterize_factor : int
        Number of pixels per profile unit used when rasterising.

    Returns
    -------
    d_dist_sum : float
        Sum over contributing keypoints of the squared best Dice distance.
    d_dist_norm : int
        Number of keypoints of profile 1 that contributed to the sum.
    """

    def _make_mask(profile, keypoints, angle, r_r, rasterize_factor):
        # Rasterise the (optionally rotated) profile with a 4 * r_r pixel
        # margin; return the mask indexed as [x, y] together with the
        # keypoints moved into the same pixel frame.
        if angle != 0:
            profile = rotate_coords(profile, angle)
            keypoints = rotate_coords(keypoints, angle)
        # np.round(...).astype(int) yields fresh arrays, so the in-place
        # shifts below never touch the caller's data.
        profile = np.round(profile * rasterize_factor).astype(int)
        keypoints = np.round(keypoints * rasterize_factor).astype(int)
        x0, y0 = profile.min(axis=0) - 4 * r_r
        profile -= [x0, y0]
        keypoints -= [x0, y0]
        width, height = profile.max(axis=0) + 4 * r_r
        img = Image.new("1", (width, height))
        draw = ImageDraw.Draw(img)
        draw.polygon([(x, y) for x, y in profile], fill=1)
        # The image array is (rows=y, cols=x); transpose so axis 0 is x.
        mask = np.array(img, dtype=bool).T
        img.close()
        return mask, keypoints

    def _get_kp_mask(mask, keypoints, i, shift_x, shift_y, r_r,
                     rasterize_factor):
        # Cut the (2 * r_r + 1)-pixel square window centred on keypoint i
        # after moving it by (-shift_x, -shift_y) profile units.
        row, col = np.round(
            keypoints[i] - [shift_x * rasterize_factor,
                            shift_y * rasterize_factor]).astype(int)
        col0 = col - r_r
        col1 = col0 + 2 * r_r + 1
        row0 = row - r_r
        row1 = row0 + 2 * r_r + 1
        return mask[row0:row1, col0:col1]

    r = int(round(r))
    r_r = r * rasterize_factor

    # Radial weights: 1 at the window centre, falling quadratically to 0 at
    # distance r_r and beyond.
    w = np.ones((2 * r_r + 1, 2 * r_r + 1), dtype=float)
    ijs = np.argwhere(w > 0)
    radial = np.sqrt(((ijs - r_r)**2).sum(axis=1))
    radial[radial > r_r] = r_r
    w[ijs[:, 0], ijs[:, 1]] = ((r_r - radial) / r_r)**2

    # Angular step that moves a point at distance r by about one unit.
    # The float literal keeps this a true division even for integer r.
    rot_step = 2 * np.arcsin(1.0 / (2 * r))
    angle_min = -np.pi / 8
    angle_max = np.pi / 8
    angles = np.linspace(angle_min, angle_max,
                         int(round((angle_max - angle_min) / rot_step)))
    # Always try the unrotated profile first.
    angles = angles[angles != 0]
    angles = np.insert(angles, 0, 0)

    # The zero shift comes first, then the remaining grid in scan order.
    shifts = [[0, 0]] + [[shift_x, shift_y]
                         for shift_x in range(-4, 5, 2)
                         for shift_y in range(-4, 5, 2)
                         if (shift_x, shift_y) != (0, 0)]

    d_dist_sum = 0
    d_dist_norm = 0
    kp_dist = cdist(keypoints1, keypoints2)
    mask_m1, keypoints_m1 = _make_mask(profile1, keypoints1, 0, r_r,
                                       rasterize_factor)

    # The rasterised profile 2 depends only on the angle, not on the keypoint
    # of profile 1, so each rotation is built at most once (and only if some
    # keypoint actually needs it).
    rotated2 = {}

    for i in range(keypoints1.shape[0]):
        jj = np.where(kp_dist[i] < 2 * r)[0]
        if not jj.size:
            continue
        mask1 = _get_kp_mask(mask_m1, keypoints_m1, i, 0, 0, r_r,
                             rasterize_factor)
        mask1_sum = w[mask1].sum()
        d_dist_opt = np.inf
        for angle_idx, angle in enumerate(angles):
            if angle_idx not in rotated2:
                rotated2[angle_idx] = _make_mask(profile2, keypoints2, angle,
                                                 r_r, rasterize_factor)
            mask_m2, keypoints_m2 = rotated2[angle_idx]
            for j in jj:
                for shift_x, shift_y in shifts:
                    mask2 = _get_kp_mask(mask_m2, keypoints_m2, j, shift_x,
                                         shift_y, r_r, rasterize_factor)
                    # Weighted Dice distance: 1 - 2|A & B| / (|A| + |B|).
                    d_dist = 1 - (2 * w[mask1 & mask2].sum()) / (
                        mask1_sum + w[mask2].sum())
                    if d_dist < d_dist_opt:
                        d_dist_opt = d_dist
        if d_dist_opt < np.inf:
            d_dist_sum += d_dist_opt**2
            d_dist_norm += 1
    return d_dist_sum, d_dist_norm
def fit(self, target_mask, method='min_distance', r=5, n_exps=50,
        n_parcels=2, meta_estimator=SCALE, **kwargs):
    """
    Run CBP parcellation.

    For every voxel of the region of interest a set of study ids is
    selected, a meta-analysis is run on them, and the resulting maps are
    correlated across voxels. The correlation distances are then clustered
    with k-means once per requested number of parcels. Results are stored
    on the instance as ``self.solutions`` and ``self.metrics``.

    Parameters
    ----------
    target_mask : img_like
        Image with binary mask for region of interest to be parcellated.
        Its affine must equal that of ``self.mask``.
    method : {'min_distance', 'min_studies'}, optional
        How ids are selected for each voxel. 'min_distance' keeps every id
        with a coordinate closer than ``r``; 'min_studies' keeps the
        ``n_exps`` ids whose nearest coordinate is closest.
        Default is 'min_distance'.
    r : :obj:`float`, optional
        Distance threshold for 'min_distance', measured between the ijk
        coordinates and the voxel index. Default is 5.
    n_exps : :obj:`int`, optional
        Number of ids kept for 'min_studies'. Default is 50.
    n_parcels : :obj:`int` or array_like of :obj:`int`, optional
        Number of parcels to generate for ROI. If array_like, each parcel
        number will be evaluated and results for all will be returned.
        Default is 2.
    meta_estimator : callable, optional
        Called as ``meta_estimator(self.dataset, ids=..., **kernel_args)``;
        the returned object must provide ``fit`` and ``results['ale']``.
    **kwargs
        Keys starting with ``kernel__`` are passed unchanged to
        ``meta_estimator``; keys starting with ``meta__`` are passed to its
        ``fit`` with the prefix removed.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported options.
    """
    assert np.array_equal(self.mask.affine, target_mask.affine), \
        'target_mask and self.mask must share the same affine'
    if method not in ('min_studies', 'min_distance'):
        # Without this check an unknown method would only surface later as
        # a NameError on ``sel_ids``.
        raise ValueError('Unknown method "{0}"; expected "min_studies" or '
                         '"min_distance"'.format(method))

    kernel_args = {
        k: v
        for k, v in kwargs.items() if k.startswith('kernel__')
    }
    meta_args = {
        k.split('meta__')[1]: v
        for k, v in kwargs.items() if k.startswith('meta__')
    }

    # Accept a scalar as well as any array_like (list, tuple, ndarray).
    n_parcels = np.atleast_1d(n_parcels).tolist()

    # Step 1: Build correlation matrix
    target_data = apply_mask(target_mask, self.mask)
    target_map = unmask(target_data, self.mask)
    target_data = target_map.get_data()
    mask_idx = np.vstack(np.where(target_data))
    n_voxels = mask_idx.shape[1]
    voxel_arr = np.zeros((n_voxels, np.sum(self.mask)))

    ijk = self.coordinates[['i', 'j', 'k']].values
    temp_df = self.coordinates.copy()
    for i_voxel in range(n_voxels):
        voxel = mask_idx[:, i_voxel]
        # cdist requires two 2-D inputs, so the single voxel is promoted to
        # one row and the resulting single column is flattened again.
        temp_df['distance'] = cdist(ijk, voxel[np.newaxis, :])[:, 0]

        # Distance from the voxel to the nearest coordinate of each id.
        nearest = temp_df.groupby('id')[['distance']].min()
        if method == 'min_studies':
            # number of studies
            nearest = nearest.sort_values(by='distance')
            sel_ids = nearest.iloc[:n_exps].index.values
        else:
            # minimum distance
            sel_ids = nearest.loc[nearest['distance'] < r].index.values

        # Run MACM
        voxel_meta = meta_estimator(self.dataset, ids=sel_ids,
                                    **kernel_args)
        voxel_meta.fit(**meta_args)
        voxel_arr[i_voxel, :] = apply_mask(voxel_meta.results['ale'],
                                           self.mask)

    # Correlate voxel-specific MACMs across voxels in ROI
    voxel_corr = np.corrcoef(voxel_arr)
    corr_dist = 1 - voxel_corr

    # Step 2: Clustering
    labels = np.zeros((n_voxels, len(n_parcels)))
    metric_types = ['contiguous']
    metrics = pd.DataFrame(index=n_parcels, columns=metric_types,
                           data=np.zeros((len(n_parcels),
                                          len(metric_types))))
    for i_parc, n_clusters in enumerate(n_parcels):
        # K-Means clustering
        _, labeled, _ = k_means(corr_dist, n_clusters, init='k-means++',
                                precompute_distances='auto', n_init=1000,
                                max_iter=1023, verbose=False, tol=0.0001,
                                random_state=1, copy_x=True, n_jobs=1,
                                algorithm='auto', return_n_iter=False)
        labels[:, i_parc] = labeled

        # Check contiguity of clusters
        # Can nilearn do this?
        # NOTE(review): cluster label 0 is written as 0 here, so it looks
        # like it is indistinguishable from the empty background for the
        # component labelling below -- confirm this is intended.
        temp_mask = np.zeros(target_data.shape)
        for j_voxel in range(n_voxels):
            i, j, k = mask_idx[:, j_voxel]
            temp_mask[i, j, k] = labeled[j_voxel]
        components = meas.label(temp_mask, np.ones((3, 3, 3)))[0]
        n_contig = len(np.unique(components))
        metrics.loc[n_clusters, 'contiguous'] = int(
            n_contig > (n_clusters + 1))

    self.solutions = labels
    self.metrics = metrics
def update(self, rects):
    """Match new bounding boxes to tracked objects and update the tracker.

    Parameters
    ----------
    rects : sequence of (startX, startY, endX, endY)
        Bounding boxes detected in the current frame. May be empty.

    Returns
    -------
    The ``self.objects`` mapping of object id to centroid, after updating.

    Notes
    -----
    Each tracked object is paired with its nearest detection, closest pairs
    first. Each detection is used at most once, and pairs farther apart than
    ``self.maxDistance`` are rejected. Tracked objects left without a
    detection are aged, and deregistered once they have been missing for
    more than ``self.maxDisappeared`` consecutive updates. Detections left
    without an object are registered as new objects. Both kinds of leftover
    can occur in the same frame and both are always handled.
    """
    if len(rects) == 0:
        # Nothing detected: every tracked object has gone missing once more.
        # Iterate over a copy because deregister() mutates the mapping.
        for objectID in list(self.disappeared.keys()):
            self.disappeared[objectID] += 1
            if self.disappeared[objectID] > self.maxDisappeared:
                self.deregister(objectID)
        return self.objects

    # Use the bounding box coordinates to derive the centroids.
    inputCentroids = np.zeros((len(rects), 2), dtype="int")
    for (i, (startX, startY, endX, endY)) in enumerate(rects):
        cX = int((startX + endX) / 2.0)
        cY = int((startY + endY) / 2.0)
        inputCentroids[i] = (cX, cY)

    if len(self.objects) == 0:
        # Nothing is tracked yet, so every detection is a new object.
        for centroid in inputCentroids:
            self.register(centroid)
        return self.objects

    objectIDs = list(self.objects.keys())
    objectCentroids = list(self.objects.values())
    D = dist.cdist(np.array(objectCentroids), inputCentroids)

    # Visit tracked objects in order of their nearest-detection distance,
    # each paired with the index of that nearest detection.
    rows = D.min(axis=1).argsort()
    cols = D.argmin(axis=1)[rows]

    usedRows = set()
    usedCols = set()
    for (row, col) in zip(rows, cols):
        if row in usedRows or col in usedCols:
            continue
        if D[row, col] > self.maxDistance:
            continue
        objectID = objectIDs[row]
        self.objects[objectID] = inputCentroids[col]
        self.disappeared[objectID] = 0
        usedRows.add(row)
        usedCols.add(col)

    # Because of the distance gate and of several objects sharing the same
    # nearest detection, unmatched objects and unmatched detections can
    # coexist whatever the shape of D, so neither side may be skipped.
    # sorted() keeps the order in which ids are assigned deterministic.
    unusedRows = sorted(set(range(D.shape[0])).difference(usedRows))
    unusedCols = sorted(set(range(D.shape[1])).difference(usedCols))

    for row in unusedRows:
        objectID = objectIDs[row]
        self.disappeared[objectID] += 1
        if self.disappeared[objectID] > self.maxDisappeared:
            self.deregister(objectID)

    for col in unusedCols:
        self.register(inputCentroids[col])

    return self.objects
def __call__(self, X, Y=None, eval_gradient=False):
    """Evaluate the kernel and, optionally, its length-scale gradient.

    Parameters
    ----------
    X : array-like, promoted to 2-D
        Left argument of the kernel.
    Y : array-like, optional
        Right argument. When None the kernel of X with itself is returned
        as a square matrix with ones on the diagonal.
    eval_gradient : bool, optional
        Also return the gradient with respect to the kernel
        hyperparameters. Only allowed when Y is None.

    Returns
    -------
    K : ndarray
        Kernel values.
    K_gradient : ndarray
        Only returned when ``eval_gradient`` is true. It has a zero-length
        last axis when the length-scale hyperparameter is fixed.

    Raises
    ------
    ValueError
        If ``eval_gradient`` is true while Y is given.
    """
    X = np.atleast_2d(X)
    length_scale = _check_length_scale(X, self.length_scale)
    self_covariance = Y is None

    # Pairwise distances between length-scaled inputs: condensed form for
    # the self-covariance case, full matrix otherwise.
    if self_covariance:
        scaled_dist = pdist(X / length_scale, metric='euclidean')
    elif eval_gradient:
        raise ValueError(
            "Gradient can only be evaluated when Y is None.")
    else:
        scaled_dist = cdist(X / length_scale, Y / length_scale,
                            metric='euclidean')

    nu = self.nu
    if nu == 0.5:
        kern = np.exp(-scaled_dist)
    elif nu == 1.5:
        arg = scaled_dist * math.sqrt(3)
        kern = (1. + arg) * np.exp(-arg)
    elif nu == 2.5:
        arg = scaled_dist * math.sqrt(5)
        kern = (1. + arg + arg**2 / 3.0) * np.exp(-arg)
    else:
        # General case; expensive to evaluate. This branch deliberately
        # works in place on the distance array.
        kern = scaled_dist
        # Strict zeros would result in nan.
        kern[kern == 0.0] += np.finfo(float).eps
        arg = (math.sqrt(2 * nu) * kern)
        kern.fill((2**(1. - nu)) / gamma(nu))
        kern *= arg**nu
        kern *= kv(nu, arg)

    if self_covariance:
        # Expand the condensed vector into the full square matrix.
        kern = squareform(kern)
        np.fill_diagonal(kern, 1)

    if not eval_gradient:
        return kern

    if self.hyperparameter_length_scale.fixed:
        # Length scale kept fixed: there is nothing to differentiate.
        return kern, np.empty((X.shape[0], X.shape[0], 0))

    # Squared, length-scaled pairwise differences: one slice per dimension
    # when anisotropic, a single slice otherwise.
    if self.anisotropic:
        sq_diff = (X[:, np.newaxis, :] - X[np.newaxis, :, :])**2 \
            / (length_scale ** 2)
    else:
        sq_diff = squareform(scaled_dist**2)[:, :, np.newaxis]

    if nu == 0.5:
        grad = safe_divide(kern[..., np.newaxis] * sq_diff,
                           np.sqrt(sq_diff.sum(2))[:, :, np.newaxis])
        grad[~np.isfinite(grad)] = 0
    elif nu == 1.5:
        grad = \
            3 * sq_diff * np.exp(-np.sqrt(3 * sq_diff.sum(-1)))[..., np.newaxis]
    elif nu == 2.5:
        root = np.sqrt(5 * sq_diff.sum(-1))[..., np.newaxis]
        grad = 5.0 / 3.0 * sq_diff * (root + 1) * np.exp(-root)
    else:
        # No closed form is used here: approximate the gradient numerically.
        def kernel_of_theta(theta):
            return self.clone_with_theta(theta)(X, Y)

        return kern, _approx_fprime(self.theta, kernel_of_theta, 1e-10)

    if self.anisotropic:
        return kern, grad
    return kern, grad.sum(-1)[:, :, np.newaxis]
def is_at_goal(self, target_id, goal_id, state):
    """Report whether a target is near its goal and (almost) not moving.

    Parameters
    ----------
    target_id : int
        Row of ``state`` describing the target.
    goal_id : int
        Index into ``self.blueprint`` of the goal position.
    state : ndarray
        2-D array; the first two columns of a row are compared with the
        goal position, and the remaining columns are tested for stillness
        (presumably velocity components -- confirm against the caller).

    Returns
    -------
    close_to_goal : bool
        True when the target is less than 0.2 away from the goal.
    mostly_still : bool
        True when the summed magnitude of the remaining columns is
        below 0.01.
    """
    target = np.array([state[target_id, :2]])
    goal = np.array([self.blueprint[goal_id]])
    # Sum magnitudes rather than signed values: otherwise opposite-signed
    # components (e.g. +0.5 and -0.5) cancel and a moving target is
    # reported as still.
    mostly_still = np.sum(np.abs(state[target_id, 2:])) < 0.01
    # cdist returns a 1x1 matrix; index the element explicitly, because
    # calling float() on a non-0-d array is deprecated in recent NumPy.
    close_to_goal = float(cdist(target, goal)[0, 0]) < 0.2
    return close_to_goal, mostly_still
sess.run(inits) # Evaluation routine stat_hb1 = [] stat_icarl = [] stat_ncm = [] for i in range(int(np.ceil(len(files_from_cl) / batch_size))): sc, l, loss, files_tmp, feat_map_tmp = sess.run([ scores, label_batch, loss_class, file_string_batch, op_feature_map ]) mapped_prototypes = feat_map_tmp[:, 0, 0, :] pred_inter = (mapped_prototypes.T) / np.linalg.norm( mapped_prototypes.T, axis=0) sqd_icarl = -cdist(class_means[:, :, 0, itera].T, pred_inter.T, 'sqeuclidean').T sqd_ncm = -cdist(class_means[:, :, 1, itera].T, pred_inter.T, 'sqeuclidean').T stat_hb1 += ([ ll in best for ll, best in zip(l, np.argsort(sc, axis=1)[:, -top:]) ]) stat_icarl += ([ ll in best for ll, best in zip(l, np.argsort(sqd_icarl, axis=1)[:, -top:]) ]) stat_ncm += ([ ll in best for ll, best in zip(l,
# Build the k-means input from the shipped quantities: each quantity becomes
# a 2-D point (quantity, 0).
x = PC9_Shipment_Qty.PC9_Shipped_Qty
X = []
for i in range(len(x)):
    X.append([float(x[i]), 0]) # one [quantity, 0] row per entry
X = np.asarray(X) # convert the list of rows into a 2-D array

# Elbow test to determine k
# Run the K-Means algorithm for every k from 1 to 9 (range(1,10) stops
# before 10).
from scipy.cluster.vq import kmeans
K = range(1,10)
KM = [kmeans(X,k) for k in K]

# Determine the distance between each PC9 Size combination and all calculated Centroids
# kmeans returns a (centroids, distortion) pair; only the centroids are kept.
centroids = [cent for (cent,var) in KM]
D_k = [cdist(X, cent, 'euclidean') for cent in centroids]
# D_k holds the distance from every point to every centroid;
# keep only the distance to the nearest centroid (the MINIMUM) per point.
dist = [np.min(D,axis=1) for D in D_k]
# Store the mean nearest-centroid distance for each k.
# As 9 values of k were run, 9 results are stored under this name.
avgWithinSS[name] = [sum(d)/X.shape[0] for d in dist]

# Initialize variables
k = 2
ratio = 1
ratio2 = 1
# Perform "Elbow" test to determine the best cluster
def main():
    """
    Create an aligned functional group based on command line arguments.

    The SMILES string given on the command line is attached to an explicit
    benzene ring, optimised in 3D with UFF and realigned. The atoms, bonds
    and a distance-binned fingerprint of the group (everything beyond the
    ring) are then written to ``<short-name>.flib``, together with a
    ``.mol`` file, an ``.svg`` drawing and an ``.html`` table row. With
    ``--terminal`` the library text is also printed.

    Exits through ``parser.error`` (non-zero status) when the SMILES string
    uses ring closure 99, which is reserved for the benzene base.

    """

    parser = argparse.ArgumentParser(
        description='Create a functional group from a smiles pattern',
        epilog=
        'Example usage: %(prog)s -s OPr -n PropylEther -c "Alkyl Ether" -m OCCC'
    )

    parser.add_argument('smi_string', help="Smiles string to generate group")
    parser.add_argument('-s', '--short-name',
                        help='Short name (defaults to smiles string)')
    parser.add_argument('-n', '--name', required=True,
                        help='Descriptive name (e.g. PropylEther)')
    parser.add_argument('-m', '--mepo-compatible', action='store_true',
                        help='Record group as compatible with MEPO-QEq')
    parser.add_argument('-c', '--classification',
                        help='General classification (e.g. "Alkyl Halide")')
    parser.add_argument('-t', '--terminal', action='store_true',
                        help='Output to terminal as well as files')

    args = parser.parse_args()

    fgroup = args.smi_string

    if '%99' in fgroup:
        # Report on stderr and leave with a non-zero status instead of a
        # silent "successful" exit.
        parser.error('Do not use ring closure 99')

    if not args.short_name:
        args.short_name = fgroup

    # Use an explicitly defined benzene as a base
    # Do rings closure at 99 in case functional group has other closures
    attached = '[cH]%99[cH][cH][cH][cH]c%99'

    # make3D by default gives an optimised structure, great!
    pybel_mol = pybel.readstring('smi', attached + fgroup)
    pybel_mol.title = "[{}] {}".format(args.short_name, args.name)
    pybel_mol.make3D(forcefield='UFF')

    uff = ob.OBForceField_FindForceField('uff')
    uff.Setup(pybel_mol.OBMol)
    uff.GetAtomTypes(pybel_mol.OBMol)

    coordinates = [ob_atom.coords for ob_atom in pybel_mol]
    rotated_coordinates = realign(coordinates, 11, 10, 8)

    bonds = {}

    # look at all the bonds separately from the atoms
    for bond in ob.OBMolBondIter(pybel_mol.OBMol):
        # These rules are translated from ob/forcefielduff.cpp...
        start_idx = bond.GetBeginAtomIdx()
        end_idx = bond.GetEndAtomIdx()

        start_atom = bond.GetBeginAtom()
        end_atom = bond.GetEndAtom()

        bond_order = bond.GetBondOrder()
        if bond.IsAromatic():
            bond_order = 1.5
        # e.g., in Cp rings, may not be "aromatic" by OB
        # but check for explicit hydrogen counts
        #(e.g., biphenyl inter-ring is not aromatic)
        #FIXME(tdaff): aromatic C from GetType is "Car" is this correct?
        if (start_atom.GetType()[-1] == 'R' and
                end_atom.GetType()[-1] == 'R' and
                start_atom.ExplicitHydrogenCount() == 1 and
                end_atom.ExplicitHydrogenCount() == 1):
            bond_order = 1.5
        if bond.IsAmide():
            bond_order = 1.41

        # Zero the indicies for the connecting atom so that
        # negative indexes are benzene atoms
        bond_length = bond.GetLength()
        bond_id = tuple(sorted((start_idx - 12, end_idx - 12)))
        bonds[bond_id] = (bond_length, bond_order)

    # We can start building our output now!

    output_text = [
        "[{}]\n".format(args.short_name),
        "name = {}\n".format(args.name),
        "smiles = {}\n".format(fgroup),
        "mepo_compatible = {}\n".format(args.mepo_compatible)
    ]

    if args.classification:
        output_text.append("class = {}\n".format(args.classification))

    # functional group fingerprint: three consecutive sections of nbins
    # entries each (atom count, electronegativity, vdw radius), binned by
    # distance from the base atom
    nbins = 10
    max_distance = 10.0
    bin_width = max_distance / nbins
    fingerprint = [0.0] * (nbins * 3)

    atom_block = []
    base_atom = pybel_mol.atoms[10].OBAtom
    for ob_atom, coord in zip(pybel_mol, rotated_coordinates):
        atom_idx = ob_atom.OBAtom.GetIndex()
        if atom_idx <= 10:
            # Atoms up to index 10 are not part of the output group.
            continue
        atomicnum = ob_atom.atomicnum
        element = ATOMIC_NUMBER[atomicnum]
        ff_type = ob_atom.OBAtom.GetData("FFAtomType").GetValue()
        atom_block.append(
            "    {0:4} {1:5} {2[0]:10.6f} {2[1]:10.6f} {2[2]:10.6f}\n".
            format(element, ff_type, coord))

        # Generate fingerprint data
        distance = ob_atom.OBAtom.GetDistance(base_atom)
        # ">=" rather than ">": a distance of exactly max_distance maps to
        # bin index nbins, which would spill into the next section and then
        # run past the end of the list.
        if distance >= max_distance:
            continue
        dist_bin = int(distance / bin_width)
        # Put in distance bin
        fingerprint[dist_bin] += 1
        # Put in electronegativity bin
        electronegativity = ob.etab.GetElectroNeg(atomicnum)
        fingerprint[nbins + dist_bin] += electronegativity
        # Put in vdw radii
        vdw_radius = ob.etab.GetVdwRad(atomicnum)
        fingerprint[2 * nbins + dist_bin] += vdw_radius

    fingerprint = ",".join("{:.2f}".format(i) for i in fingerprint)

    #
    # 3D fingerprint
    #

    xmin, xmax = -5.658385, 6.758497
    ymin, ymax = -2.506779, 7.580274
    zmin, zmax = -2.469688, 4.024162
    spacing = 1.0

    # make gridpoints have the cartesian coordinates of all the
    # points of interest on the grid
    x_range = np.arange(xmin - 2.0 * spacing, xmax + 3.0 * spacing, spacing)
    y_range = np.arange(ymin - 2.0 * spacing, ymax + 3.0 * spacing, spacing)
    z_range = np.arange(zmin - 2.0 * spacing, zmax + 3.0 * spacing, spacing)

    gridpoints = [(x, y, z) for x in x_range for y in y_range
                  for z in z_range]
    grid_shape = (len(x_range), len(y_range), len(z_range))

    # Calculate all the atom-point distances
    distance_matrix = cdist(rotated_coordinates, gridpoints)

    # Find charges for all the atoms manually.
    # Automatically would do gasteiger, but fails for
    # some elements and we use qeq anyway
    qeq = ob.OBChargeModel_FindType('qeq')
    qeq.ComputeCharges(pybel_mol.OBMol)

    # coulomb = q1q2/4pie0r no units yet...
    coulomb_matrix = np.zeros(len(gridpoints))
    for ob_atom, distances in zip(pybel_mol, distance_matrix):
        coulomb_matrix += ob_atom.partialcharge / distances

    # LJ potential based off UFF also no units yet...
    vdw_matrix = np.zeros(len(gridpoints))
    for ob_atom, distances in zip(pybel_mol, distance_matrix):
        # Lorentz-Berthelot mixing rules
        probe = (3.4309, 0.1050)  # Carbon
        source = UFF[ATOMIC_NUMBER[ob_atom.atomicnum]]
        sigma = (source[0] + probe[0]) / 2.0
        epsilon = (source[1] * probe[1])**0.5
        vdw_matrix += 4 * epsilon * ((sigma / distances)**12 -
                                     (sigma / distances)**6)

    # Make into 3D gridded data
    coulomb_matrix = np.reshape(coulomb_matrix, grid_shape)
    vdw_matrix = np.reshape(vdw_matrix, grid_shape)

    # Can clip the maximums here or elsewhere
    coulomb_matrix = np.clip(coulomb_matrix, -0.1, 0.1)
    vdw_matrix = np.clip(vdw_matrix, -10, 0)

    # 3D plotting for visualisation
    #from mayavi import mlab
    #s = mlab.contour3d(coulomb_matrix)
    #s = mlab.contour3d(vdw_matrix)
    #mlab.show()

    #
    # Output
    #

    output_text.append('atoms =\n')
    output_text.extend(atom_block)
    output_text.append('orientation = 0.0 1.0 0.0\n')
    output_text.append('normal = 0.0 0.0 1.0\n')
    output_text.append('carbon_bond = {:.3f}\n'.format(bonds[(-1, 0)][0]))
    output_text.append('fingerprint = {}\n'.format(fingerprint))

    bonds_block = []
    # no bonds < idx 11
    for bond in sorted(bonds):
        if not bond[0] < 0 and not bond[1] < 0:
            bonds_block.append("    {0[0]:4} {0[1]:4} {1[1]:5.2f}\n".format(
                bond, bonds[bond]))

    output_text.append('bonds =\n')
    output_text.extend(bonds_block)

    # Make some pictures; do this now so the ascii can go in the file
    # But first get rid of the benzene
    for _idx in range(10):
        pybel_mol.OBMol.DeleteAtom(pybel_mol.atoms[0].OBAtom)
    pybel_mol.atoms[0].OBAtom.SetType('R')

    if 'ascii' not in pybel.outformats:
        print("Ascii art not available, please upgrade openbabel")
    else:
        ascii_mol = pybel_mol.write(format='ascii', opt={'a': 2, 'w': 40})
        ascii_mol = [
            '# {}\n'.format(x) for x in ascii_mol.splitlines() if x.strip()
        ]
        # Insert the drawing as comment lines after the first two entries.
        output_text[2:2] = ['#\n'] + ascii_mol + ['#\n']

    basename = args.short_name

    pybel_mol.write(format='mol', filename='{}.mol'.format(basename))

    # Always output to a library
    with open('{}.flib'.format(basename), 'w') as out_lib:
        out_lib.writelines(output_text)

    # Make the image with R groups and implicit hydrogen
    unopt_mol = pybel.readstring('smi', "[*:1]" + fgroup)
    unopt_mol.write(format='svg', filename='{}.svg'.format(basename),
                    opt={'C': None})

    # Make a table row in html
    with open('{}.html'.format(basename), 'w') as out_html:
        out_html.write("""\
<td>{args.short_name}</td>
<td><p>name: {args.name}</p>
<p>smiles: {args.smi_string}</p>
<p>MEPO-QEq compatible: {args.mepo_compatible}</td>
<td><a href="img/{args.short_name}.svg">
<img src="img/{args.short_name}.svg"
alt="Group: {args.short_name}"
title="[{args.short_name}] {args.name} {args.smi_string})"
style="height: 75px"/></a>
</td>
""".format(args=args))

    if args.terminal:
        print("".join(output_text))
def _evaluate(self, X, std, gradient, hessian):
    """Evaluate every GP in ``self.gps`` at X, optionally with derivatives.

    Parameters
    ----------
    X : ndarray of shape (N, n_var)
        Points at which the models are evaluated.
    std : bool
        Also compute the predictive standard deviation.
    gradient : bool
        Also compute first derivatives with respect to X.
    hessian : bool
        Also compute second derivatives with respect to X. Does not
        require ``gradient`` to be set.

    Returns
    -------
    dict
        Keys 'F', 'dF', 'hF' (mean and its derivatives) and 'S', 'dS', 'hS'
        (standard deviation and its derivatives). The per-GP results are
        stacked along axis 1; entries that were not requested are None.

    Notes
    -----
    The kernel hyperparameters are read from ``gp.kernel_.theta`` as
    ``theta[0]`` (log signal variance) and ``theta[1:-1]`` (log length
    scales); this presumably matches the kernel built elsewhere -- confirm.
    ``self.nu`` selects the derivative formulas (1, 3, 5, anything else is
    treated as RBF).
    """
    F, dF, hF = [], [], []  # mean
    S, dS, hS = [], [], []  # std

    # First-order variance terms are needed by both dS and hS.
    need_dvar = bool(std) and bool(gradient or hessian)

    for gp in self.gps:

        # mean
        K = gp.kernel_(X, gp.X_train_)  # K: shape (N, N_train)
        y_mean = K.dot(gp.alpha_)

        F.append(y_mean)  # y_mean: shape (N,)

        if std:
            if gp._K_inv is None:
                # Build and cache the inverse kernel matrix from the
                # triangular factor stored on the GP.
                L_inv = solve_triangular(gp.L_.T, np.eye(gp.L_.shape[0]))
                gp._K_inv = L_inv.dot(L_inv.T)

            y_var = gp.kernel_.diag(X)
            y_var -= np.einsum("ij,ij->i", np.dot(K, gp._K_inv), K)

            # Clamp small negative variances caused by round-off.
            y_var_negative = y_var < 0
            if np.any(y_var_negative):
                y_var[y_var_negative] = 0.0

            y_std = np.sqrt(y_var)

            S.append(y_std)  # y_std: shape (N,)

        if not (gradient or hessian):
            continue

        ell = np.exp(gp.kernel_.theta[1:-1])  # ell: shape (n_var,)
        sf2 = np.exp(gp.kernel_.theta[0])  # sf2: shape (1,)
        d = np.expand_dims(cdist(X / ell, gp.X_train_ / ell),
                           2)  # d: shape (N, N_train, 1)
        X_, X_train_ = np.expand_dims(X, 1), np.expand_dims(gp.X_train_, 0)
        dd_N = X_ - X_train_  # numerator
        dd_D = d * ell**2  # denominator
        dd = safe_divide(dd_N, dd_D)  # dd: shape (N, N_train, n_var)

        if self.nu == 1:
            dK = -sf2 * np.exp(-d) * dd
        elif self.nu == 3:
            dK = -3 * sf2 * np.exp(-np.sqrt(3) * d) * d * dd
        elif self.nu == 5:
            dK = -5. / 3 * sf2 * np.exp(
                -np.sqrt(5) * d) * (1 + np.sqrt(5) * d) * d * dd
        else:  # RBF
            dK = -sf2 * np.exp(-0.5 * d**2) * d * dd

        dK_T = dK.transpose(
            0, 2, 1
        )  # dK: shape (N, N_train, n_var), dK_T: shape (N, n_var, N_train)

        if need_dvar:
            # TODO: check
            K_row = np.expand_dims(K, 1)  # K_row: shape (N, 1, N_train)
            K_Ki = K_row @ gp._K_inv  # K_Ki: shape (N, 1, N_train)
            dK_Ki = dK_T @ gp._K_inv  # dK_Ki: shape (N, n_var, N_train)

            dy_var = -np.sum(dK_Ki * K_row + K_Ki * dK_T,
                             axis=2)  # dy_var: shape (N, n_var)
            # y_std gets a trailing axis so each sample divides its own row
            # (a bare (N,) vector would be matched against n_var instead).
            dy_std = 0.5 * safe_divide(
                dy_var, y_std[:, np.newaxis])  # dy_std: shape (N, n_var)

        if gradient:
            dy_mean = dK_T @ gp.alpha_  # gp.alpha_: shape (N_train,)
            dF.append(dy_mean)  # dy_mean: shape (N, n_var)

            if std:
                dS.append(dy_std)

        if hessian:
            d = np.expand_dims(d, 3)  # d: shape (N, N_train, 1, 1)
            dd = np.expand_dims(dd, 2)  # dd: shape (N, N_train, 1, n_var)
            hd_N = d * np.expand_dims(np.eye(len(ell)), (
                0, 1)) - np.expand_dims(X_ - X_train_, 3) * dd  # numerator
            hd_D = d**2 * np.expand_dims(ell**2, (0, 1, 3))  # denominator
            hd = safe_divide(hd_N,
                             hd_D)  # hd: shape (N, N_train, n_var, n_var)

            if self.nu == 1:
                hK = -sf2 * np.exp(-d) * (hd - dd**2)
            elif self.nu == 3:
                hK = -3 * sf2 * np.exp(
                    -np.sqrt(3) * d) * (d * hd +
                                        (1 - np.sqrt(3) * d) * dd**2)
            elif self.nu == 5:
                hK = -5. / 3 * sf2 * np.exp(
                    -np.sqrt(5) * d) * (-5 * d**2 * dd**2 +
                                        (1 + np.sqrt(5) * d) *
                                        (dd**2 + d * hd))
            else:  # RBF
                hK = -sf2 * np.exp(-0.5 * d**2) * (
                    (1 - d**2) * dd**2 + d * hd)

            hK_T = hK.transpose(
                0, 2, 3, 1
            )  # hK: shape (N, N_train, n_var, n_var), hK_T: shape (N, n_var, n_var, N_train)

            hy_mean = hK_T @ gp.alpha_  # hy_mean: shape (N, n_var, n_var)
            hF.append(hy_mean)

            # TODO: check
            # NOTE(review): the dK_Ki * dK and dy_var * dy_std terms below
            # are element-wise along one variable axis, not outer products
            # over (i, j) -- kept as in the original; confirm the formula.
            if std:
                K_4d = np.expand_dims(K_row, 2)  # shape (N, 1, 1, N_train)
                K_Ki_4d = np.expand_dims(K_Ki, 2)  # shape (N, 1, 1, N_train)
                dK_4d = np.expand_dims(
                    dK_T, 2)  # shape (N, n_var, 1, N_train)
                dK_Ki_4d = np.expand_dims(
                    dK_Ki, 2)  # shape (N, n_var, 1, N_train)
                hK_Ki = hK_T @ gp._K_inv  # hK_Ki: shape (N, n_var, n_var, N_train)

                hy_var = -np.sum(
                    hK_Ki * K_4d + 2 * dK_Ki_4d * dK_4d + K_Ki_4d * hK_T,
                    axis=3)  # hy_var: shape (N, n_var, n_var)
                # Per-sample quantities get explicit trailing axes so they
                # line up with the leading N axis of hy_var.
                hy_std = 0.5 * safe_divide(
                    hy_var * y_std[:, np.newaxis, np.newaxis] -
                    (dy_var * dy_std)[:, np.newaxis, :],
                    y_var[:, np.newaxis, np.newaxis]
                )  # hy_std: shape (N, n_var, n_var)
                hS.append(hy_std)

    F = np.stack(F, axis=1)
    dF = np.stack(dF, axis=1) if gradient else None
    hF = np.stack(hF, axis=1) if hessian else None

    S = np.stack(S, axis=1) if std else None
    dS = np.stack(dS, axis=1) if std and gradient else None
    hS = np.stack(hS, axis=1) if std and hessian else None

    out = {'F': F, 'dF': dF, 'hF': hF, 'S': S, 'dS': dS, 'hS': hS}
    return out