def test_mahalanobis_partial_mapping(mapping_type):
    mapping = mapping_type([0, 1])
    measure = measures.Mahalanobis(mapping=mapping)
    reduced_ui = CovarianceMatrix(np.diag([100, 10]))
    assert measure(state_u, state_v) == \
        distance.mahalanobis([[10], [1]], [[11], [10]], np.linalg.inv(reduced_ui))

    mapping = np.array([0, 3])
    reduced_ui = CovarianceMatrix(np.diag([100, 10]))
    measure = measures.Mahalanobis(mapping=mapping)
    assert measure(state_u, state_v) == \
        distance.mahalanobis([[10], [1]], [[11], [2]], np.linalg.inv(reduced_ui))

    mapping = mapping_type([0, 1])
    measure = measures.Mahalanobis(mapping=mapping, mapping2=mapping)
    assert measure(state_u, state_v) == \
        distance.mahalanobis([[10], [1]], [[11], [10]], np.linalg.inv(reduced_ui))

    mapping = np.array([0, 3])
    measure = measures.Mahalanobis(mapping=mapping, mapping2=mapping)
    assert measure(state_u, state_v) == \
        distance.mahalanobis([[10], [1]], [[11], [2]], np.linalg.inv(reduced_ui))

    mapping = mapping_type([0, 1])
    mapping2 = np.array([0, 3])
    measure = measures.Mahalanobis(mapping=mapping, mapping2=mapping2)
    assert measure(state_u, state_v) == \
        distance.mahalanobis([[10], [1]], [[11], [2]], np.linalg.inv(reduced_ui))
def calculate_unit_distances(session):
    preprocessing_file = h5py.File(experiment.dir + 'preprocessing_results.hdf5', 'r+')
    features, units, cluster_id = get_spike_features_and_unit_ids(session)
    unit_cms = np.zeros((len(units), len(features[0])))
    euc = np.zeros((len(units), len(units)))
    maho = np.zeros((len(units), len(units)))
    inv_cov = np.linalg.inv(np.cov(features, rowvar=False))
    for i, unit in enumerate(units):
        unit_spikes = np.where(cluster_id == unit)[0]
        unit_spike_coords = features[unit_spikes]
        unit_cms[i] = np.mean(unit_spike_coords, 0)
    for i in range(len(units)):
        for j in range(len(units)):
            if j == i:
                euc[i, j] = np.linalg.norm(unit_cms[i] - np.zeros(len(features[0])))
                maho[i, j] = mahalanobis(unit_cms[i], np.zeros(len(features[0])), inv_cov)
            else:
                euc[i, j] = np.linalg.norm(unit_cms[i] - unit_cms[j])
                maho[i, j] = mahalanobis(unit_cms[i], unit_cms[j], inv_cov)
    group_grp = preprocessing_file[session.subExperiment.name + '/' + session.name + '/group_0/']
    group_grp.create_dataset("euclidian", data=euc)
    group_grp.create_dataset("mahalanobis", data=maho)
    return euc, maho
def ratio_of_mv_normals(self, ukf_1, ukf_2, obs):
    """Take the ratio of two multivariate normal densities that are too small for Python.

    We often see densities below 1e-16, which Python can struggle to handle.
    It is easier to do this by hand, especially as many of the terms in the
    ratio cancel out, reducing the chance of numeric errors.

    Parameters
    ----------
    ukf_1, ukf_2 : cls
        original `ukf1` ukf model and candidate `ukf2` ukf model

    Returns
    -------
    ratio : float
        `ratio` of the two mv normals. prob > 0
    """
    x1 = ukf_1.x
    p1 = ukf_1.p
    x2 = ukf_2.x
    p2 = ukf_2.p
    obs1 = np.matmul(ukf_1.k, obs)
    obs2 = np.matmul(ukf_2.k, obs)

    ratio = 1.
    ratio *= (np.linalg.det(p1) / np.linalg.det(p2)) ** (-1 / 2)
    distance = -0.5 * (mahalanobis(obs2, x2, p2) ** 2 - mahalanobis(obs1, x1, p1) ** 2)
    if np.exp(distance) == np.inf:
        ratio *= 0.
    else:
        ratio *= np.exp(distance)
    return ratio
def compute_dist_matrix_with_ubm(dataset_x, all_gmms, all_ubms, label_dict):
    dist_matrix = np.zeros(shape=(len(dataset_x), len(label_dict)))
    for subject_i in range(len(dataset_x)):
        cur_subject = dataset_x[subject_i]
        class_distances = []
        for class_i in range(len(label_dict)):
            class_gmm = all_gmms[class_i]
            min_dist = np.inf
            for gmm_comp in range(class_gmm.n_components):
                cur_dist = distance.mahalanobis(
                    cur_subject, class_gmm.means_[gmm_comp],
                    inv(class_gmm.covariances_)[gmm_comp])
                if cur_dist < min_dist:
                    min_dist = cur_dist
            class_ubm = all_ubms[class_i]
            ubm_min_dist = np.inf
            for gmm_comp in range(class_ubm.n_components):
                cur_dist = distance.mahalanobis(
                    cur_subject, class_ubm.means_[gmm_comp],
                    inv(class_ubm.covariances_)[gmm_comp])
                if cur_dist < ubm_min_dist:
                    ubm_min_dist = cur_dist
            class_distances.append(min_dist + (1 / ubm_min_dist))
        dist_matrix[subject_i, :] = class_distances
    return dist_matrix
def mahalanobis_distance_sq(self, vec_x: Union[vector, matrix]):
    r"""
    returns the square of the mahalanobis distance using mean and covariance of the distribution.
    d = (x - \mu)^T \Sigma^{-1} (x - \mu)
    -0.5*mahalanobis_distance_sq + log_normalization_term = logpdf

    :param vec_x: observation(s)
    :return: (array of) square of the mahalanobis distance.
    """
    if vec_x.ndim == 1:
        if self._is_cov_diag:
            dist = _mvn_isotropic_mahalanobis_dist_sq(vec_x, self._mu, self._cov)
        else:
            cov_inv = np.linalg.inv(self._cov)
            dist = mahalanobis(vec_x, self._mu, cov_inv) ** 2
    elif vec_x.ndim == 2:
        if self._is_cov_diag:
            dist = np.array([_mvn_isotropic_mahalanobis_dist_sq(x, self._mu, self._cov)
                             for x in vec_x])
        else:
            cov_inv = np.linalg.inv(self._cov)
            dist = np.array([mahalanobis(x, self._mu, cov_inv) ** 2 for x in vec_x])
    else:
        raise NotImplementedError("unexpected input.")

    return dist
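# A minimal standalone consistency check (made-up example data, not part of
# the class above): -0.5 * d^2 plus the Gaussian log-normalisation term
# should reproduce scipy's exact logpdf, as the docstring claims.
import numpy as np
from scipy.spatial.distance import mahalanobis
from scipy.stats import multivariate_normal

mu = np.array([0.0, 1.0])
cov = np.array([[2.0, 0.3], [0.3, 1.0]])
x = np.array([1.0, -1.0])

d_sq = mahalanobis(x, mu, np.linalg.inv(cov)) ** 2
log_norm = -0.5 * (len(mu) * np.log(2 * np.pi) + np.log(np.linalg.det(cov)))
assert np.isclose(-0.5 * d_sq + log_norm, multivariate_normal(mu, cov).logpdf(x))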
def detect_outliers_with_tsne(inliers, outliers, reuses=True):
    dataset = np.concatenate([inliers, outliers], axis=0)
    reduced_dataset = None
    if reuses is False:
        reduced_dataset = TSNE(n_components=2, random_state=0).fit_transform(dataset)
        np.save(REDUCED_DATASET_PATH, reduced_dataset)
    else:
        reduced_dataset = np.load(REDUCED_DATASET_PATH)
    reduced_inliers = reduced_dataset[:inliers.shape[0]]
    reduced_outliers = reduced_dataset[inliers.shape[0]:]
    # calculate a covariance matrix using reduced_inliers
    inv_sigma = np.linalg.inv(np.cov(reduced_inliers, rowvar=False))
    print('inv_sigma', inv_sigma.shape)
    # calculate a mean using reduced_inliers
    mean = np.mean(reduced_inliers, axis=0)
    with open(F_VALUE_PATH, 'w') as fout:
        for i in range(20):
            threshold = THRESHOLD + 0.1 * i
            r = sum(1 for outlier in reduced_outliers
                    if mahalanobis(mean, outlier, inv_sigma) > threshold)
            b = sum(1 for inlier in reduced_inliers
                    if mahalanobis(mean, inlier, inv_sigma) > threshold)
            # the number of predicted outliers
            n = r + b
            precision = r / n
            recall = r / N_OUTLIERS
            f = 2 * precision * recall / (precision + recall)
            fout.write('{} {} {} {}\n'.format(threshold, f, precision, recall))
def mahalanobis_distance(text1, text2):
    vec1, vec2 = __get_vectors(text1, text2)
    intersection = set(vec1.keys()) & set(vec2.keys())
    intersection_vec1 = np.array([vec1[x] for x in intersection])
    intersection_vec2 = np.array([vec2[x] for x in intersection])
    unique_vec1 = np.array([vec1[x] for x in vec1.keys() - intersection])
    unique_vec2 = np.array([vec2[x] for x in vec2.keys() - intersection])
    intersection_covariance_matrix = __get_covariance_matrix(
        intersection_vec1, intersection_vec2)
    intersection_inverse_covariance = __get_pseudo_inverse(
        intersection_covariance_matrix)
    distance = mahalanobis(intersection_vec1, intersection_vec2,
                           intersection_inverse_covariance)
    auto_covariance1 = __get_covariance_matrix(unique_vec1, unique_vec1)
    auto_covariance2 = __get_covariance_matrix(unique_vec2, unique_vec2)
    distance += mahalanobis(unique_vec1, unique_vec1,
                            __get_pseudo_inverse(auto_covariance1))
    distance += mahalanobis(unique_vec2, unique_vec2,
                            __get_pseudo_inverse(auto_covariance2))
    return distance
def multivariate_normal_m_dist(mean, cov_mat, n, m_dist_max):
    # NOTE: initialise success marker
    success = False
    # NOTE: draw initial random sample
    xy = np.random.multivariate_normal(mean, cov_mat, n)
    while not success:
        # NOTE: transform sample statistics to match population parameters
        cov_mat_samp = np.cov(xy.T)
        cholesky_sample = np.linalg.cholesky(cov_mat_samp)
        cholesky_sample_inverse = np.linalg.inv(cholesky_sample)
        cholesky_population = np.linalg.cholesky(cov_mat)
        for i in range(n):
            xy[i] = np.matmul(
                np.matmul(cholesky_sample_inverse, cholesky_population),
                (xy[i, :] - np.mean(xy, axis=0))) + mean
        # NOTE: remove outliers by redrawing any point beyond m_dist_max
        for i in range(n):
            m_dist = mahalanobis(xy[i, :], mean, np.linalg.inv(cov_mat))
            while m_dist > m_dist_max:
                xy[i, :] = np.random.multivariate_normal(mean, cov_mat, 1)
                m_dist = mahalanobis(xy[i, :], mean, np.linalg.inv(cov_mat))
        # NOTE: accept the sample once its moments are close to the population's
        tol = 10.0
        sample_mean = np.mean(xy, axis=0)
        sample_cov_mat = np.cov(xy.T)
        if np.all(sample_mean - mean < tol) and np.all(
                sample_cov_mat - cov_mat < tol):
            success = True
    return xy
def test_mahalanobis_full_mapping(mapping_type):
    mapping = mapping_type(np.arange(len(u)))
    measure = measures.Mahalanobis(mapping=mapping)
    assert measure(state_u, state_v) == distance.mahalanobis(u, v, np.linalg.inv(ui))

    measure = measures.Mahalanobis(mapping=mapping, mapping2=mapping)
    assert measure(state_u, state_v) == distance.mahalanobis(u, v, np.linalg.inv(ui))
def MStep(X, r, z, dof, updateDof=False):
    K, N = r.shape[1], X.shape[0]
    D = X.shape[1]
    mu = []
    sigma = []
    pi = []
    dof_new = []
    for i in range(K):
        rk = r[:, i]
        zk = z[:, i]
        w = (rk * zk).reshape(-1, 1)  # N * 1
        mu_k = np.sum(w * X, axis=0) / np.sum(w)
        sigma_k = UpdateSigma(X, mu_k, r, z, i)
        pi_k = np.sum(rk) / N
        mu.append(mu_k.ravel())
        sigma.append(sigma_k)
        pi.append(pi_k)
    if updateDof:
        L = np.zeros((N, K))
        for i in range(K):
            sigma_k = sigma[i]
            logdet = 0.5 * math.log(sl.det(sigma_k))
            logmix = math.log(pi[i])
            distances = []
            inv_sigma_k = sl.inv(sigma_k)
            for j in range(len(X)):
                xj = X[j]
                distances.append(
                    mahalanobis(xj.ravel(), mu[i].ravel(), inv_sigma_k) ** 2)
            distances = np.array(distances).ravel()
            L[:, i] = GetLk(dof[i], logmix, logdet, distances, X.shape[1])
        for i in range(K):
            sigma_k = sigma[i]
            logdet = 0.5 * math.log(sl.det(sigma_k))
            logmix = math.log(pi[i])
            distances = []
            inv_sigma_k = sl.inv(sigma_k)
            for j in range(len(X)):
                xj = X[j]
                distances.append(
                    mahalanobis(xj.ravel(), mu[i].ravel(), inv_sigma_k) ** 2)
            distances = np.array(distances).ravel()
            bnds = ((0.1, 200),)
            x_init = (dof[i],)
            res = so.minimize(dofFunc, x0=x_init,
                              args=(logmix, logdet, distances, X.shape[1], L, i),
                              bounds=bnds)
            dof_new.append(res.x)
    else:
        dof_new = dof
    return np.array(mu), np.array(sigma), np.array(pi), np.array(dof_new)
def get_caliper(trt_compare, trtinfo, binid, median, percentile):
    looking_for = 30
    trt_more = [trt_compare.values]
    if trt_compare.shape[0] < looking_for:
        for neighbor in get_neighbors(trtinfo['bindf'], binid, his2ft.binners,
                                      trtinfo['levels'], looking_for):
            node = trtinfo['drugbins'].get_node("/" + neighbor)
            nodelab = node[:, 1] == trtinfo['trt']
            trt_more.append(trtinfo['scaler'].transform(node[:, 6:][nodelab, :]))
    trt_more = pd.DataFrame(np.vstack(trt_more))
    trt_more.index = list(trt_compare.index) + list(
        set(np.arange(2 * trt_more.shape[0])) -
        set(trt_compare.index))[:(trt_more.shape[0] - trt_compare.shape[0])]
    trtdist = pd.DataFrame()
    if trt_compare.shape[0] > 10000:
        tshape = trt_compare.shape[0]
        print("bigboy! ", tshape)
        dists = []
        # process in chunks of 1000 rows
        for k in range(int(trt_compare.shape[0] / 1000)):
            start = k * 1000
            x = trt_compare.iloc[start:min(tshape, start + 1000), :].apply(
                lambda x: trt_more.drop(x.name, axis=0).iloc[
                    np.random.choice(tshape - 1, 500, replace=False), :]
                .apply(lambda y: mahalanobis(x, y, trtinfo['prec']), axis=1),
                axis=1)
            dists.append(
                x.apply(lambda q: np.percentile(q[~pd.isnull(q)], percentile),
                        axis=1))
        return np.median(np.hstack(dists))
    elif trt_compare.shape[0] > 1000:
        tshape = trt_compare.shape[0]
        trtdist = trt_compare.apply(
            lambda x: trt_more.drop(x.name, axis=0).iloc[np.random.choice(
                tshape - 1, 100, replace=False), :].apply(
                    lambda y: mahalanobis(x, y, trtinfo['prec']), axis=1),
            axis=1)
    elif trt_more.shape[0] >= 30:
        trtdist = trt_compare.apply(
            lambda x: trt_more.drop(x.name, axis=0).apply(
                lambda y: mahalanobis(x, y, trtinfo['prec']), axis=1),
            axis=1)
    if median:
        return trtdist.apply(
            lambda q: np.percentile(q[~pd.isnull(q)], percentile),
            axis=1).median()
    else:
        return np.percentile(trtdist.stack(), percentile)
def reassignLabels(self, X, threshold):
    Z = self.post_gmm_encode(X, transform=False)
    preds = self.kmeans.pred
    for i in range(len(preds)):
        if (distance.mahalanobis(Z[i], self.kmeans.means[0], self.kmeans.covs[0]) > threshold) \
                and (distance.mahalanobis(Z[i], self.kmeans.means[1], self.kmeans.covs[1]) > threshold):
            preds[i] = 2
    self.kmeans.pred = preds
def z_score(self, x):
    """Computes the Mahalanobis distance of `x` from the center of this
    Gaussian. In the 1D case this reduces to computing an absolute z-score.

    NOTE: This function is vectorized if you pass multiple points as `x`.

    >>> Gaussian(2, 4).z_score(6)
    2.0
    >>> Gaussian(2, 4).z_score([0, 3, 6])
    array([1. , 0.5, 2. ])
    >>> Gaussian(pd.Series([2, 0, 0]), pd.DataFrame([ \
        [ 1.5, -0.5, -0.5], \
        [-0.5,  1.5, -0.5], \
        [-0.5, -0.5,  1.5] \
        ])).z_score(pd.Series([0, 1, 0]))
    1.7320508075688763
    >>> Gaussian(pd.Series([2, 0, 0]), pd.DataFrame([ \
        [ 1.5, -0.5, -0.5], \
        [-0.5,  1.5, -0.5], \
        [-0.5, -0.5,  1.5] \
        ])).z_score([[0, 1, 0], [1, 0, 0]])
    array([1.73205081, 1.        ])
    >>> Gaussian(pd.Series([0, 1], index=['a', 'b']), [1, 2]).z_score([1, 3])
    1.7320508075688772
    >>> Gaussian(pd.Series([0, 1], index=['a', 'b']), [1, 2]) \
        .z_score(pd.DataFrame([[3, 1], [5, 2]], columns=['b', 'a'], index=[0, 1]))
    0    1.732051
    1    3.464102
    dtype: float64
    """
    cov_inv = np.linalg.pinv(self.__covariance)
    if self.__should_vectorize(x):
        if isinstance(x, pd.DataFrame):
            x = x[self.__mean.index]
            return x.apply(lambda x: mahalanobis(self.__mean, x, cov_inv), axis=1)
        else:
            return np.array(
                [mahalanobis(self.__mean, x_i, cov_inv) for x_i in x])
    else:
        # Sort `x` labels to match `mean` indexing.
        if self.__has_similar_labels(x):
            x = x[self.__mean.index]
        return mahalanobis(self.__mean, x, cov_inv)
def y_vec(self, centers, w):
    if self.eta is None:
        if self.sparse:
            sparse_reg = np.zeros(self.n_clusters)
            # scale_param = np.tanh(self.iteration_num / 300) if self.iteration_num > 700 else 0
            scale_param = 1e-5
            reg = utils.get_sparse_reg(centers, self.process_label)
            sparse_reg[self.process_label] = scale_param * reg
            return np.array([
                mahalanobis(w, centers[label], self.Gammas_inv[label]) +
                sparse_reg[label] + self.noise.fabric(self.iteration_num)
                for label in range(self.n_clusters)
            ])
        else:
            return np.array([
                mahalanobis(w, centers[label], self.Gammas_inv[label]) +
                self.noise.fabric(self.iteration_num)
                for label in range(self.n_clusters)
            ])
    else:
        if self.sparse:
            sparse_reg = np.zeros(self.n_clusters)
            reg_1 = utils.get_sparse_reg(centers, self.process_label)
            scale_param_1 = 1e-5
            # scale_param_1 = 1e-5 * np.tanh(self.iteration_num / 300) if self.iteration_num > 700 else 0
            sparse_reg[self.process_label] += scale_param_1 * reg_1
            reg_2 = utils.get_sparse_reg_2(centers, self.process_label)
            scale_param_2 = 1e-3 * np.tanh(
                self.iteration_num / 300) if self.iteration_num > 700 else 0
            # scale_param_2 = 1e-3
            sparse_reg[self.process_label] += scale_param_2 * reg_2
            return np.array([
                mahalanobis(w, centers[label], np.linalg.inv(self.Gammas[label])) +
                sparse_reg[label] + self.noise.fabric(self.iteration_num)
                for label in range(self.n_clusters)
            ])
        else:
            return np.array([
                mahalanobis(w, centers[label], np.linalg.inv(self.Gammas[label])) +
                self.noise.fabric(self.iteration_num)
                for label in range(self.n_clusters)
            ])
def mahalanobisR(myRow, inData, covariance):
    """
    Find the Mahalanobis distance between the given row and the given data.

    `inData` can be an array of rows or a single row. Requires the covariance
    matrix, not the inverted covariance matrix (the inversion happens here).
    """
    IC = covariance.values if isinstance(covariance, pd.DataFrame) else covariance
    IC = sp.linalg.inv(IC)
    m = []
    if len(inData.shape) == 1:
        return mahalanobis(inData, myRow, IC) ** 2
    for i in range(inData.shape[0]):
        m.append(mahalanobis(inData.iloc[i, :], myRow, IC) ** 2)
    return m
def _call(self, ds):
    """Compute the Mahalanobis distance of each volume in the dataset from
    the trained distribution, and use the p-value as a threshold to count how
    many volumes are Mahalanobis-close to the training distribution.

    Parameters
    ----------
    ds : pymvpa dataset
        Testing dataset

    Returns
    -------
    dataset : Dataset
        A dataset with the number of volumes whose squared m-distance is
        below the threshold
    """
    distances = []
    mean_ = self.params['mean']
    icov_ = self.params['icov']
    for ex in ds:
        dist_ = mahalanobis(mean_, ex, icov_)
        distances.append(dist_)

    chi_sq = scipy.stats.distributions.chi2(mean_.shape[0])
    m_value = chi_sq.isf(self.p)

    distances = np.array(distances)
    value = np.count_nonzero((distances ** 2) < m_value)

    return Dataset(np.array([value]))
def scoring(err, mu, sigma):
    scores = []
    for e in err:
        scores.append(mahalanobis(e, mu, sigma))
    return scores
def _get_node_distance_matrix(self, datapoint, som_array):
    """Get distance of datapoint and node using Euclidean distance.

    Parameters
    ----------
    datapoint : np.array, shape=(X.shape[1])
        Datapoint = one row of the dataset `X`
    som_array : np.array
        Weight vectors of the SOM,
        shape = (self.n_rows, self.n_columns, X.shape[1])

    Returns
    -------
    distmat : np.array of float
        Distance between datapoint and each SOM node
    """
    # algorithms on the full matrix
    if self.distance_metric == "euclidean":
        return np.linalg.norm(som_array - datapoint, axis=2)

    # node-by-node algorithms
    distmat = np.zeros((self.n_rows, self.n_columns))
    if self.distance_metric == "manhattan":
        for node in self.node_list_:
            distmat[node] = dist.cityblock(
                som_array[node[0], node[1]], datapoint)

    elif self.distance_metric == "mahalanobis":
        for node in self.node_list_:
            som_node = som_array[node[0], node[1]]
            cov = np.cov(np.stack((datapoint, som_node), axis=0),
                         rowvar=False)
            cov_pinv = np.linalg.pinv(cov)  # pseudo-inverse
            distmat[node] = dist.mahalanobis(
                datapoint, som_node, cov_pinv)

    elif self.distance_metric == "tanimoto":
        # Note that this is a binary distance measure.
        # Therefore, the vectors have to be converted.
        # Source: Melssen 2006, Supervised Kohonen networks for
        # classification problems
        # VERY SLOW ALGORITHM!!!
        threshold = 0.5
        for node in self.node_list_:
            som_node = som_array[node[0], node[1]]
            distmat[node] = dist.rogerstanimoto(
                binarize(datapoint.reshape(1, -1), threshold=threshold,
                         copy=True),
                binarize(som_node.reshape(1, -1), threshold=threshold,
                         copy=True))

    elif self.distance_metric == "spectralangle":
        for node in self.node_list_:
            distmat[node] = np.arccos(np.divide(
                np.dot(som_array[node[0], node[1]], datapoint),
                np.multiply(np.linalg.norm(som_array),
                            np.linalg.norm(datapoint))))

    return distmat
def compute_distance_hmd(self, domain):
    meaningful_word_ratio = domain.get_linguistic_feature_set().get_meaningful_word_ratio()
    one_gram_normality_score = domain.get_linguistic_feature_set().get_one_gram_normality_score()
    two_gram_normality_score = domain.get_linguistic_feature_set().get_two_gram_normality_score()
    three_gram_normality_score = domain.get_linguistic_feature_set().get_three_gram_normality_score()
    four_gram_normality_score = domain.get_linguistic_feature_set().get_four_gram_normality_score()
    five_gram_normality_score = domain.get_linguistic_feature_set().get_five_gram_normality_score()

    current_sample = numpy.array([meaningful_word_ratio,
                                  one_gram_normality_score,
                                  two_gram_normality_score,
                                  three_gram_normality_score])

    timestr = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    f = open(parent_path + '/trails/' + timestr + ".csv", "a")
    f.writelines(str(meaningful_word_ratio))
    f.writelines('\t')
    f.writelines(str(one_gram_normality_score))
    f.writelines('\t')
    f.writelines(str(two_gram_normality_score))
    f.writelines('\t')
    f.writelines(str(three_gram_normality_score))
    f.writelines('\t')
    f.writelines(str(four_gram_normality_score))
    f.writelines('\t')
    f.writelines(str(five_gram_normality_score))
    f.writelines('\t')
    f.close()

    for i in range(len(current_sample)):
        # if current_sample[i] is bigger than centroid[i], then it must not be a DGA
        if self._centroid[i] < current_sample[i]:
            current_sample[i] = self._centroid[i]

    distance = mahalanobis(current_sample, self._centroid, self._cov_inv)
    return distance
def compute_distance_bmd(self, domain):
    meaningful_word_ratio = domain.get_linguistic_feature_set().get_meaningful_word_ratio()
    one_gram_normality_score = domain.get_linguistic_feature_set().get_one_gram_normality_score()
    two_gram_normality_score = domain.get_linguistic_feature_set().get_two_gram_normality_score()
    three_gram_normality_score = domain.get_linguistic_feature_set().get_three_gram_normality_score()
    four_gram_normality_score = domain.get_linguistic_feature_set().get_four_gram_normality_score()
    five_gram_normality_score = domain.get_linguistic_feature_set().get_five_gram_normality_score()

    current_sample = numpy.array([meaningful_word_ratio,
                                  one_gram_normality_score,
                                  two_gram_normality_score,
                                  three_gram_normality_score])

    f = open("./new2/result_bmd.csv", "a")
    f.writelines(str(meaningful_word_ratio))
    f.writelines('\t')
    f.writelines(str(one_gram_normality_score))
    f.writelines('\t')
    f.writelines(str(two_gram_normality_score))
    f.writelines('\t')
    f.writelines(str(three_gram_normality_score))
    f.writelines('\t')
    f.writelines(str(four_gram_normality_score))
    f.writelines('\t')
    f.writelines(str(five_gram_normality_score))
    f.writelines('\t')
    f.close()

    for i in range(len(current_sample)):
        # if current_sample[i] is bigger than centroid[i], then it must not be a DGA
        if self._centroid[i] < current_sample[i]:
            current_sample[i] = self._centroid[i]

    distance = mahalanobis(current_sample, self._centroid, self._cov_inv)
    return distance
def gaussian_weights(bundle, n_points=100, return_mahalnobis=False):
    """
    Calculate weights for each streamline/node in a bundle, based on a
    Mahalanobis distance from the mean of the bundle, at that node

    Parameters
    ----------
    bundle : array or list
        If this is a list, assume that it is a list of streamline coordinates
        (each entry is a 2D array, of shape n by 3). If this is an array,
        this is a resampled version of the streamlines, with equal number of
        points in each streamline.
    n_points : int, optional
        The number of points to resample to. *If the `bundle` is an array,
        this input is ignored*. Default: 100.

    Returns
    -------
    w : array of shape (n_streamlines, n_points)
        Weights for each node in each streamline, calculated as its relative
        inverse of the Mahalanobis distance, relative to the distribution of
        coordinates at that node position across streamlines.
    """
    if isinstance(bundle, list) or isinstance(bundle, dts.Streamlines):
        # if you got a list, assume that it needs to be resampled:
        bundle = _resample_bundle(bundle, n_points)
    else:
        if bundle.shape[-1] != 3:
            e_s = "Input must be shape (n_streamlines, n_points, 3)"
            raise ValueError(e_s)
        n_points = bundle.shape[1]

    w = np.zeros((bundle.shape[0], n_points))
    # If there's only one fiber here, it gets the entire weighting:
    if bundle.shape[0] == 1:
        return np.array([1])

    for node in range(bundle.shape[1]):
        # This should come back as a 3D covariance matrix with the spatial
        # variance covariance of this node across the different streamlines.
        # This is a 3-by-3 array:
        node_coords = bundle[:, node]
        c = np.cov(node_coords.T, ddof=0)
        # Reorganize as an upper-triangular matrix, as expected by
        # scipy's mahalanobis:
        c = np.array([[c[0, 0], c[0, 1], c[0, 2]],
                      [0, c[1, 1], c[1, 2]],
                      [0, 0, c[2, 2]]])
        # Calculate the mean of this node as well
        m = np.mean(node_coords, 0)
        # Weights are the inverse of the Mahalanobis distance
        for fn in range(bundle.shape[0]):
            # calculate Mahalanobis for node on fiber[fn]
            w[fn, node] = mahalanobis(node_coords[fn], m, np.linalg.inv(c))
    if return_mahalnobis:
        return w
    # weighting is inverse to the distance (the further you are, the less
    # you should be weighted)
    w = 1 / w
    # Normalize before returning, so that the weights in each node sum to 1:
    return w / np.sum(w, 0)
def Cal_FPR(self, config):
    N_ood = len(X_ood)  # 10000
    X_ood = np.pad(X_ood, ((0, 0), (2, 2), (2, 2), (0, 0)), 'constant')  # Adding the padding to the dataset
    f_of_x_ood = np.array(sess.run(fullc2, feed_dict={x: X_ood, keep_prob: 1.0}))
    label_of_x_ood = np.array(range(N_ood))

    for i in range(N_ood):
        temp = [None] * num_of_labels
        for label in range(num_of_labels):
            temp[label] = list()
            u = np.reshape(f_of_x_ood[i], (1, num_of_neurons))
            v = np.reshape(mu_hat[label], (1, num_of_neurons))
            temp[label].append(distance.mahalanobis(u, v, np.linalg.inv(sigma_hat)) ** 2)
        m_dist_data_of_x_ood = np.array(temp)
        index = np.argmin(m_dist_data_of_x_ood, 0)  # finding index of the closest label
        confidence_score_of_x_ood = m_max[index] - m_dist_data_of_x_ood[index]  # computing confidence score
        if confidence_score_of_x_ood > threshold:
            label_of_x_ood[i] = index  # classifying in-distribution data
        else:
            label_of_x_ood[i] = ood_index  # classifying out-of-distribution data

    num_of_in_distribution = 0
    for i in range(N_ood):
        if label_of_x_ood[i] != ood_index:
            num_of_in_distribution = num_of_in_distribution + 1
    fpr = num_of_in_distribution / N_ood
    print('FPR on out-of-distribution(EMNIST): {:.4f}'.format(fpr), end='\n')
def Cal_TPR(self, config):
    N_data = len(data)  # 10000
    f_x = np.array(sess.run(fullc2, feed_dict={x: data, keep_prob: 1.0}))
    label_of_x_test = np.array(range(N_test))

    for i in range(N_data):
        temp = [None] * num_of_labels
        for label in range(num_of_labels):
            temp[label] = list()
            u = np.reshape(f_x[i], (1, num_of_neurons))
            v = np.reshape(mu_hat[label], (1, num_of_neurons))
            temp[label].append(distance.mahalanobis(u, v, np.linalg.inv(sigma_hat)) ** 2)
        m_dist_data_of_x_test = np.array(temp)
        index = np.argmin(m_dist_data_of_x_test, 0)  # finding index of the closest label
        confidence_score_of_x_test = m_max[index] - m_dist_data_of_x_test[index]  # computing confidence score
        if confidence_score_of_x_test > threshold:
            label_of_x_test[i] = index // 2  # classifying in-distribution data
        else:
            label_of_x_test[i] = ood_index  # classifying out-of-distribution data

    num_of_in_distribution = 0
    num_of_correctly_classified = 0
    accuracy_on_in_distribution = 0.0
    for i in range(N_test):
        if label_of_x_test[i] != ood_index:
            num_of_in_distribution = num_of_in_distribution + 1
            if label_of_x_test[i] == target_label_of_x_test[i]:
                num_of_correctly_classified = num_of_correctly_classified + 1
    accuracy_on_in_distribution = num_of_correctly_classified / num_of_in_distribution
    tpr = num_of_in_distribution / N_test
    print('Classification accuracy on in-distribution: {:.4f}'.format(accuracy_on_in_distribution))
    print('TPR on in-distribution(MNIST): {:.4f}'.format(tpr), end='\n')
def genSimilarComposition(pulsePeriod, pieceDur, strokeModels=None,
                          iAudioFile=None, iPos=None, invC=None):
    if strokeModels is None:
        strokeSeq = None
        ts = None
        opulsePos = None
    else:
        testFeatFull = getFeatSequence(iAudioFile, iPos)
        testFeat = testFeatFull['pmfcc']
        print(testFeat.shape)
        Npulse = testFeat.shape[0]
        Ndata = len(strokeModels)
        strokeSeq = np.array([])
        ts = np.array([])
        tscurr = 0.0
        opulsePos = np.arange(0, pieceDur, pulsePeriod)
        for k in range(Npulse):
            ftIn = testFeat[k, params.selectInd]
            distVal = 1e6 * np.ones(Ndata)
            ts = np.append(ts, tscurr)
            tscurr = tscurr + pulsePeriod
            for p in range(Ndata):
                ftOut = strokeModels[p]['feat']['pmfcc'][0][params.selectInd]
                distVal[p] = DS.mahalanobis(ftIn, ftOut, invC)
            strokeSeq = np.append(strokeSeq, np.argmin(distVal))
    return strokeSeq, ts, opulsePos
def mahalanobis_distances(df, axis=0):
    '''
    Returns a pandas Series with Mahalanobis distances for each sample on
    the axis.

    Note: does not work well when # of observations < # of dimensions.
    Will either return NaN in the answer or (in the extreme case) fail with
    a Singular Matrix LinAlgError.

    Args:
        df: pandas DataFrame with columns to run diagnostics on
        axis: 0 to find outlier rows, 1 to find outlier columns
    '''
    df = df.transpose() if axis == 1 else df
    means = df.mean()
    try:
        inv_cov = np.linalg.inv(df.cov())
    except LinAlgError:
        return pd.Series([np.nan] * len(df.index), df.index, name='Mahalanobis')
    dists = []
    for i, sample in df.iterrows():
        dists.append(mahalanobis(sample, means, inv_cov))

    return pd.Series(dists, df.index, name='Mahalanobis')
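# Hypothetical usage sketch for the helper above (made-up data): the row
# with the largest Mahalanobis distance is the most outlying one.
import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0, 3.0, 40.0],
                   'b': [2.0, 1.9, 3.1, 0.5]})
dists = mahalanobis_distances(df)
print(dists.idxmax(), dists.max())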
def pdf(x):
    x = np.asarray(x).ravel()
    assert len(x) == d, "Incorrect dimensionality. The input data must " \
                        "be %d-dimensional." % d
    # the Gaussian density needs the *squared* Mahalanobis distance
    return scale_factor * m.exp(-0.5 * mahalanobis(x, mu, inv_sigma) ** 2)
def _test(self, means: NDArray, cvars: NDArray) -> None:
    embeddings, artifacts = self._embed("test")
    b, c, h, w = embeddings.shape
    embeddings = embeddings.reshape(b, c, h * w)

    distances = []
    for i in tqdm(range(h * w), desc=f"{self.cfg.params.category} - compute distance"):
        mean = means[:, i]
        cvar_inv = np.linalg.inv(cvars[:, :, i])
        distance = [mahalanobis(e[:, i], mean, cvar_inv) for e in embeddings]
        distances.append(distance)

    img_h = self.cfg.params.height
    img_w = self.cfg.params.width
    amaps = torch.tensor(np.array(distances), dtype=torch.float32)
    amaps = amaps.permute(1, 0).view(b, h, w).unsqueeze(dim=1)  # (b, 1, h, w)
    amaps = F.interpolate(amaps, size=(img_h, img_w), mode="bilinear", align_corners=False)
    amaps = mean_smoothing(amaps)
    amaps = (amaps - amaps.min()) / (amaps.max() - amaps.min())
    amaps = amaps.squeeze().numpy()

    roc_score = compute_roc_score(amaps, np.array(artifacts["mask"]), artifacts["stem"])
    pro_score = compute_pro_score(amaps, np.array(artifacts["mask"]))
    mlflow.log_metrics({"roc_score": roc_score, "pro_score": pro_score})
    draw_roc_and_pro_curve(roc_score, pro_score)
    savegif(
        np.array(artifacts["image"]),
        amaps,
        np.array(artifacts["mask"]),
        artifacts["stem"],
    )
def get_anomalous_values(data, window_size, prob=0.99):
    """
    Return a list of anomalous values, i.e. the ones whose Mahalanobis
    distance from the expected multivariate average exceeds the threshold.
    Both the multivariate average and the Mahalanobis distance are computed
    over moving windows, i.e. each value is compared against its window_size
    neighbours, moving the window for each value of the series.

    data : pandas.core.frame.DataFrame
    window_size : int
    prob : float

    return: list
    """
    # under the normality hypothesis, the Mahalanobis distance is chi-squared
    # distributed
    threshold = np.sqrt(-2 * np.log(1 - prob))
    # calculate the moving window for each point, and report an anomaly if
    # the distance of the idx-th point from the window average exceeds the
    # threshold
    return [(p['idx'], p['value'])
            for p in nd_rolling(data, window_size)
            if mahalanobis(p['value'], p['window_avg'],
                           np.linalg.inv(p['window_cov'])) > threshold]
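# Quick sanity check on the closed-form threshold above (this assumes
# two-dimensional windows, i.e. df=2): sqrt(-2*ln(1-p)) is exactly the
# square root of the chi-square quantile at probability p.
import numpy as np
from scipy.stats import chi2

prob = 0.99
assert np.isclose(np.sqrt(-2 * np.log(1 - prob)),
                  np.sqrt(chi2.ppf(prob, df=2)))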
def KullbackLeiberDivergence(CoefficientA, CoefficientB, CoefficientC, Mean, Sample):
    distance = (ds.mahalanobis(Mean, Sample, CoefficientA)) ** 2
    divergence = CoefficientC + distance - CoefficientB - len(Mean)
    return np.sqrt(divergence / 2)
def _mahalanobis_distances_scipy(m, SI, X):
    n = X.shape[0]
    mahal = np.zeros(n)
    for i in range(X.shape[0]):
        x = X[i, :]
        mahal[i] = distance.mahalanobis(x, m, SI)
    return mahal
def metrykaMahalanobisa(self, array1, array2, macierzKowariancji):
    r"""
    Computes the Mahalanobis distance between two n-vectors ``u`` and ``v``,
    which is defined as

    .. math::

        \sqrt{ (u-v) V^{-1} (u-v)^T }

    where ``V`` is the covariance matrix. Note that the argument ``VI``
    is the inverse of ``V``.

    Parameters
    ----------
    u : ndarray
        An :math:`n`-dimensional vector.
    v : ndarray
        An :math:`n`-dimensional vector.
    VI : ndarray
        The inverse of the covariance matrix.

    Returns
    -------
    d : double
        The Mahalanobis distance between vectors ``u`` and ``v``.
    """
    # note: `macierzKowariancji` must already be the *inverse* covariance
    # matrix, as scipy's mahalanobis expects
    return mahalanobis(array1, array2, macierzKowariancji)
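# Minimal usage sketch (made-up data): scipy's mahalanobis takes the
# *inverse* covariance as its third argument, so a caller of the method
# above must invert the covariance first.
import numpy as np
from scipy.spatial.distance import mahalanobis

X = np.random.default_rng(0).normal(size=(100, 3))
VI = np.linalg.inv(np.cov(X, rowvar=False))
print(mahalanobis(X[0], X[1], VI))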
def compute_compatibility(observations, predictions):
    """ Individual Compatibility Test """
    compatibility = {'d2': None, 'IC': None}
    compatibility['d2'] = np.zeros(shape=(observations['M'], predictions['N']))
    compatibility['IC'] = np.zeros(shape=(observations['M'], predictions['N']))

    # Compute Individual Squared Mahalanobis Distances
    for i in range(observations['M']):
        z = observations['z'][i]
        R = observations['R_covariance'][i]
        for j in range(predictions['N']):
            C = np.add(predictions['H_P_H'][i], R)
            C_inverse = np.linalg.inv(C)
            compatibility['d2'][i][j] = mahalanobis(z, predictions['h_map_fn'][j], C_inverse)

    # Check Mahalanobis Distance against critical values from a Chi2 Distribution.
    for i in range(observations['M']):
        for j in range(predictions['N']):
            if compatibility['d2'][i][j] < chi2.isf(q=0.01, df=2):
                compatibility['IC'][i][j] = 1
            else:
                compatibility['IC'][i][j] = 0

    return compatibility
def __call__(self, state1, state2):
    r"""Calculate the Mahalanobis distance between a pair of state objects

    Parameters
    ----------
    state1 : :class:`~.State`
    state2 : :class:`~.State`

    Returns
    -------
    float
        Mahalanobis distance between a pair of input :class:`~.State`
        objects
    """
    if self.mapping is not None:
        u = state1.state_vector[self.mapping]
        v = state2.state_vector[self.mapping]
        # extract the mapped covariance data
        rows = np.array(self.mapping, dtype=np.intp)
        columns = np.array(self.mapping, dtype=np.intp)
        cov = state1.covar[rows[:, np.newaxis], columns]
    else:
        u = state1.state_vector
        v = state2.state_vector
        cov = state1.covar

    vi = np.linalg.inv(cov)
    return distance.mahalanobis(u, v, vi)
def calc_distance(tracks, means, covs, remeasure):
    f_point = tracks.get_point_measurement()
    if remeasure:
        f_point.append(tracks.get_vdur())
    c = np.matrix(covs).I  # inverse covariance
    dist = mahalanobis(f_point, means, c)
    return dist
def distance(vector1, vector2, alpha=2, metric='euclidean'):
    '''
    Helper function that calculates the distance between two vectors under
    the chosen metric

    :param vector1: a vector
    :type vector1: list of doubles
    :param vector2: a vector
    :type vector2: list of doubles
    :param metric: euclidean, mahalanobis, seuclidean, cityblock
    :type metric: string
    :rtype: norm between vectors A and B
    '''
    mp.dps = 50
    alpha = mpf(1.0 * alpha)
    vector1 = matrix(numpy.array(vector1))
    vector2 = matrix(numpy.array(vector2))
    if metric == 'euclidean':
        vector_norm = distances.euclidean(vector1, vector2)
    elif metric == 'mahalanobis':
        vi = numpy.linalg.inv(
            numpy.cov(numpy.concatenate((vector1, vector2)).T))
        vector_norm = distances.mahalanobis(vector1, vector2, vi)
    elif metric == 'seuclidean':
        vector_norm = distances.seuclidean(vector1, vector2)
    elif metric == 'cityblock':
        vector_norm = distances.cityblock(vector1, vector2)
    elif metric == 'hamming':
        vector_norm = distances.hamming(vector1, vector2)
    else:
        print("Unknown metric")
        return None
    return vector_norm
def mahalanobis_distance(a, b):
    """
    Uses the scipy mahalanobis distance to calculate the distance between
    two arrays.
    """
    x = np.array(a)
    y = np.array(b)
    z = np.vstack((x, y))
    cov = np.cov(z.T)
    # scipy expects the *inverse* covariance; the covariance of two stacked
    # vectors is singular, so use the pseudo-inverse
    return distance.mahalanobis(x, y, np.linalg.pinv(cov))
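# Why the pseudo-inverse above: the covariance built from just two stacked
# n-vectors has rank 1, so np.linalg.inv is ill-defined while pinv still
# gives a usable (if degenerate) metric. Made-up example data:
import numpy as np
from scipy.spatial import distance

a, b = [1.0, 2.0, 3.0], [2.0, 2.0, 5.0]
cov = np.cov(np.vstack((a, b)).T)
print(np.linalg.matrix_rank(cov))  # 1
print(distance.mahalanobis(a, b, np.linalg.pinv(cov)))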
def greylvldistancemetric(x1, x2, cov, k, n, img):
    grads = getgradientsalong(x1, n, img, k)
    sumgrads = sum(np.abs(grads))
    if sumgrads != 0:
        grads = grads / float(sumgrads)
    return ssd.mahalanobis(x2, grads, cov)
def distancePV(sample, mask, params_tissue1, params_tissue2, distance):
    from scipy.spatial.distance import mahalanobis, euclidean
    import numpy as np

    # Direction vector between pure tissues
    d_vect = np.ravel(params_tissue2[0] - params_tissue1[0]).T

    mu1 = np.ravel(params_tissue1[0])
    mu2 = np.ravel(params_tissue2[0])
    SI1 = params_tissue1[1].getI()
    SI2 = params_tissue2[1].getI()

    if distance == 'mahalanobis':
        norm = np.array([1 / (1 + mahalanobis(pix, mu2, SI2) / mahalanobis(pix, mu1, SI1))
                         for pix in sample[mask == 1]])
    elif distance == 'dummy':
        norm = mask * 0.5
    else:
        norm = np.array([1 / (1 + euclidean(pix, mu2) / euclidean(pix, mu1))
                         for pix in sample[mask == 1]])

    result = np.zeros(np.shape(mask))
    result[mask == 1] = norm
    return result
def closest_mahalanobis(pt, W, cov, k):
    min_is = [-1] * k
    min_dists = [float('inf')] * k
    for i in range(W.shape[1]):
        dist = distance.mahalanobis(pt, W[:, i], cov)
        max_i = max(range(len(min_dists)), key=min_dists.__getitem__)
        if dist < min_dists[max_i]:
            min_dists[max_i] = dist
            min_is[max_i] = i
    return min_is
def fit(self, X):
    """Prints some summary stats (if verbose is on) and returns the indices
    of what it considers to be extreme"""
    self.mcd.fit(X)
    d = np.array([distance.mahalanobis(p, self.mcd.location_, self.mcd.precision_)
                  for p in X])  # Mahalanobis distance values
    self.d2 = d ** 2  # MD squared
    n, self.degrees_of_freedom_ = X.shape
    self.iextreme_values = (self.d2 > self.chi2.ppf(0.995, self.degrees_of_freedom_))
    if self.verbose:
        print("%.3f proportion of outliers at %.3f%% chi2 percentile,"
              % (self.iextreme_values.sum() / float(n), self.chi2_percentile))
        print("with support fraction %.2f." % self.support_fraction)
    return self
def mah_dist(self, new_pat, old_pats, min_limit):
    old_gaps = []
    new_gap = self.convert_pattern_to_gaps(new_pat)
    for pat in old_pats:
        old_gaps.append(self.convert_pattern_to_gaps(pat))
    old_arr = numpy.array(old_gaps)
    cov = numpy.cov(old_arr, rowvar=0)
    cov_inv = numpy.linalg.pinv(cov)
    mean = numpy.mean(old_gaps, axis=0)
    dist = numpy.sqrt(distance.mahalanobis(new_gap, mean, cov_inv))
    print('dist = ', dist)
    return True if dist < min_limit else False
def SetCOINdist_2D(self, fit, distances, means, dist_type):
    # loop over the population fitness values
    count = 0
    # calculate the (pseudo-)inverse covariance for the Mahalanobis case
    if dist_type == "mahalanobis":
        x = []
        y = []
        for i in fit:
            x.append(i[0][0])
            y.append(i[0][1])
        covar = np.linalg.pinv(np.cov(np.array(x), np.array(y)))
    for f in fit:
        # distance to the nearest mean
        shortest = float("inf")
        for i in means:
            if dist_type == "euclidean":
                a = distance.euclidean(i, f[0])
            else:
                a = distance.mahalanobis(i, f[0], covar)
            if a <= shortest:
                shortest = a
        # the shortest distance to one of the points
        distances[count] = shortest
        count += 1
def module3(self):
    # define the vectors
    vector1 = np.array([1, 1]).astype(np.float64)
    vector2 = np.array([2, 3]).astype(np.float64)
    con = np.vstack((vector1, vector2))

    # compute the inverse matrix
    # vi is the (inverse) covariance matrix
    vi = np.linalg.inv(con.T)
    print(vi)

    vector_norm = distances.mahalanobis(vector1, vector2, vi)
    print(vector_norm)
def test_mah_dist(true_pat, false_pat, err_bound):
    n_error_true = 0
    n_error_false = 0
    for i, t_pat in enumerate(true_pat):
        ex_pat = [p for j, p in enumerate(true_pat) if i != j]
        m = np.mean(ex_pat, axis=0)
        cov = np.cov(ex_pat, rowvar=0)
        inv_cov = np.linalg.pinv(cov)
        mah_dist = dist.mahalanobis(t_pat, m, inv_cov)
        if mah_dist > err_bound:
            n_error_true += 1

    m = np.mean(true_pat, axis=0)
    cov = np.cov(true_pat, rowvar=0)
    inv_cov = np.linalg.pinv(cov)
    for f_pat in false_pat:
        mah_dist = dist.mahalanobis(f_pat, m, inv_cov)
        if mah_dist < err_bound:
            n_error_false += 1

    frr = float(n_error_true) / float(len(true_pat))
    far = float(n_error_false) / float(len(false_pat))
    return far, frr
def opt_alpha(alpha):
    e1 = alpha[0] * v_eff + alpha[1] * w_eff
    e2 = alpha[2] * v_eff + alpha[3] * w_eff
    v = v_eff + np.random.normal(e1)
    w = w_eff + np.random.normal(e2)
    all_v = np.vstack((v, w))
    cov_mat = np.cov(wld_vel.T)
    total_dist = 0.0
    for i in range(len(all_v.T)):
        total_dist += sp_dist.mahalanobis(all_v.T[i], wld_vel[i],
                                          np.linalg.inv(cov_mat))
    return total_dist
def find_best_position(self, sampled_profile, point_index):
    '''
    :param sampled_profile: Length 2m + 1
    :param model: Length 2k + 1
    :return:
    '''
    model_length = len(self.means_points_model[point_index])
    sampled_profile_length = len(sampled_profile)
    min_value = float("inf")
    min_index = 0
    for i in range(0, sampled_profile_length - model_length + 1):
        sampled_profile_part = sampled_profile[i:i + model_length]
        distance = mahalanobis(sampled_profile_part,
                               self.means_points_model[point_index],
                               self.inverse_covariance_points_model[point_index])
        if distance < min_value:
            min_value = distance
            min_index = i
    return self.k + min_index
def mahala_fcn(x, y):
    '''
    Parameters
    ----------
    x - numpy.ndarray
        A 1D array
    y - numpy.ndarray
        A 1D array
    '''
    cov = np.cov(np.column_stack((x, y)))
    try:
        icov = np.linalg.inv(cov)
    except np.linalg.LinAlgError:
        # regularise a singular covariance with a small diagonal term
        icov = np.linalg.inv(cov + 1e-3 * np.eye(cov.shape[0]))
    val = mahalanobis(x, y, icov)
    return np.sqrt(val)
def init_d2_weighting(data, num_comp):
    num_obs = data.shape[0]
    cov_inv = np.linalg.inv(np.cov(data, rowvar=0))

    select_prob = np.ones(num_obs) / num_obs
    shortest_dist = np.inf * np.ones(num_obs)
    centroid = np.zeros(num_comp, dtype=int)  # indices of the chosen data points

    for k in range(num_comp):
        # Select a random data point as centroid
        centroid[k] = np.flatnonzero(multinomial(1, select_prob))[0]
        # Recompute distances
        for i, d in enumerate(shortest_dist):
            d_new = mahalanobis(data[centroid[k], :], data[i, :], cov_inv)
            if d_new < d:
                shortest_dist[i] = d_new
        select_prob = normalize_logspace(
            shortest_dist.reshape(1, len(shortest_dist)) ** 2)
        select_prob = select_prob.flatten()

    return centroid
import numpy as np
import pylab as pl
import scipy.spatial.distance as dist


def plotSamples(x, y, z=None):
    stars = np.matrix([[3., -2., 0.], [3., 2., 0.]])
    if z is not None:
        x, y = z * np.matrix([x, y])
        stars = z * stars
    pl.scatter(x, y, s=10)  # plot the Gaussian random points
    pl.scatter(np.array(stars[0]), np.array(stars[1]),
               s=200, marker='*', color='r')  # plot the three reference points
    pl.axhline(linewidth=2, color='g')  # draw the x axis
    pl.axvline(linewidth=2, color='g')  # draw the y axis
    pl.axis('equal')
    pl.axis([-5, 5, -5, 5])
    pl.show()


# generate Gaussian-distributed random points
mean = [0, 0]            # mean
cov = [[2, 1], [1, 2]]   # covariance
x, y = np.random.multivariate_normal(mean, cov, 1000).T
plotSamples(x, y)

covMat = np.matrix(np.cov(x, y))   # covariance matrix of x and y
Z = np.linalg.cholesky(covMat).I   # affine (whitening) matrix
plotSamples(x, y, Z)

# Mahalanobis distances
print('\nMahalanobis distances to the origin:')
print(dist.mahalanobis([0, 0], [3, 3], covMat.I),
      dist.mahalanobis([0, 0], [-2, 2], covMat.I))

# Euclidean distances after the transform
dots = (Z * np.matrix([[3, -2, 0], [3, 2, 0]])).T
print('\nEuclidean distances to the origin after the transform:')
print(dist.minkowski([0, 0], np.array(dots[0]), 2),
      dist.minkowski([0, 0], np.array(dots[1]), 2))
def dbScanDistance(a, b):
    # scipy expects the *inverse* covariance; with only two points the
    # covariance is singular, so use the pseudo-inverse
    covInv = np.linalg.pinv(np.cov(np.vstack((a, b)).T))
    return mahalanobis(a, b, covInv)
def y_vec(self, centers, w):
    return np.array([
        mahalanobis(w, centers[label], np.linalg.inv(self.Gammas[label])) +
        self.noise.fabric(self.iteration_num)
        for label in range(self.n_clusters)
    ])
example_dist[label]['i_cov'] = cov_.precision_
print('Inverted covariance estimated...')

# Get predictions of target dataset (unlabelled)
# We simply apply the classifier to the target dataset
classifier_prediction_tar = results['predictions']

mahalanobis_values = np.zeros_like(ds_tar.targets, dtype=float)
distances = dict()

# For each class, compute the distance of samples from the class distribution
for c in np.unique(classifier_prediction_tar):
    distances[c] = []
    for j, ex in enumerate(ds_tar.samples):
        dist_ = mahalanobis(example_dist[c]['mean'], ex, example_dist[c]['i_cov'])
        distances[c].append(dist_)
        # If the class is the same as the classifier prediction we store it.
        # TODO: store only one vector and filter it afterwards instead of
        # keeping two arrays
        if c == classifier_prediction_tar[j]:
            mahalanobis_values[j] = dist_
    distances[c] = np.array(distances[c]) ** 2

# The squared Mahalanobis distance approximately follows a chi-square
# distribution with degrees of freedom equal to the number of features.
mahalanobis_values = np.array(mahalanobis_values) ** 2
def cluster_decision(self, point):
    return np.argmin(
        [mahalanobis(self.cluster_centers_[label], point,
                     np.linalg.inv(self.Gammas[label]))
         for label in range(self.n_clusters)])
def gaussian_weights(bundle, n_points=100, return_mahalnobis=False,
                     stat=np.mean):
    """
    Calculate weights for each streamline/node in a bundle, based on a
    Mahalanobis distance from the core of the bundle, at that node (mean,
    per default).

    Parameters
    ----------
    bundle : Streamlines
        The streamlines to weight.
    n_points : int, optional
        The number of points to resample to. *If the `bundle` is an array,
        this input is ignored*. Default: 100.

    Returns
    -------
    w : array of shape (n_streamlines, n_points)
        Weights for each node in each streamline, calculated as its relative
        inverse of the Mahalanobis distance, relative to the distribution of
        coordinates at that node position across streamlines.
    """
    # Resample to same length for each streamline:
    bundle = set_number_of_points(bundle, n_points)

    # This is the output
    w = np.zeros((len(bundle), n_points))

    # If there's only one fiber here, it gets the entire weighting:
    if len(bundle) == 1:
        if return_mahalnobis:
            return np.array([np.nan])
        else:
            return np.array([1])

    for node in range(n_points):
        # This should come back as a 3D covariance matrix with the spatial
        # variance covariance of this node across the different streamlines.
        # This is a 3-by-3 array:
        node_coords = bundle.data[node::n_points]
        c = np.cov(node_coords.T, ddof=0)
        # Reorganize as an upper-triangular matrix for expected Mahalanobis
        # input:
        c = np.array([[c[0, 0], c[0, 1], c[0, 2]],
                      [0, c[1, 1], c[1, 2]],
                      [0, 0, c[2, 2]]])
        # Calculate the mean or median of this node as well
        m = stat(node_coords, 0)
        # Weights are the inverse of the Mahalanobis distance
        for fn in range(len(bundle)):
            # In the special case where all the streamlines have the exact
            # same coordinate in this node, the covariance matrix is all
            # zeros, so we can't calculate the Mahalanobis distance; we will
            # instead give each streamline an identical weight, equal to the
            # number of streamlines:
            if np.allclose(c, 0):
                w[:, node] = len(bundle)
                break
            # Otherwise, go ahead and calculate Mahalanobis for node on
            # fiber[fn]:
            w[fn, node] = mahalanobis(node_coords[fn], m, np.linalg.inv(c))
    if return_mahalnobis:
        return w
    # weighting is inverse to the distance (the further you are, the less
    # you should be weighted)
    w = 1 / w
    # Normalize before returning, so that the weights in each node sum to 1:
    return w / np.sum(w, 0)
precision = np.load('precision_test.npy')

# Needed for finding the best conformer
lowest_dist = float('inf')
lowest_pose = []  # Best pose, saved for later output

for i in range(n_cycles):
    params_new = tecto.update_get_connection()
    params[i] = params_new
    # Find the distance to the ideal tetraloop-receptor params
    diff = PARAMS_TLR - params_new
    # Make sure the diff angles are within (-pi, pi]
    for k in range(3, 6):  # index 3,4,5 are angles, others are distances
        if diff[k] > np.pi:
            diff[k] -= 2 * np.pi
        elif diff[k] <= -np.pi:
            diff[k] += 2 * np.pi
    dist = mahalanobis(diff, np.zeros(6), precision)
    if dist < lowest_dist:
        lowest_dist = dist
        lowest_pose = [tecto.pose1.copy(), tecto.pose2.copy()]

###### Likelihood Computation ######
# Fold the angles in params into the proper range, such that
# they are centered at the mean.
N_CYCLE_FOLD_ANGLE = 10
for j in range(N_CYCLE_FOLD_ANGLE):
    mean = np.mean(params, axis=0)
    for i in range(3, 6):  # index 3,4,5 are angles, others are distances
        params[:, i][params[:, i] > mean[i] + np.pi] -= 2 * np.pi
        params[:, i][params[:, i] < mean[i] - np.pi] += 2 * np.pi
        if PARAMS_TLR[i] > mean[i] + np.pi:
            PARAMS_TLR[i] += 2 * np.pi
def remove_outliers(treeList, strategy, outpath, e, summary):
    print("the strategy is: " + strategy)
    if len(treeList) < 10:
        print("number of trees is " + str(len(treeList)) +
              ". This is not enough for outlier removal!")
        return treeList
    if strategy == "consensus10" or strategy == "consensus3":
        ftmp = findMRL(treeList, e, outpath, summary)
        ref_tree = dendropy.Tree.get(path=ftmp, schema="newick")
        treeList.append(ref_tree)
        d = list()
        for tree in treeList:
            tree.encode_bipartitions()
            ref_tree.encode_bipartitions()
            res = treecompare.false_positives_and_negatives(ref_tree, tree)
            d.append(res[1])
        if strategy == "consensus3":
            mean = np.mean(d)
            print("the mean distance to consensus tree was: " + str(mean))
            st = np.std(d)
            print("the std of distances to consensus tree was: " + str(st))
            for i in range(len(d) - 1, 0, -1):
                if d[i] > mean + 2.0 * st:
                    print("deleting " + str(i) + "th tree!")
                    print("d[i] to delete: " + str(d[i]))
                    del treeList[i]
        else:
            sortIdx = np.argsort(d, 0)
            print(len(sortIdx))
            print(sortIdx)
            m = int(len(sortIdx) / 4.0)
            print("deleting " + str(m) + " of the trees")
            idx = sorted([x for x in sortIdx[len(sortIdx) - m:len(sortIdx)]],
                         reverse=True)
            print(idx)
            print(d)
            for i in idx:
                print("deleting tree " + str(i) +
                      ". The distance to consensus tree was: " + str(d[i]))
                del treeList[i]
    elif strategy == "pairwise1" or strategy == "pairwise2" or strategy == "pairwise3":
        D = np.ndarray(shape=(len(treeList), len(treeList)), dtype=float)
        for i in range(0, len(treeList)):
            D[i][i] = 0.0
            for j in range(i + 1, len(treeList)):
                tree1 = treeList[i]
                tree2 = treeList[j]
                tree1.encode_bipartitions()
                tree2.encode_bipartitions()
                res1 = treecompare.false_positives_and_negatives(tree1, tree2)
                D[i][j] = res1[1]
                D[j][i] = res1[0]
        if strategy == "pairwise1":
            d = np.mean(D, 1)
            C = np.cov(D)
            v = [distance.mahalanobis(D[:, i], d, C)
                 for i in range(0, len(treeList))]
            print(v)
            sortIdx = np.argsort(v, 0)
            m = int(len(sortIdx) * 0.15)
            idx = sorted([x for x in sortIdx[len(sortIdx) - m:len(sortIdx)]],
                         reverse=True)
            for i in idx:
                print("deleting tree " + str(i) +
                      ". The distance to consensus tree was: " + str(v[i]))
                del treeList[i]
        elif strategy == "pairwise3":
            d = np.mean(D, 0)
            sortIdx = np.argsort(d, 0)
            print(len(sortIdx))
            print(sortIdx)
            m = int(len(sortIdx) / 5.0)
            print("deleting " + str(m) + " of the trees")
            idx = sorted([x for x in sortIdx[len(sortIdx) - m:len(sortIdx)]],
                         reverse=True)
            print(idx)
            print(d)
            for i in idx:
                print("deleting tree " + str(i) +
                      ". The distance to consensus tree was: " + str(d[i]))
                del treeList[i]
        else:
            d = np.mean(D, 0)
            print(d)
            mean = np.mean(d)
            st = np.std(d)
            for k in range(len(d) - 1, 0, -1):
                if d[k] > mean + 1.5 * st:
                    print("deleting tree " + str(k) +
                          ". The distance to consensus tree was: " + str(d[k]))
                    del treeList[k]
    return treeList