def test_random_projection_embedding_quality():
    data, _ = make_sparse_random_data(8, 5000, 15000)
    eps = 0.2

    original_distances = euclidean_distances(data, squared=True)
    original_distances = original_distances.ravel()
    non_identical = original_distances != 0.0

    # remove 0 distances to avoid division by 0
    original_distances = original_distances[non_identical]

    for RandomProjection in all_RandomProjection:
        rp = RandomProjection(n_components='auto', eps=eps, random_state=0)
        projected = rp.fit_transform(data)

        projected_distances = euclidean_distances(projected, squared=True)
        projected_distances = projected_distances.ravel()

        # remove 0 distances to avoid division by 0
        projected_distances = projected_distances[non_identical]

        distances_ratio = projected_distances / original_distances

        # check that the automatically tuned values for the density respect
        # the contract for eps: pairwise distances are preserved according
        # to the Johnson-Lindenstrauss lemma
        assert_less(distances_ratio.max(), 1 + eps)
        assert_less(1 - eps, distances_ratio.min())
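# A side note on the eps contract tested above: scikit-learn exposes the
# Johnson-Lindenstrauss bound directly, so the auto-tuned n_components can be
# inspected without fitting anything. A minimal sketch (illustrative numbers):
#   from sklearn.random_projection import johnson_lindenstrauss_min_dim
#   johnson_lindenstrauss_min_dim(n_samples=8, eps=0.2)  # min safe n_components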
def do_stuff(dataset=None, metric=True, drtype="mds", components=2):
    data_for_mds = np.array(dataset)
    if drtype:
        if drtype == "mds":
            mds = manifold.MDS(n_components=components, n_init=10,
                               max_iter=3000, dissimilarity="euclidean",
                               n_jobs=1, metric=metric)
            mds_result = mds.fit(data_for_mds)
        elif drtype == "pca":
            pca = PCA(n_components=2)
            # fit and project the data directly; fitting on the distance
            # matrix and then transforming the raw data would fail whenever
            # n_samples != n_features
            mds_result = pca.fit_transform(data_for_mds)
        elif drtype == "tsne":
            model = manifold.TSNE(n_components=2, random_state=0,
                                  learning_rate=1000, early_exaggeration=10.0)
            mds_result = model.fit_transform(data_for_mds)

    clusterings = {}
    for i in range(10, 1, -1):
        clustering = ac(n_clusters=i, memory=mkdtemp())
        clusterings[i] = clustering.fit(data_for_mds).labels_.tolist()
    clustering = ac(n_clusters=1, memory=mkdtemp())
    clustering.fit(data_for_mds)

    output = {
        "drInfo": None,
        "embedding": None,
        "clustering": {
            "tree": clustering.children_.tolist(),
            "labels": clusterings
        }
    }

    if drtype:
        median_distance = False
        stress1 = False
        raw_stress = False
        if drtype == "mds":
            raw_stress = mds_result.stress_
            disparities = euclidean_distances(data_for_mds)
            disparityHalfMatrix = np.triu(disparities)
            sumSquaredDisparities = np.sum(np.square(disparityHalfMatrix))
            stress1 = math.sqrt(mds_result.stress_ / sumSquaredDisparities)
            median_distance = np.median(
                euclidean_distances(mds_result.embedding_))
            embedding = mds_result.embedding_.tolist()
            print(mds_result.stress_)
        else:
            embedding = mds_result.tolist()
        output["drInfo"] = {
            "type": drtype,
            "metric": metric,
            "components": components,
            "stress1": stress1,
            "rawStress": raw_stress,
            "medianDistance": median_distance
        }
        output["embedding"] = embedding

    return output
def agregation(Entity_struc, Entity_Tex):
    # Iteratively re-weight and mix the two evidence matrices until they
    # converge to a common value.
    while not (Entity_struc == Entity_Tex).all():
        # inverse-distance weights (the 0.01 offset avoids division by zero)
        w1_1 = 1.0 / (0.01 + euclidean_distances(Entity_struc, Entity_struc))
        w1_2 = 1.0 / (0.01 + euclidean_distances(Entity_struc, Entity_Tex))
        # closed form of the original symbolic step solve((w1_1 + w1_2)*x - 1, x):
        # f is the scale that makes the two weights sum to one
        f = 1.0 / (w1_1 + w1_2)
        p1_1 = f * w1_1
        p1_2 = f * w1_2
        Entity_struc = (p1_1 * Entity_struc) + (p1_2 * Entity_Tex)
        Entity_Tex = (p1_2 * Entity_struc) + (p1_1 * Entity_Tex)
    # note: exact float equality may never be reached; np.allclose would be a
    # safer stopping test in practice
    Agregated_evidence = Entity_Tex
    return Agregated_evidence
def estimate_X_test():
    n = 50
    random_state = np.random.RandomState(42)
    X_true = random_state.rand(n, 3)
    dis = euclidean_distances(X_true)
    alpha, beta = -3., 1.
    counts = beta * dis ** alpha
    counts = np.triu(counts)
    counts[np.arange(len(counts)), np.arange(len(counts))] = 0
    counts = sparse.coo_matrix(counts)

    X = mds.estimate_X(counts, random_state=random_state)
    assert_array_almost_equal(dis, euclidean_distances(X), 2)
def test_negative_binomial_gradient_sparse_dispersed():
    n = 10
    random_state = np.random.RandomState(42)
    X = random_state.rand(n, 3)
    dis = euclidean_distances(X)
    alpha, beta = -3, 1
    fdis = beta * dis ** alpha
    fdis[np.isinf(fdis)] = 1
    dispersion = fdis + fdis ** 2
    p = fdis / (fdis + dispersion)
    counts = random_state.negative_binomial(dispersion, 1 - p)
    counts = np.triu(counts)
    counts[np.arange(len(counts)), np.arange(len(counts))] = 0
    counts = sparse.coo_matrix(counts, dtype=float)

    # Test disabled here; everything below is unreachable.
    return True

    # from minorswing import dispersion
    mean, variance = dispersion.compute_mean_variance(
        counts, np.array([counts.shape[0]]))
    mean, variance = mean[:-1], variance[:-1]
    d = dispersion.DispersionPolynomial()
    d.fit(mean, variance)

    gradient_sparse = negative_binomial_structure.negative_binomial_gradient(
        X, counts, dispersion=d)
def test_negative_binomial_obj_sparse_dispersion_biased():
    n = 10
    random_state = np.random.RandomState(42)
    X = random_state.rand(n, 3)
    dis = euclidean_distances(X)
    alpha, beta = -3, 1
    counts = beta * dis ** alpha

    # Test disabled here; everything below is unreachable.
    return True

    from minorswing import dispersion
    mean, variance = dispersion.compute_mean_variance(
        counts ** 2, np.array([counts.shape[0]]))
    mean, variance = mean[:-1], variance[:-1]
    d = dispersion.Dispersion()
    d.fit(mean, variance)

    counts = np.triu(counts)
    counts[np.arange(len(counts)), np.arange(len(counts))] = 0
    counts = sparse.coo_matrix(counts)

    obj = negative_binomial_structure.negative_binomial_obj(
        X, counts, dispersion=d, alpha=alpha, beta=beta)
    obj_ = negative_binomial_structure.negative_binomial_obj(
        random_state.rand(*X.shape), counts, dispersion=d,
        alpha=alpha, beta=beta)
    assert obj < obj_
def test_estimate_X_biased_dispersion():
    n = 50
    random_state = np.random.RandomState(42)
    X_true = random_state.rand(n, 3)
    dis = euclidean_distances(X_true)
    alpha, beta = -3, 1
    fdis = beta * dis ** alpha
    fdis[np.isinf(fdis)] = 1
    dispersion = fdis + fdis ** 2
    p = fdis / (fdis + dispersion)
    counts = random_state.negative_binomial(dispersion, 1 - p)
    counts = np.triu(counts)
    counts[np.arange(len(counts)), np.arange(len(counts))] = 0
    # the np.float alias was removed in NumPy >= 1.24; use the builtin
    counts = sparse.coo_matrix(counts, dtype=float)
    lengths = np.array([counts.shape[0]])

    # Test disabled here; everything below is unreachable.
    return True

    from minorswing import dispersion
    mean, variance = dispersion.compute_mean_variance(counts, lengths)
    mean, variance = mean[:-1], variance[:-1]
    d = dispersion.DispersionPolynomial()
    d.fit(mean, variance)

    X = negative_binomial_structure.estimate_X(counts, alpha, beta,
                                               dispersion=d,
                                               random_state=random_state)
def test_estimate_X():
    n = 50
    random_state = np.random.RandomState(42)
    X_true = random_state.rand(n, 3)
    dis = euclidean_distances(X_true)
    alpha, beta = -3, 1
    counts = beta * dis ** alpha
    counts = np.triu(counts)
    counts[np.arange(len(counts)), np.arange(len(counts))] = 0
    counts = sparse.coo_matrix(counts)

    X = negative_binomial_structure.estimate_X(counts, alpha, beta,
                                               random_state=random_state)
    assert_array_almost_equal(dis, euclidean_distances(X), 2)
def plotMap(maparr, freq, nest, seqs, dbfile, map2d, outfile, plotm='T'):
    # multi-dimensional scaling (np.matrix is deprecated; use a plain array)
    similarities = euclidean_distances(np.asarray(maparr))
    mds = MDS(n_components=2, max_iter=3000, eps=1e-9,
              random_state=np.random.RandomState(seed=3),
              dissimilarity="precomputed", n_jobs=1)
    pos = mds.fit(similarities).embedding_

    # plot attributes
    N = len(pos)
    # size = [20 * n for n in freq]
    size = 8000
    color = np.array(range(N))

    if str(plotm) == 'T':
        # plot MDS
        fig, ax = plt.subplots(figsize=(10, 10))
        warnings.filterwarnings("ignore")
        scatter = ax.scatter(np.array(pos[:, 0]), np.array(pos[:, 1]),
                             c=color, s=size, alpha=0.3,
                             cmap=plt.cm.viridis, marker='s')
        plt.xlabel('Dimension 1', fontsize=20, labelpad=20)
        plt.ylabel('Dimension 2', fontsize=20, labelpad=20)
        # plt.axis([xmin, xmax, ymin, ymax])
        # modern matplotlib expects booleans rather than 'off' here
        plt.tick_params(labelsize=15, length=14, direction='out', pad=15,
                        top=False, right=False)

        # save figures
        fig.savefig(outfile + '.png', bbox_inches='tight', format='png')
        fig.savefig(outfile + '.pdf', bbox_inches='tight', format='pdf')
        plt.close(fig)
        warnings.resetwarnings()

    # write csv file
    writePlotMDS(freq, nest, seqs, dbfile, pos, maparr, map2d, outfile)
    return pos
def fit(self, X, y=None):
    """Create affinity matrix from negative euclidean distances, then
    apply affinity propagation clustering.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features) or (n_samples, n_samples)
        Data matrix or, if affinity is ``precomputed``, matrix of
        similarities / affinities.
    """
    X = check_array(X, accept_sparse='csr')
    if self.affinity == "precomputed":
        self.affinity_matrix_ = X
    elif self.affinity == "euclidean":
        self.affinity_matrix_ = -euclidean_distances(X, squared=True)
    else:
        raise ValueError("Affinity must be 'precomputed' or "
                         "'euclidean'. Got %s instead" % str(self.affinity))

    self.cluster_centers_indices_, self.labels_, self.n_iter_ = \
        affinity_propagation(
            self.affinity_matrix_, self.preference,
            max_iter=self.max_iter,
            convergence_iter=self.convergence_iter,
            damping=self.damping, copy=self.copy,
            verbose=self.verbose, return_n_iter=True)

    if self.affinity != "precomputed":
        self.cluster_centers_ = X[self.cluster_centers_indices_].copy()

    return self
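# A minimal usage sketch for the ``fit`` above, assuming it belongs to an
# AffinityPropagation-style estimator class (``Clusterer`` is a hypothetical
# name for the surrounding class):
#   import numpy as np
#   X = np.array([[0, 0], [0.1, 0.1], [5, 5], [5.1, 5.1]])
#   model = Clusterer(affinity="euclidean").fit(X)
#   model.labels_            # cluster assignment per row
#   model.cluster_centers_   # exemplar rows of X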
def eval_grad_f_X(X, user_data=None):
    """Evaluate the gradient of the function in X."""
    global niter
    niter += 1
    if not niter % 10:
        X.dump('%d.sol.npy' % niter)
    if VERBOSE:
        print("Poisson exponential model : eval_grad_f_X (evaluation in f X)")

    m, n, counts, alpha, beta, d = user_data
    X = X.reshape((m, n))
    dis = euclidean_distances(X)
    tmp = X.repeat(m, axis=0).reshape((m, m, n))
    dif = tmp - tmp.transpose(1, 0, 2)
    dis = dis.repeat(n).reshape((m, m, n))
    counts = counts.repeat(n).reshape((m, m, n))

    grad = - alpha * beta * dif / dis * (dis ** (alpha - 1)) + \
        counts * alpha * dif / (dis ** 2)
    grad[np.isnan(grad)] = 0
    return - grad.sum(1)
def _get_enemies_dists(data, target):
    """Get the distances to the nearest enemy of each instance in a
    training set.

    Args:
        data: Data values.
        target: Target values.

    Returns:
        Array with the distance to the nearest enemy of each instance.
    """
    # Enemies of each label ('label': [list of enemies])
    enemies = {}
    for label in np.unique(target):  # For every label
        indices = np.nonzero(label != target)[0]
        enemies[label] = data[indices].copy()

    # Compute the distance to the nearest enemy of each instance
    dists = np.zeros(len(data))
    for p in range(len(data)):
        # reshape to 2-D: scikit-learn no longer accepts 1-D inputs here
        enemies_dists = metrics.euclidean_distances(
            data[p].reshape(1, -1), enemies[target[p]])
        nearest_enemy_dist = enemies_dists.min()
        dists[p] = nearest_enemy_dist
    return dists
def query(self, query):
    c = self.c
    m = self.m
    query = np.array(query)

    # descend phase
    max_depth = 0
    for i in range(len(self.trees)):
        bin_query = self._hash(query, self.hash_functions[i])
        k = self.trees[i].find_prefix_match(bin_query)
        if k > max_depth:
            max_depth = k

    # asynchronous ascend phase
    candidates = list()
    number_of_candidates = c * len(self.trees)
    while max_depth > 0 and (len(candidates) < number_of_candidates
                             or len(set(candidates)) < m):
        for i in range(len(self.trees)):
            bin_query = self._hash(query, self.hash_functions[i])
            candidates.extend(self.trees[i].query(bin_query, max_depth))
        max_depth = max_depth - 1

    if len(candidates) == 0:
        candidates = range(len(self.xs))
    candidates = np.array(list(set(candidates)))
    if self.debug:
        print('md:', max_depth)
        print('c:', candidates)
    # reshape the query to 2-D for euclidean_distances
    distances = euclidean_distances(query.reshape(1, -1),
                                    self.xs[candidates])
    return sorted(zip(distances[0], candidates))[:self.m]
def betacv_simple(data, labels, size=3000, metric='euclidean'):
    n = labels.shape[0]
    n_slices = ceil(n / size)
    intra = 0
    inter = 0
    n_in = 0
    n_out = 0
    last = 0

    labels_unq = np.unique(labels)
    members = np.array([member_count(labels, i) for i in labels_unq])
    N_in = np.array([i * (i - 1) for i in members])
    n_in = np.sum(N_in)
    N_out = np.array([i * (n - i) for i in members])
    n_out = np.sum(N_out)

    for i in range(n_slices):
        x = data[last:(last + size), :]
        distances = euclidean_distances(x, data)
        j_range = min(size, n - size * i)
        A = np.array([intra_cluster_distance(distances[j], labels, j + last)
                      for j in range(j_range)])
        B = np.array([inter_cluster_distance(distances[j], labels, j + last)
                      for j in range(j_range)])
        intra += np.sum(A)
        inter += np.sum(B)
        last += size

    betacv = (intra / n_in) / (inter / n_out)
    print('simple intra:', intra)
    print('simple inter:', inter)
    print('simple n_in :', n_in)
    print('simple n_out:', n_out)
    return betacv
def fit_transform(self, X, y=None, init=None):
    """Fit the data from X, and return the embedded coordinates.

    Parameters
    ----------
    X : array, shape=[n_samples, n_features], or [n_samples, n_samples] \
            if dissimilarity='precomputed'
        Input data.

    init : {None or ndarray, shape (n_samples,)}, optional
        If None, randomly chooses the initial configuration;
        if ndarray, initialize the SMACOF algorithm with this array.
    """
    X = check_array(X)
    if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed":
        warnings.warn("The MDS API has changed. ``fit`` now constructs a"
                      " dissimilarity matrix from data. To use a custom "
                      "dissimilarity matrix, set "
                      "``dissimilarity='precomputed'``.")

    if self.dissimilarity == "precomputed":
        self.dissimilarity_matrix_ = X
    elif self.dissimilarity == "euclidean":
        self.dissimilarity_matrix_ = euclidean_distances(X)
    else:
        raise ValueError("Proximity must be 'precomputed' or 'euclidean'."
                         " Got %s instead" % str(self.dissimilarity))

    self.embedding_, self.stress_, self.n_iter_, self.last_n_embeddings = \
        smacof_dispatch(self.config, self.variant,
                        self.dissimilarity_matrix_, metric=self.metric,
                        n_components=self.n_components, init=init,
                        n_init=self.n_init, n_jobs=self.n_jobs,
                        max_iter=self.max_iter, verbose=self.verbose,
                        eps=self.eps, random_state=self.random_state,
                        return_n_iter=True)

    return self.embedding_, self.last_n_embeddings
def test_affinity_propagation_equal_mutual_similarities():
    X = np.array([[-1, 1], [1, -1]])
    S = -euclidean_distances(X, squared=True)

    # setting preference > similarity
    cluster_center_indices, labels = assert_warns_message(
        UserWarning, "mutually equal", affinity_propagation, S, preference=0)

    # expect every sample to become an exemplar
    assert_array_equal([0, 1], cluster_center_indices)
    assert_array_equal([0, 1], labels)

    # setting preference < similarity
    cluster_center_indices, labels = assert_warns_message(
        UserWarning, "mutually equal", affinity_propagation, S,
        preference=-10)

    # expect one cluster, with arbitrary (first) sample as exemplar
    assert_array_equal([0], cluster_center_indices)
    assert_array_equal([0, 0], labels)

    # setting different preferences
    cluster_center_indices, labels = assert_no_warnings(
        affinity_propagation, S, preference=[-20, -10])

    # expect one cluster, with highest-preference sample as exemplar
    assert_array_equal([1], cluster_center_indices)
    assert_array_equal([0, 0], labels)
def _cluster_variance(cls, num_points, clusters, centroids):
    s = 0
    denom = float(num_points - len(centroids))
    for cluster, centroid in zip(clusters, centroids):
        distances = euclidean_distances(cluster, centroid)
        s += (distances * distances).sum()
    return s / denom
def run(self):
    """Implement method from ISABase."""
    sel = np.zeros(len(self._x), bool)   # Mask of selected instances (none)
    aval = np.ones(len(self._x), bool)   # Mask of available instances (all)

    # Calculate distances to nearest enemies
    enemy_dists = self._get_enemies_dists(self._x, self._y)

    # For every unique label
    for l in np.unique(self._y):
        while True:
            # Get available instances with label `l`
            candidates = (aval & (self._y == l)).nonzero()[0]
            candidates_dists = enemy_dists[candidates]
            if len(candidates_dists) == 0:
                break

            # Choose the candidate with the smallest distance to its enemy
            candidate = candidates[candidates_dists.argmin()]
            sel[candidate] = True    # Mark candidate as selected
            aval[candidate] = False  # Mark candidate as unavailable

            rest = candidates[candidates != candidate]  # rest of candidates
            # Work out the distances from `candidate` to `rest`
            # (reshape to 2-D for scikit-learn)
            rest_dists = metrics.euclidean_distances(
                self._x[candidate].reshape(1, -1), self._x[rest])[0]

            # Pick instances closer to the candidate than the candidate's
            # nearest enemy
            picked_candidates = rest[rest_dists < enemy_dists[candidate]]
            # Mark picked candidates as unavailable
            aval[picked_candidates] = False

    self._sel = sel
def histogram_colors_strict(lab_array, palette, plot_filename=None):
    """Return a palette histogram of colors in the image.

    Parameters
    ----------
    lab_array : (N,3) ndarray
        The L*a*b color of each of N pixels.
    palette : rayleigh.Palette
        Containing K colors.
    plot_filename : string, optional
        If given, save histogram to this filename.

    Returns
    -------
    color_hist : (K,) ndarray
    """
    # This is the fastest way that I've found.
    # >>> %%timeit -n 200 from sklearn.metrics import euclidean_distances
    # >>> euclidean_distances(palette, lab_array, squared=True)
    dist = euclidean_distances(palette.lab_array, lab_array, squared=True).T
    min_ind = np.argmin(dist, axis=1)
    num_colors = palette.lab_array.shape[0]
    num_pixels = lab_array.shape[0]
    color_hist = 1. * np.bincount(min_ind, minlength=num_colors) / num_pixels
    if plot_filename is not None:
        plot_histogram(color_hist, palette, plot_filename)
    return color_hist
def wordMoverDistance(d1, d2):
    # d1, d2: lists of tokens
    # Rule out words that are not in the vocabulary
    d1 = " ".join([w for w in d1 if w in vocab_dict])
    d2 = " ".join([w for w in d2 if w in vocab_dict])

    vect = CountVectorizer().fit([d1, d2])
    # get_feature_names() was removed in scikit-learn 1.2
    feature_names = vect.get_feature_names_out()
    W_ = W[[vocab_dict[w] for w in feature_names]]  # word embedding matrix
    D_ = euclidean_distances(W_)                    # distance matrix
    D_ = D_.astype(np.double)
    # D_ /= D_.max()  # normalize for comparison

    v_1, v_2 = vect.transform([d1, d2])
    v_1 = v_1.toarray().ravel()
    v_2 = v_2.toarray().ravel()

    # EMD expects double-precision probability vectors
    v_1 = v_1.astype(np.double)
    v_2 = v_2.astype(np.double)
    v_1 /= v_1.sum()
    v_2 /= v_2.sum()
    emd_d = emd(v_1, v_2, D_)  # Word Mover's Distance via EMD
    return emd_d
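# A minimal usage sketch for wordMoverDistance, assuming the module-level
# globals it relies on: ``vocab_dict`` (word -> row index), ``W`` (embedding
# matrix), and ``emd`` (e.g. pyemd's solver):
#   doc1 = "obama speaks media illinois".split()
#   doc2 = "president greets press chicago".split()
#   wordMoverDistance(doc1, doc2)  # smaller value = more similar documents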
def closest(pipeline, records, record, n=10):
    """Find the closest records from the given record.

    :param pipeline: A classification pipeline, as returned by ``train``.
    :param records: Records are expected as a list of dictionaries.
    :param record: Record is expected as a dictionary.
    :param n: The number of closest records to return.

    :return list: The ``n`` closest records.
    """
    transformer = pipeline.steps[0][1]
    # np.object is removed in NumPy >= 1.24; use the builtin ``object``
    X = transformer.transform(np.array(records, dtype=object))
    X_record = transformer.transform(np.array([record], dtype=object))
    # ravel so the indices are plain integers usable on the records list
    top = np.argsort(euclidean_distances(X, X_record), axis=0).ravel()
    return [records[i] for i in top[:n]]
def run_kmeans(inFile, n_colors):
    china = cv2.imread(inFile)
    china = np.array(china, dtype=np.float64) / 255
    w, h, d = original_shape = tuple(china.shape)
    assert d == 3
    image_array = np.reshape(china, (w * h, d))

    print("\tFitting model on a small sub-sample of the data")
    t0 = time()
    image_array_sample = shuffle(image_array, random_state=0)[:1000]
    # ``k`` was the pre-0.11 scikit-learn keyword; modern versions use
    # ``n_clusters``
    kmeans = KMeans(n_clusters=n_colors,
                    random_state=0).fit(image_array_sample)
    print("\tdone in %0.3fs." % (time() - t0))

    # Get labels for all points
    print("\tPredicting color indices on the full image (k-means)")
    t0 = time()
    labels = kmeans.predict(image_array)
    print("\tdone in %0.3fs." % (time() - t0))

    codebook_random = shuffle(image_array, random_state=0)[:n_colors + 1]
    print("\tPredicting color indices on the full image (random)")
    t0 = time()
    dist = euclidean_distances(codebook_random, image_array, squared=True)
    labels_random = dist.argmin(axis=0)
    print("\tdone in %0.3fs." % (time() - t0))

    img_kmeans = recreate_image(kmeans.cluster_centers_, labels, w, h)
    img_random = recreate_image(codebook_random, labels_random, w, h)
    return china, img_kmeans, img_random
def test_affinity_propagation():
    """Affinity Propagation algorithm"""
    # Compute similarities
    S = -euclidean_distances(X, squared=True)
    preference = np.median(S) * 10

    # Compute Affinity Propagation
    cluster_centers_indices, labels = affinity_propagation(
        S, preference=preference)

    n_clusters_ = len(cluster_centers_indices)
    assert_equal(n_clusters, n_clusters_)

    af = AffinityPropagation(preference=preference, affinity="precomputed")
    labels_precomputed = af.fit(S).labels_

    af = AffinityPropagation(preference=preference)
    labels = af.fit(X).labels_

    assert_array_equal(labels, labels_precomputed)

    cluster_centers_indices = af.cluster_centers_indices_
    n_clusters_ = len(cluster_centers_indices)
    assert_equal(np.unique(labels).size, n_clusters_)
    assert_equal(n_clusters, n_clusters_)

    # Test also with no copy
    _, labels_no_copy = affinity_propagation(S, preference=preference,
                                             copy=False)
    assert_array_equal(labels, labels_no_copy)
def complete_linkage(X, connectivity=None, n_clusters=4):
    from sklearn.cluster.hierarchical import _hc_cut
    if connectivity is None:
        d = euclidean_distances(X)
    else:
        connectivity = connectivity.copy()
        # Remove the diagonal
        mask = connectivity.row != connectivity.col
        connectivity.row = connectivity.row[mask]
        connectivity.col = connectivity.col[mask]
        connectivity.data = connectivity.data[mask]
        d_ = X[connectivity.row]
        d_ -= X[connectivity.col]
        d_ **= 2
        d_ = d_.sum(axis=-1)
        # XXX: not necessary: complete_linkage is invariant by increasing
        # function
        d_ = np.sqrt(d_)
        d = connectivity
        d.data = d_
    L = nn_chain_core(d)
    a, b, height = np.array(L).T
    children = np.c_[a, b].astype(int)  # np.int is removed in NumPy >= 1.24
    labels = _hc_cut(n_clusters=n_clusters, children=children,
                     n_leaves=len(X))
    return labels
def euclidean_MDS(data):
    seed = np.random.RandomState(seed=3)
    similarities = euclidean_distances(data)
    mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9,
                       random_state=seed, dissimilarity="precomputed",
                       n_jobs=1)
    pos = mds.fit_transform(similarities)
    return pos
def _poisson_exp_dense(X, counts, alpha, bias, beta=None,
                       use_empty_entries=False):
    m, n = X.shape
    d = euclidean_distances(X)
    if use_empty_entries:
        # np.bool is removed in NumPy >= 1.24; use the builtin
        mask = np.invert(np.tri(m, dtype=bool))
    else:
        mask = np.invert(np.tri(m, dtype=bool)) & (counts != 0) & (d != 0)

    bias = bias.reshape(-1, 1)
    if beta is None:
        beta = counts[mask].sum() / (
            (d[mask] ** alpha) * (bias * bias.T)[mask]).sum()

    g = beta * d ** alpha
    g *= bias * bias.T
    g = g[mask]

    ll = counts[mask] * np.log(beta) + \
        alpha * counts[mask] * np.log(d[mask]) + \
        counts[mask] * np.log(bias * bias.T)[mask]
    ll -= g
    # We are trying to maximise, so we need the opposite of the log likelihood
    if np.isnan(ll.sum()):
        raise ValueError("Objective function is Not a Number")
    return - ll.sum()
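# For reference, the quantity assembled in _poisson_exp_dense is the Poisson
# log-likelihood (up to the constant log(c_ij!) term) under the model
# lambda_ij = beta * d_ij**alpha * bias_i * bias_j:
#   ll = sum_ij c_ij * log(lambda_ij) - lambda_ij
# and the function returns -ll because the caller minimizes.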
def distance_matrix_visualization():
    xx, yy = create_mesh_data(-1, 1.01, 0.2, -1, 1.01, 0.2)
    points = np.array([[x, y] for x, y in zip(xx.ravel(), yy.ravel())])

    distance_matrix_lin = get_distance_matrix(xx, yy, linear_kernel)
    distance_matrix_pol = get_distance_matrix(xx, yy,
                                              get_pol_kernel_closure(10.0))
    distance_matrix_rbf = get_distance_matrix(xx, yy,
                                              get_rbf_kernel_closure(10.0))
    distance_matrix_sig = get_distance_matrix(xx, yy,
                                              get_sigmoid_kernel_closure(0.1))
    distance_matrix_orig = euclidean_distances(points, points)

    plt.figure()
    plt.pcolor(distance_matrix_orig)
    plt.colorbar()

    plt.figure()
    plt.pcolor(distance_matrix_lin)
    plt.colorbar()

    plt.figure()
    plt.pcolor(distance_matrix_pol)
    plt.colorbar()

    plt.figure()
    plt.pcolor(distance_matrix_rbf)
    plt.colorbar()

    plt.figure()
    plt.pcolor(distance_matrix_sig)
    plt.colorbar()

    plt.show()
def __call__(self, track, slice=None):
    # remove WHERE when table cleaned up to remove header rows
    statement = (
        "SELECT transcript_id, TPM, sample_id FROM %(table)s "
        "where transcript_id != 'Transcript'")

    # fetch data
    df = pd.DataFrame.from_dict(self.getAll(statement))
    df = df.pivot('transcript_id', 'sample_id')['TPM']

    # calculate dissimilarities
    similarities = euclidean_distances(df.transpose())

    # run MDS
    mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9,
                       dissimilarity="precomputed", n_jobs=1)
    mds = mds.fit(similarities)
    pos = pd.DataFrame(mds.embedding_)
    pos.columns = ["MD1", "MD2"]
    pos['sample'] = df.columns
    return pos
def test_poisson_exp():
    random_state = np.random.RandomState(seed=42)
    n = 50
    X = random_state.rand(n, 3)
    counts = euclidean_distances(X) ** (-3)
    counts[np.isinf(counts) | np.isnan(counts)] = 0
    eps = poisson_model.poisson_exp(X, counts, -2)
    assert eps < 1e-6
def mds_positions(df, identifier, hash_map):
    # the pairwise matrix is (n_rows, n_rows), so both axes are labelled by
    # the index (labelling columns with df.columns would only fit square data)
    euc = pd.DataFrame(euclidean_distances(df), index=df.index,
                       columns=df.index)
    mds = manifold.MDS(dissimilarity='precomputed', max_iter=3000)
    posdf = pd.DataFrame(mds.fit(euc).embedding_, index=euc.index)
    clf = PCA(n_components=2)
    posdf = pd.DataFrame(clf.fit_transform(posdf), index=posdf.index)
    posdf[identifier] = [hash_map[abb] for abb in posdf.index]
    return posdf
def _cluster_variance(cls, num_points, clusters, centroids):
    s = 0
    num_dims = clusters[0][0].shape[0]
    denom = float(num_points - len(centroids)) * num_dims
    for cluster, centroid in zip(clusters, centroids):
        distances = euclidean_distances(cluster, centroid)
        s += (distances * distances).sum()
    return s / denom
def discr_stat(X, Y, dissimilarity="euclidean", remove_isolates=True,
               return_rdfs=False):
    """Computes the discriminability statistic.

    Parameters
    ----------
    X : array, shape (n_samples, n_features) or (n_samples, n_samples)
        Input data. If dissimilarity=='precomputed', the input should be the
        dissimilarity matrix.
    Y : 1d-array, shape (n_samples)
        Input labels.
    dissimilarity : str, {"euclidean" (default), "precomputed"}
        Dissimilarity measure to use:
        - 'euclidean': Pairwise Euclidean distances between points in the
          dataset.
        - 'precomputed': Pre-computed dissimilarities.
    remove_isolates : bool, optional, default=True
        Whether to remove data that have a single label.
    return_rdfs : bool, optional, default=False
        Whether to return rdf for all data points.

    Returns
    -------
    stat : float
        Discriminability statistic.
    rdfs : array, shape (n_samples, max{len(id)})
        Rdfs for each sample. Only returned if ``return_rdfs==True``.
    """
    check_X_y(X, Y, accept_sparse=True)

    uniques, counts = np.unique(Y, return_counts=True)
    if (counts != 1).sum() <= 1:
        msg = "You have passed a vector containing only a single unique sample id."
        raise ValueError(msg)
    if remove_isolates:
        idx = np.isin(Y, uniques[counts != 1])
        labels = Y[idx]
        if dissimilarity == "euclidean":
            X = X[idx]
        else:
            X = X[np.ix_(idx, idx)]
    else:
        labels = Y

    if dissimilarity == "euclidean":
        dissimilarities = euclidean_distances(X)
    else:
        dissimilarities = X

    rdfs = _discr_rdf(dissimilarities, labels)
    stat = np.nanmean(rdfs)

    if return_rdfs:
        return stat, rdfs
    else:
        return stat
def visualize(reader, visualization_method, value_column, segment_column):
    labels, data = organize_data(reader, visualization_method, value_column,
                                 segment_column)
    if visualization_method == 'hc':
        link = linkage(data)
        dendrogram(link, leaf_label_func=lambda i: labels[i])
        plt.gcf()
        plt.show()
    if visualization_method == 'mds':
        n = len(labels)
        data -= data.mean()
        clf = PCA(n_components=2)
        data = clf.fit_transform(data)
        similarities = euclidean_distances(data)

        # Add noise to the similarities
        noise = np.random.rand(n, n)
        noise = noise + noise.T
        noise[np.arange(noise.shape[0]), np.arange(noise.shape[0])] = 0
        similarities += noise

        fig = plt.figure(1)
        ax = plt.axes([0., 0., 1., 1.])
        similarities = similarities.max() / similarities * 100
        similarities[np.isinf(similarities)] = 0

        plt.scatter(data[:, 0], data[:, 1], c='r', s=20)
        plt.legend('Position', loc='best')

        start_idx, end_idx = np.where(data)
        segments = [[data[i, :], data[j, :]]
                    for i in range(len(data)) for j in range(len(data))]
        values = np.abs(similarities)
        lc = LineCollection(segments, zorder=0, cmap=plt.cm.hot_r,
                            norm=plt.Normalize(0, values.max()))
        lc.set_array(similarities.flatten())
        lc.set_linewidths(0.5 * np.ones(len(segments)))
        ax.add_collection(lc)

        for label, x, y in zip(labels, data[:, 0], data[:, 1]):
            plt.annotate(label, xy=(x, y), xytext=(-20, 20),
                         textcoords='offset points', ha='right', va='bottom',
                         bbox=dict(boxstyle='round,pad=0.5', fc='yellow',
                                   alpha=0.5),
                         arrowprops=dict(arrowstyle='->',
                                         connectionstyle='arc3,rad=0'))
        plt.show()
def test_pairwise_distances_radius_neighbors(
    n_features,
    translation,
    metric,
    strategy,
    n_samples=100,
    dtype=np.float64,
):
    rng = np.random.RandomState(0)
    spread = 1000
    radius = spread * np.log(n_features)
    X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
    Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread

    metric_kwargs = _get_metric_params_list(metric, n_features)[0]

    # Reference for argkmin results
    if metric == "euclidean":
        # Compare to scikit-learn GEMM optimized implementation
        dist_matrix = euclidean_distances(X, Y)
    else:
        dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs)

    # Getting the neighbors for a given radius
    neigh_indices_ref = []
    neigh_distances_ref = []
    for row in dist_matrix:
        ind = np.arange(row.shape[0])[row <= radius]
        dist = row[ind]
        sort = np.argsort(dist)
        ind, dist = ind[sort], dist[sort]
        neigh_indices_ref.append(ind)
        neigh_distances_ref.append(dist)
    neigh_indices_ref = np.array(neigh_indices_ref)
    neigh_distances_ref = np.array(neigh_distances_ref)

    neigh_distances, neigh_indices = PairwiseDistancesRadiusNeighborhood.compute(
        X,
        Y,
        radius,
        metric=metric,
        metric_kwargs=metric_kwargs,
        return_distance=True,
        # So as to have more than a chunk, forcing parallelism.
        chunk_size=n_samples // 4,
        strategy=strategy,
        sort_results=True,
    )

    ASSERT_RESULT[PairwiseDistancesRadiusNeighborhood](
        neigh_distances, neigh_distances_ref,
        neigh_indices, neigh_indices_ref)
def _select_targets(self):
    target_neighbors = np.empty((self.X_.shape[0], self.k), dtype=int)
    for label in self.labels_:
        inds, = np.nonzero(self.label_inds_ == label)
        dd = euclidean_distances(self.X_[inds], squared=True)
        np.fill_diagonal(dd, np.inf)
        nn = np.argsort(dd)[..., :self.k]
        target_neighbors[inds] = inds[nn]
    return target_neighbors
def squared_difference_mean(data1, data2):
    distance = euclidean_distances(data1, data2)
    num = distance.shape[0]
    error = 0
    for i in range(num):
        # only the diagonal is used: the distance between row i of data1
        # and row i of data2
        error += distance[i][i] ** 2
    error_mean = error / num
    return error_mean
def from_points(cls, points: np.ndarray):
    if points.ndim != 2:
        raise ValueError('"points" should have two dimensions.')
    if points.shape[0] < 3:
        raise ValueError('"points" should contain at least 3 points.')
    if points.shape[1] != 3:
        raise ValueError('"points" should be N*3 (x, y, z).')
    distance_matrix_nparray = euclidean_distances(points)
    return cls(distance_matrix_nparray)
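# A minimal usage sketch for ``from_points``, assuming it is a classmethod of
# a wrapper class (``DistanceMatrix`` is a hypothetical name) whose
# constructor accepts the raw pairwise matrix:
#   import numpy as np
#   pts = np.array([[0., 0., 0.], [1., 0., 0.], [0., 1., 0.]])
#   dm = DistanceMatrix.from_points(pts)  # wraps a 3x3 euclidean matrix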
def get_representative_jobs(df, kmeans):
    cluster_centers = kmeans.cluster_centers_
    for cent in cluster_centers:
        print('\nCluster Representations')
        dist = euclidean_distances(cent.reshape(1, -1), tfidf)
        order = np.argsort(dist)
        for o in order[0][:5]:
            title = df['Job_Title'].iloc[o]
            print(title)
def test_shuffle_equal(verbose):
    # for this data set there shouldn't be any equal distances,
    # and shuffle should make no difference
    X, _ = make_classification(random_state=12354)
    dist = euclidean_distances(X)
    skew_shuffle, skew_no_shuffle = \
        [Hubness(metric='precomputed', shuffle_equal=v, verbose=verbose)
         .fit(dist).score()
         for v in [True, False]]
    assert skew_no_shuffle == skew_shuffle
def _select_targets(X, y, k):
    target_neighbors = np.empty((X.shape[0], k), dtype=int)
    for label in np.unique(y):
        inds, = np.nonzero(y == label)
        dd = euclidean_distances(X[inds], squared=True)
        np.fill_diagonal(dd, np.inf)
        nn = np.argsort(dd)[..., :k]
        target_neighbors[inds] = inds[nn]
    return target_neighbors
def neighbor_test():
    from sklearn.metrics import euclidean_distances
    A = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    D = euclidean_distances(A)
    nbrs = NearestNeighbors(n_neighbors=2, metric='precomputed').fit(D)
    # The returned k-nearest neighbors do not include the query point itself.
    distance, knn = nbrs.kneighbors()
    print(distance)
    print(knn)
def calculateDistance(mean_images):
    length = len(mean_images)
    # allocate based on the actual number of images rather than a
    # hard-coded 10x10
    distance = np.zeros((length, length))
    for i in range(length):
        for j in range(length):
            a = mean_images[i].reshape(1, -1)
            b = mean_images[j].reshape(1, -1)
            distance[i, j] = euclidean_distances(a, b)
    return np.square(distance)
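# An equivalent vectorized sketch: euclidean_distances already computes the
# full pairwise matrix, and squared=True matches the np.square above, so the
# double loop can collapse to one call (assumes the mean images flatten to
# equal-length vectors):
def calculateDistanceVectorized(mean_images):
    import numpy as np
    from sklearn.metrics import euclidean_distances

    X = np.asarray([m.ravel() for m in mean_images])
    # squared pairwise distances in a single GEMM-backed call
    return euclidean_distances(X, squared=True)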
def showMDSAnalysis(X, q_class, n_components):
    similarities = euclidean_distances(X)
    print('similarities...')
    # similarities = 1 - chi2_kernel(X, gamma=.5)

    print('mds...')
    mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9,
                       dissimilarity="precomputed", n_jobs=1)
    pos = mds.fit(similarities).embedding_

    # print('nmds...')
    # nmds = manifold.MDS(n_components=2, metric=False, max_iter=3000,
    #                     eps=1e-12, dissimilarity="precomputed", n_jobs=1,
    #                     n_init=1)
    # npos = nmds.fit_transform(similarities, init=pos)

    clf = PCA(n_components=2)
    X = clf.fit_transform(X)
    pos = clf.fit_transform(pos)
    # npos = clf.fit_transform(npos)

    fig = plt.figure(1)
    ax = plt.axes([0., 0., 1., 1.])

    color = 'wmgbr'
    mark = 'ox+Ds'
    c = 0
    start = 0
    for n in q_class:
        end = start + n
        print(start, end)
        # plt.scatter(X[start:end, 0], X[start:end, 1], c=color[c],
        #             marker=mark[c])
        plt.scatter(pos[start:end, 0], pos[start:end, 1], s=20,
                    c=color[c], marker=mark[c])
        # plt.scatter(npos[start:end, 0], npos[start:end, 1], s=20,
        #             c=color[c], marker=mark[c])
        # plt.legend(('True position', 'MDS', 'NMDS'), loc='best')
        c += 1
        start = end
    plt.legend(('equation', 'photo', 'scheme', 'table', 'visualization'),
               loc='best')

    similarities = similarities.max() / similarities * 100
    similarities[np.isinf(similarities)] = 0
    plt.show()
    return pos
def dimension_reduction(df, sample_limit, category_feature, method="MDS",
                        n_components=3, n_jobs=2, whiten=True):
    '''
    Conduct multidimensional scaling or PCA.

    Inputs:
        df: dataframe
        sample_limit: restriction on the number of rows
        category_feature: feature used as a predefined label
        method: "MDS" for multidimensional scaling, "PCA" for principal
            component analysis
        n_components: the final number of dimensions
        n_jobs: parallel computing factor
        whiten: True to remove relative variance between components

    Returns:
        numpy array with n dimensions plus a label column, and an index
        mapping category to label
    '''
    if df.shape[0] > sample_limit:
        sub_df = df.sample(n=sample_limit).reset_index()
    else:
        sub_df = df.reset_index()

    used_columns = list(sub_df.columns)
    if category_feature:
        used_columns.remove(category_feature)
    # np.matrix is deprecated; a plain 2-D array behaves the same here
    sub_dfm = np.asarray(sub_df[used_columns])

    if method == "MDS":
        similarities = euclidean_distances(sub_dfm)
        mds = manifold.MDS(n_components=n_components, max_iter=3000,
                           eps=1e-9, dissimilarity='precomputed', n_jobs=1)
        pos = mds.fit(similarities).embedding_
    else:
        pca = PCA(n_components=n_components, copy=True, whiten=whiten)
        pos = pca.fit_transform(sub_dfm)
        print(pca.explained_variance_ratio_)

    # note: implicitly returns None when category_feature is falsy
    if category_feature:
        category_index = {}
        sub_df["label"] = 0
        for i, category in enumerate(
                sorted(list(sub_df[category_feature].unique()))):
            sub_df.loc[sub_df[category_feature] == category, "label"] = i
            category_index[category] = i
        new_pos = np.zeros((pos.shape[0], n_components + 1))
        new_pos[:, :-1] = pos
        new_pos[:, -1] = sub_df["label"]
        return new_pos, category_index
def predict(self, X):
    # Check if fit had been called
    check_is_fitted(self, ['X_', 'y_'])
    # Input validation
    X = check_array(X)

    closest = np.argmin(euclidean_distances(X, self.X_), axis=1)
    return self.y_[closest]
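# What the 1-nearest-neighbour ``predict`` above computes, as a standalone
# sketch with toy data (no estimator class needed):
#   import numpy as np
#   from sklearn.metrics import euclidean_distances
#   X_train = np.array([[0., 0.], [1., 1.]])
#   y_train = np.array([0, 1])
#   closest = np.argmin(euclidean_distances([[0.9, 1.2]], X_train), axis=1)
#   y_train[closest]  # -> array([1])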
def calculate_distance(gdf, norm):
    xy = np.asarray(gdf[['x', 'y']] * 10000)
    # pd.merge(gdf[geom_col].x, gdf[geom_col].y,
    #          left_index=True, right_index=True)
    spatial_distance = euclidean_distances(xy)
    norm_spatial_distance = preprocessing.normalize(spatial_distance,
                                                    norm=norm)

    t = np.asarray(gdf[['t']])
    temporal_distance = euclidean_distances(t)
    norm_temporal_distance = preprocessing.normalize(temporal_distance,
                                                     norm=norm)

    c = np.asarray(gdf['c'])
    vectorizer = TfidfVectorizer()
    c_vect = vectorizer.fit_transform(c)
    content_distance = np.absolute(cosine_distances(c_vect))
    norm_content_distance = preprocessing.normalize(content_distance,
                                                    norm=norm)

    distances = (alpha * norm_spatial_distance
                 + beta * norm_content_distance
                 + gama * norm_temporal_distance)
    return distances
def evaluate(x, z, hyp):
    if len(x.shape) == 1:
        x = x.reshape(1, -1)
    if len(z.shape) == 1:
        z = z.reshape(1, -1)
    ell = np.exp(hyp[0])      # length-scale
    sf2 = np.exp(2 * hyp[1])  # signal variance
    K = euclidean_distances(x / ell, z / ell, squared=True)  # (x-z)^T (x-z)
    return sf2 * np.exp(-K / 2)
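# The kernel above is the squared-exponential (RBF) covariance
#   k(x, z) = sf2 * exp(-||x - z||**2 / (2 * ell**2)),
# with hyp = [log(ell), log(sf)]. A quick sanity check with illustrative
# values:
#   import numpy as np
#   hyp = np.log([1.0, 1.0])                           # ell = 1, sf = 1
#   evaluate(np.array([[0.]]), np.array([[1.]]), hyp)  # exp(-0.5) ~ 0.607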
def test_equal_similarities_and_preferences():
    # Unequal distances
    X = np.array([[0, 0], [1, 1], [-2, -2]])
    S = -euclidean_distances(X, squared=True)

    assert not _equal_similarities_and_preferences(S, np.array(0))
    assert not _equal_similarities_and_preferences(S, np.array([0, 0]))
    assert not _equal_similarities_and_preferences(S, np.array([0, 1]))

    # Equal distances
    X = np.array([[0, 0], [1, 1]])
    S = -euclidean_distances(X, squared=True)

    # Different preferences
    assert not _equal_similarities_and_preferences(S, np.array([0, 1]))

    # Same preferences
    assert _equal_similarities_and_preferences(S, np.array([0, 0]))
    assert _equal_similarities_and_preferences(S, np.array(0))
def predict(self, xtest):
    """Predict method."""
    # Check if fit had been called
    check_is_fitted(self, ['xtrain_', 'ytrain_'])
    # Input validation
    xtest = check_array(xtest)

    closest = np.argmin(euclidean_distances(xtest, self.xtrain_), axis=1)
    return self.ytrain_[closest]
def getDisMatrixEuclidean(meanList):
    eucDisMat = []
    for i in range(len(meanList)):
        eucDisMat.append([])
        for j in range(len(meanList)):
            dis = sum(euclidean_distances(meanList[i].reshape(1, -1),
                                          meanList[j].reshape(1, -1)))[0]
            eucDisMat[i].append(dis * dis)
    return eucDisMat
def getvec(self, s1, s2):
    vect = CountVectorizer(token_pattern='(?u)\\b\\w+\\b').fit([s1, s2])
    v1, v2 = vect.transform([s1, s2])
    v1 = v1.toarray().ravel()
    v2 = v2.toarray().ravel()
    # get_feature_names() was removed in scikit-learn 1.2
    w = numpy.array([self.model[w] for w in vect.get_feature_names_out()])
    d = euclidean_distances(w)
    d = d.astype(numpy.double)
    d /= d.max()
    return v1, v2, d
def f(R, *params):
    thetaxm, thetaym, thetazm, thetaxp, thetayp, thetazp = R
    d, X, Y, distances = params
    Rm = rotation(thetaxm, thetaym, thetazm)
    Rp = rotation(thetaxp, thetayp, thetazp)

    Xr = Rm.dot(X.T).T
    Yr = Rp.dot(Y.T).T + np.tile([d, 0, 0], (Y.shape[0], 1))
    dis = euclidean_distances(Xr, Yr)
    obj = 1. / (distances ** 2) * ((dis - distances) ** 2)
    return obj[np.invert(np.isnan(obj) | np.isinf(obj))].sum()
def eval_stress(X, user_data=None):
    """Compute the raw stress between embedded and target distances."""
    if VERBOSE:
        print("Computing stress: eval_stress")
    m, n, distances, alpha, beta, d = user_data
    X = X.reshape((m, n))
    dis = euclidean_distances(X)
    stress = ((dis - distances) ** 2)[distances != 0].sum()
    return stress
def get_derivative(self, X, Y, P, Q, P0, beta):
    Dy = euclidean_distances(Y)
    H = hessian_y_matrix_fast(Dy, P, Q, Y)
    J = derivative_X_matrix_fast(X, Y, Dy, beta, P0)
    self.H = H
    self.J = J
    Pxy = Jxy(H, J)
    self.P = Pxy
    return Pxy
def cal_sim():
    for i in range(len(tasks)):
        for j in range(i, len(tasks)):
            print(tasks[i], tasks[j],
                  cosine_similarity([domain_embedding[i],
                                     domain_embedding[j]]))
            print(tasks[i], tasks[j],
                  euclidean_distances([domain_embedding[i],
                                       domain_embedding[j]]))
def poisson_lambda(x, cdis, beta, alpha, bias=None):
    d = euclidean_distances(x)
    n = int(x.shape[0] / 2)
    # set inter distance to centroid distance
    d[:n, n:] = cdis
    d[n:, :n] = cdis
    if bias is None:
        bias = np.ones(d.shape[0], dtype=float)
    lambda_mat = (beta * d ** alpha) * np.outer(bias, bias)
    return lambda_mat.astype(float)
def spanning_tree_length(X):
    """Compute the length of the euclidean MST of X.

    Parameters
    ----------
    X : ndarray, shape=[n_samples, n_features]
    """
    if X.shape[0] < 2:
        return 0
    return minimum_spanning_tree(euclidean_distances(X)).sum()
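# A minimal usage sketch for spanning_tree_length, assuming
# ``minimum_spanning_tree`` is scipy.sparse.csgraph.minimum_spanning_tree
# (it accepts the dense distance matrix and returns a sparse MST):
#   import numpy as np
#   X = np.array([[0., 0.], [1., 0.], [0., 1.]])
#   spanning_tree_length(X)  # two unit-length edges -> 2.0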
def calculate_costvalue(dists, red_dists):
    """Only for testing: mean absolute error between the original distance
    matrix and the distances recomputed from the reduced-space coordinates.
    """
    low_dists = euclidean_distances(red_dists)
    n_conf = dists.shape[0]

    costvalue = []
    for i in range(n_conf - 1):
        for j in range(i + 1, n_conf):
            costvalue.append(abs(dists[i][j] - low_dists[i][j]))
    costvalue = sum(costvalue) / len(costvalue)
    return costvalue
def find_closest(in_vector, proto_vectors):
    closest = None
    closest_distance = float('inf')  # safer than a magic large number
    for p_v in proto_vectors:
        # reshape(1, -1) avoids hard-coding the vector length
        distance = euclidean_distances(in_vector.reshape(1, -1),
                                       p_v.p_vector.reshape(1, -1))
        if distance < closest_distance:
            closest_distance = distance
            closest = p_v
    return closest
def predict(self, X):
    print('Predict', len(X))
    # Check if fit had been called
    check_is_fitted(self, ['p5p_'])
    # Input validation
    X = check_array(X)

    closest = np.argmin(euclidean_distances(X, self.X_), axis=1)
    # print(closest, self.p5p_, self.y_[0, 0])
    return self.y_[closest]