def test_paired_distances():
    """Test the paired_distances helper function."""
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((5, 4))
    for metric, func in iteritems(PAIRED_DISTANCES):
        S = paired_distances(X, Y, metric=metric)
        S2 = func(X, Y)
        assert_array_almost_equal(S, S2)
        if metric in PAIRWISE_DISTANCE_FUNCTIONS:
            # Check that the pairwise_distances implementation
            # gives the same value
            distances = PAIRWISE_DISTANCE_FUNCTIONS[metric](X, Y)
            distances = np.diag(distances)
            assert_array_almost_equal(distances, S)

    # Check the callable implementation
    S = paired_distances(X, Y, metric='manhattan')
    S2 = paired_distances(X, Y, metric=lambda x, y: np.abs(x - y).sum(axis=0))
    assert_array_almost_equal(S, S2)

    # Test that a ValueError is raised when the lengths of X and Y differ
    Y = rng.random_sample((3, 4))
    assert_raises(ValueError, paired_distances, X, Y)
def movement_df(self, other, metric="euclidean"):
    """
    Creates a dataframe that shows the movement from one embeddingset to another one.

    Arguments:
        other: the other embeddingset to compare against; only the overlap is kept
        metric: metric used to calculate movement; must be scipy- or sklearn-compatible

    Usage:

    ```python
    from whatlies.language import SpacyLanguage

    lang = SpacyLanguage("en_core_web_sm")
    names = ['red', 'blue', 'green', 'yellow',
             'cat', 'dog', 'mouse', 'rat',
             'bike', 'car']
    emb = lang[names]
    emb_ort = lang[names] | lang['cat']
    emb.movement_df(emb_ort)
    ```
    """
    overlap = list(
        set(self.embeddings.keys()).intersection(set(other.embeddings.keys())))
    mat1 = np.array([w.vector for w in self[overlap]])
    mat2 = np.array([w.vector for w in other[overlap]])
    return (pd.DataFrame({
        "name": overlap,
        "movement": paired_distances(mat1, mat2, metric)
    }).sort_values(["movement"], ascending=False).reset_index())
def compute_distances(vectorizer, data, vectorizer_name="", train=True,
                      col1_name="question1", col2_name="question2"):
    vector1 = vectorizer.transform(data[col2_name].values)
    vector2 = vectorizer.transform(data[col1_name].values)
    print(vectorizer_name, " data prepared")
    for distance_type in DISTANCES:
        metric_name = "distance_{0}_{1}".format(vectorizer_name, distance_type)
        metric_values = np.array([])
        if distance_type in ['cosine', 'l1', 'l2']:
            metric_values = paired_distances(vector1, vector2,
                                             metric=distance_type)
        else:
            metric = lambda x, y: pairwise_distances(
                x.reshape(1, -1), y.reshape(1, -1), metric=distance_type)
            for el1, el2 in zip(vector1, vector2):
                metric_values = np.append(metric_values,
                                          metric(el1.toarray(), el2.toarray()))
        if distance_type in ['canberra', 'l1']:
            if train:
                print(metric_name)
                metric_values = fit_min_max_scale(
                    metric_values.reshape(-1, 1)).flatten()
            else:
                metric_values = min_max_scale(metric_values.reshape(-1, 1),
                                              metric_name)
        data[metric_name] = metric_values
        print("metric {0}, minimum {1:.4f}, maximum {2:.4f}".format(
            distance_type, np.min(metric_values), np.max(metric_values)))
def transform_utterance(self, utt):
    """
    Computes vector representations, ranges, and cluster assignments for an
    utterance, using the two `ExpectedContextModelTransformer` instances. Also
    computes utterance-level orientation and shift. Note that the utterance
    must contain the input representation as a metadata field, specified by
    what was passed into the constructor as the `vect_field` argument. Will
    write all of these characterizations (including vectors) to the
    utterance's metadata.

    :param utt: Utterance
    :return: the utterance, with per-utterance attributes.
    """
    utt = self.ec_models[0].transform_utterance(utt)
    utt = self.ec_models[1].transform_utterance(utt)
    if self.wrapper_output_prefix == '':
        orn_field = 'orn'
        shift_field = 'shift'
    else:
        orn_field = self.wrapper_output_prefix + '_orn'
        shift_field = self.wrapper_output_prefix + '_shift'
    utt.meta[orn_field] = utt.meta[self.output_prefixes[0] + '_range'] \
        - utt.meta[self.output_prefixes[1] + '_range']
    utt.meta[shift_field] = float(
        paired_distances(
            np.array([utt.meta[self.output_prefixes[0] + '_repr']]),
            np.array([utt.meta[self.output_prefixes[1] + '_repr']]))[0])
    return utt
def transform(self, corpus, selector=lambda x: True):
    """
    Computes vector representations, ranges, and cluster assignments for
    utterances in a corpus, using the two `ExpectedContextModelTransformer`
    instances. Also computes utterance-level orientation and shift.

    :param corpus: Corpus
    :param selector: a boolean function of signature `filter(utterance)` that
        determines which utterances to transform; defaults to all utterances.
    :return: the Corpus, with per-utterance attributes.
    """
    self.ec_models[0].transform(corpus, selector=selector)
    self.ec_models[1].transform(corpus, selector=selector)
    if self.wrapper_output_prefix == '':
        orn_field = 'orn'
        shift_field = 'shift'
    else:
        orn_field = self.wrapper_output_prefix + '_orn'
        shift_field = self.wrapper_output_prefix + '_shift'
    for ut in corpus.iter_utterances(selector=selector):
        ut.meta[orn_field] = ut.meta[self.output_prefixes[0] + '_range'] \
            - ut.meta[self.output_prefixes[1] + '_range']
    utt_shifts = paired_distances(
        corpus.get_vectors(self.output_prefixes[0] + '_repr'),
        corpus.get_vectors(self.output_prefixes[1] + '_repr'))
    for id, shift in zip(
            corpus.get_vector_matrix(self.output_prefixes[0] + '_repr').ids,
            utt_shifts):
        corpus.get_utterance(id).meta[shift_field] = shift
def test_encoder(metric='euclidean'):
    root = '../../../data/AIC20_track3/train/S03'
    cams = ['c010', 'c011', 'c012', 'c013', 'c014', 'c015']

    detections = {}
    cap = {}
    for cam in cams:
        frame_detections = defaultdict(list)
        for det in parse_annotations_from_txt(
                os.path.join(root, cam, 'mtsc', 'mtsc_tc_mask_rcnn.txt')):
            if det.height >= 128 and det.width >= 128:
                frame_detections[det.frame].append(det)
        detections[cam] = frame_detections
        cap[cam] = cv2.VideoCapture(os.path.join(root, cam, 'vdo.avi'))

    def random_detection(cam=None, id=None):
        if cam is None:
            cam = np.random.choice(cams)
        if id is None:
            frame = np.random.choice(list(detections[cam].keys()))
            det = np.random.choice(detections[cam][frame])
        else:
            for frame in np.random.permutation(list(detections[cam].keys())):
                found = False
                for det in detections[cam][frame]:
                    if det.id == id:
                        found = True
                        break
                if found:
                    break
            else:
                raise ValueError(f'id {id} not found in cam {cam}')
        cap[cam].set(cv2.CAP_PROP_POS_FRAMES, det.frame)
        ret, img = cap[cam].read()
        img = img[int(det.ytl):int(det.ybr), int(det.xtl):int(det.xbr)]
        return img, (cam, det.id)

    encoder = Encoder(path='../metric_learning/checkpoints/epoch_19__ckpt.pth')
    print(encoder)
    encoder.eval()

    pairs = [(('c010', 15), ('c011', 29)), None]
    for p in pairs:
        if p is not None:
            img1, info1 = random_detection(*p[0])
            img2, info2 = random_detection(*p[1])
        else:
            img1, info1 = random_detection()
            img2, info2 = random_detection()

        embd1 = encoder.get_embedding(img1)
        embd2 = encoder.get_embedding(img2)
        dist = paired_distances([embd1], [embd2], metric).squeeze()
        print(dist)

        cv2.imshow('{}:{}'.format(*info1), img1)
        cv2.imshow('{}:{}'.format(*info2), img2)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
def cost_func(x0, *args) -> float:
    """Cost function to optimize weights from which the best trajectory for a
    joint is calculated.

    :param x0: 3 lambda weights for linear combination of marker vectors to
        retrieve joint location.
    :type x0: numpy.ndarray
    :param args: marker trajectories matrix, marker indices belonging to rigid
        body 1 & rigid body 2, distance penalty weight factor.
    :type args: tuple
    :return: cost
    :rtype: float
    """
    trajectories = args[0]
    rigid1_indices = args[1]
    rigid2_indices = args[2]
    penalty = float(args[3])
    # First, construct the joint trajectory from rigid body 1 and weights.
    j = joint_from_markers(trajectories[:, rigid1_indices, :], x0)
    all_marker_indices = rigid1_indices + rigid2_indices
    # Todo: Is there a faster way? Distances of all markers to joint in
    # parallel. Or use n_jobs for speedup?
    # Then calculate cost q.
    distances_to_joint = np.array([
        paired_distances(t, j, n_jobs=-1)
        for t in np.swapaxes(trajectories[:, all_marker_indices], 0, 1)
    ])
    mean_distances = np.mean(distances_to_joint, axis=1)
    var_distances = np.var(distances_to_joint, axis=1)
    q = (var_distances
         + penalty * mean_distances).sum() / len(all_marker_indices)
    return q
def find_shot_time(data):
    num = np.unique(data['shot_num'])
    for u in range(0, 1):
        one_shot = data.loc[data['shot_num'] == num[u]]
        orig_shot_time = one_shot.loc[one_shot['shot_ind'] == 1]['game_clock'].item()
        shooter = one_shot.iloc[0]['shooter']
        time_range = one_shot.loc[
            (one_shot['game_clock'] <= orig_shot_time)
            & (one_shot['game_clock'] >= orig_shot_time - 500)]
        player_ball = time_range.loc[(time_range['player_name'] == 'ball')
                                     | (time_range['player_name'] == shooter)]
        moms = np.unique(player_ball['moment'])
        time_dist = pd.DataFrame(columns=['moment', 'distance'],
                                 index=np.arange(len(moms)))
        for p in range(0, len(moms)):
            sub = player_ball.loc[player_ball['moment'] == moms[p]]
            player_x = sub.loc[sub['player_name'] == shooter, ]['x_loc'].item()
            player_y = sub.loc[sub['player_name'] == shooter, ]['y_loc'].item()
            ball_x = sub.loc[sub['player_name'] == 'ball', ]['x_loc'].item()
            ball_y = sub.loc[sub['player_name'] == 'ball', ]['y_loc'].item()
            # paired_distances expects 2D arrays (one row per sample), so wrap
            # each point in an outer list and take the single distance.
            dist = paired_distances([[player_x, player_y]],
                                    [[ball_x, ball_y]])[0]
            time_dist.loc[p, 'moment'] = moms[p]
            time_dist.loc[p, 'distance'] = dist
        return time_dist
def fitness_manhattan_similarity_avg(catalog_matrix, exposure_data, original_data):
    new_matrix = np.dot(np.array(catalog_matrix), np.array(exposure_data))
    similarity = paired_distances(new_matrix.reshape(-1, 1),
                                  np.array(original_data).reshape(-1, 1),
                                  metric='manhattan')
    return -np.average(similarity)
def fitness_manhattan_similarity_sum(catalog_matrix, exposure_data, original_data):
    new_matrix = np.dot(np.array(catalog_matrix), np.array(exposure_data))
    similarity = paired_distances(new_matrix.reshape(1, -1),
                                  np.array(original_data).reshape(1, -1),
                                  metric='manhattan')
    return -np.sum(similarity)
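# Illustrative sketch (made-up data, assuming the imports below) contrasting
# the two fitness helpers above: reshape(-1, 1) gives paired_distances one
# scalar sample per row, so it returns one |a_i - b_i| per element, while
# reshape(1, -1) treats the whole vector as a single sample and returns one
# distance.
import numpy as np
from sklearn.metrics.pairwise import paired_distances

a = np.array([1.0, 2.0, 3.0])
b = np.array([1.5, 2.5, 2.0])

per_element = paired_distances(a.reshape(-1, 1), b.reshape(-1, 1),
                               metric='manhattan')
print(per_element)   # [0.5 0.5 1. ] -- averaged by the _avg variant

single_pair = paired_distances(a.reshape(1, -1), b.reshape(1, -1),
                               metric='manhattan')
print(single_pair)   # [2.] -- the _sum variant collapses to one distance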
def getDistances(x, y):
    distances = {}
    distances['mae'] = mean_absolute_error(x, y)
    distances['mse'] = mean_squared_error(x, y)
    distances['euclidean'] = np.mean(paired_distances(x, y, metric='euclidean'))
    distances['manhattan'] = np.mean(paired_distances(x, y, metric='manhattan'))
    distances['cosine'] = np.mean(paired_distances(x, y, metric='cosine'))
    for key in distances:
        distances[key] = round(distances[key], 5)
    return distances
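# Quick usage sketch for getDistances with random stand-in data; assumes the
# same imports as the snippet above (numpy, sklearn's paired_distances,
# mean_absolute_error and mean_squared_error). Each value is a scalar
# error/distance summary of the two arrays.
import numpy as np

rng = np.random.RandomState(0)
x = rng.rand(5, 3)
y = rng.rand(5, 3)
print(getDistances(x, y))  # {'mae': ..., 'mse': ..., 'euclidean': ..., ...}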
def findClosestCentroids(X, centroids):
    idx = np.zeros((len(X),))
    for i in range(len(X)):
        # Tile the i-th sample so it can be compared against every centroid.
        x = np.repeat(X[i].reshape(-1, 1), len(centroids), axis=1).T
        eucl_dist = paired_distances(x, centroids)
        idx[i] = np.argmin(eucl_dist)
    return idx
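# Hedged alternative to findClosestCentroids above: sklearn's
# pairwise_distances_argmin computes the same nearest-centroid assignment in
# one vectorized call, without tiling each sample in a Python loop. Data here
# is illustrative.
import numpy as np
from sklearn.metrics import pairwise_distances_argmin

X = np.random.RandomState(0).rand(10, 2)
centroids = np.array([[0.2, 0.2], [0.8, 0.8]])
idx = pairwise_distances_argmin(X, centroids)  # shape (10,), nearest-centroid index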
def delaunay_graph(X, weighted=False):
    '''Delaunay triangulation graph.'''
    e1, e2 = _delaunay_edges(X)
    pairs = np.column_stack((e1, e2))
    w = paired_distances(X[e1], X[e2]) if weighted else None
    return Graph.from_edge_pairs(pairs, num_vertices=X.shape[0],
                                 symmetric=True, weights=w)
def test_paired_distances_callable():
    # Test the paired_distances helper function
    # with the callable implementation
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((5, 4))

    S = paired_distances(X, Y, metric='manhattan')
    S2 = paired_distances(X, Y, metric=lambda x, y: np.abs(x - y).sum(axis=0))
    assert_array_almost_equal(S, S2)

    # Test that a ValueError is raised when the lengths of X and Y differ
    Y = rng.random_sample((3, 4))
    assert_raises(ValueError, paired_distances, X, Y)
def gabriel_graph(X, metric='euclidean'):
    a, b = np.triu_indices(X.shape[0], k=1)
    midpoints = (X[a] + X[b]) / 2
    Dmid = pairwise_distances(midpoints, X, metric=metric).min(axis=1)
    Dedge = paired_distances(X[a], X[b], metric=metric)
    mask = (Dedge - Dmid * 2) < 1e-10
    pairs = np.transpose((a[mask], b[mask]))
    return Graph.from_edge_pairs(pairs, num_vertices=X.shape[0],
                                 symmetric=True)
def movement_df(self, other, metric="euclidean"):
    # Keep only keys present in both embeddingsets; a union would fail when
    # indexing self[overlap] with keys missing from one side.
    overlap = list(
        set(self.embeddings.keys()).intersection(set(other.embeddings.keys())))
    mat1 = np.array([w.vector for w in self[overlap]])
    mat2 = np.array([w.vector for w in other[overlap]])
    return pd.DataFrame({
        'name': overlap,
        'movement': paired_distances(mat1, mat2, metric)
    }).sort_values(['movement'], ascending=False).reset_index()
def get_average_displacement(df):
    if len(df) > 1:
        test = paired_distances(
            df[[self.identifiers[0], self.identifiers[1]]].iloc[1:, :],
            df[[self.identifiers[0], self.identifiers[1]]].shift().iloc[1:, :])
        return np.mean(test)
    return None
def analogy_solver(man, woman, king, W, top_n=5, return_score=False):
    """
    In the famous "man is to woman as king is to queen" example, queen is the
    word w that maximizes: cos(w, king) - cos(w, man) + cos(w, woman).
    """
    A = np.array([king] * len(W))
    B = np.array([man] * len(W))
    Y = np.array([woman] * len(W))
    # paired_distances returns cosine *distance* (1 - similarity), so
    # minimizing this score maximizes the cosine objective above.
    score = (paired_distances(W, A, 'cosine')
             - paired_distances(W, B, 'cosine')
             + paired_distances(W, Y, 'cosine'))
    sorted_score = score.argsort()
    if not return_score:
        return sorted_score[:top_n]
    return sorted_score[:top_n], score[sorted_score][:top_n]
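# Minimal usage sketch for analogy_solver; the vocabulary matrix and vectors
# here are random stand-ins, not real embeddings. Assumes numpy and
# paired_distances are in scope as in the snippet above.
import numpy as np

rng = np.random.RandomState(0)
W = rng.rand(1000, 50)            # hypothetical vocabulary of 1000 vectors
man, woman, king = W[1], W[2], W[3]
top_idx, scores = analogy_solver(man, woman, king, W, top_n=5,
                                 return_score=True)
print(top_idx)                    # indices of the 5 best analogy candidates
print(scores)                     # lower score = better analogy fit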
def gabriel_graph(X, metric='euclidean', weighted=False):
    n = X.shape[0]
    a, b = np.triu_indices(n, k=1)
    midpoints = (X[a] + X[b]) / 2
    _, Dmid = pairwise_distances_argmin_min(midpoints, X, metric=metric)
    Dedge = paired_distances(X[a], X[b], metric=metric)
    mask = (Dedge - Dmid * 2) < 1e-10
    pairs = np.column_stack((a[mask], b[mask]))
    w = Dedge[mask] if weighted else None
    return Graph.from_edge_pairs(pairs, num_vertices=n, symmetric=True,
                                 weights=w)
def _converged(self, old_centers, iteration):
    if old_centers is None:
        return False
    diff = np.sum(paired_distances(self.centers, old_centers))
    if self.verbose:
        print("Iteration %s - Convergence crit. = %s" % (iteration, diff))
    return diff < self.tol**2 or iteration >= self.max_iter
def embedding_distance_bulk(
        embeddings1: Embedding,
        embeddings2: Embedding,
        distance_metric: DistanceMetric) -> np.ndarray:
    """Computes the distance between corresponding rows of two arrays of
    embeddings."""
    if distance_metric == DistanceMetric.EUCLIDEAN_SQUARED:
        return np.square(
            paired_distances(embeddings1, embeddings2, metric='euclidean'))
    elif distance_metric == DistanceMetric.ANGULAR_DISTANCE:
        # Angular Distance: https://en.wikipedia.org/wiki/Cosine_similarity
        similarity = 1 - paired_distances(
            embeddings1, embeddings2, metric='cosine')
        return np.arccos(similarity) / math.pi
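# Minimal usage sketch for embedding_distance_bulk (random data; the
# DistanceMetric enum is assumed to come from the surrounding module). Each
# row of the first array is compared only against the matching row of the
# second.
import numpy as np

rng = np.random.RandomState(0)
e1 = rng.rand(4, 128)
e2 = rng.rand(4, 128)
d = embedding_distance_bulk(e1, e2, DistanceMetric.ANGULAR_DISTANCE)
# d has shape (4,); angular distance lies in [0, 1]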
def calculate(self, pairs: Iterable[Pair]) -> np.ndarray:
    embeddings1 = []
    embeddings2 = []
    for pair in pairs:
        embeddings1.append(pair.image1)
        embeddings2.append(pair.image2)
    if self._distance_metric == DistanceMetric.EUCLIDEAN_SQUARED:
        return np.square(
            paired_distances(embeddings1, embeddings2, metric='euclidean'))
    if self._distance_metric == DistanceMetric.ANGULAR_DISTANCE:
        # Angular Distance: https://en.wikipedia.org/wiki/Cosine_similarity
        similarity = 1 - paired_distances(
            embeddings1, embeddings2, metric='cosine')
        return np.arccos(similarity) / math.pi
    metrics = [str(metric) for metric in DistanceMetric]
    err = f"Undefined {DistanceMetric.__qualname__}. Choose from {metrics}"
    raise DistanceMetricException(err)
def node_feature_similarities(node_features, sources, sinks):
    similarities = []
    row_features = node_features[sources]
    col_features = node_features[sinks]
    for metric in [cosine, euclidean, rbf]:
        similarities.append(
            paired_distances(row_features, col_features, metric=metric))
    similarities = np.array(similarities).T
    logger.info(f'node_feature_similarities generated: {similarities.shape}')
    return similarities
def vertex_connectivity(self, surface: Surface, mode: str = "sparse",
                        metric: Optional[str] = None, symmetric: bool = False,
                        verts_mask: Union[numpy.ndarray, list] = None) \
        -> Union[numpy.ndarray, scipy.sparse.csr.csr_matrix]:
    """
    It computes a sparse matrix of the connectivity among the vertices of a
    surface.

    :param surface: input surface object
    :param mode: "sparse" by default or "2D"
    :param metric: None by default, could be "euclidean"
    :param symmetric: True for symmetric matrix output
    :param verts_mask: a mask to apply the method to a sub-surface of the
        original surface
    :return: the computed matrix.
    """
    if verts_mask is not None:
        (vertices, triangles) = self.extract_subsurf(
            surface, verts_mask, output='verts_triangls')[:2]
    else:
        vertices = surface.vertices
        triangles = surface.triangles
    # Get all pairs of vertex indexes (i.e., edges) that appear in each
    # face (triangle)
    edges = numpy.r_[triangles[:, [0, 1]], triangles[:, [1, 2]],
                     triangles[:, [2, 0]]]
    # Remove repetitions
    edges = numpy.vstack(list(set(map(tuple, edges))))
    # Mark all existing pairs to 1
    n_v = vertices.shape[0]
    n_e = edges.shape[0]
    # For symmetric output...
    if symmetric:
        # ...create for the moment the "double" edges
        edges2 = numpy.r_[edges, edges[:, [1, 0]]]
    if metric is None:
        # For symmetric output...
        if symmetric:
            # ...remove repetitions of edges2
            edges = numpy.vstack(list(set(map(tuple, edges2))))
            n_e = edges.shape[0]
        con = csr_matrix(
            (numpy.ones((n_e,)), (edges[:, 0], edges[:, 1])),
            shape=(n_v, n_v))
        if mode != "sparse":
            # Create non-sparse matrix
            con = con.todense()
    else:
        d = paired_distances(vertices[edges[:, 0]], vertices[edges[:, 1]],
                             metric)
        # For symmetric output...
        if symmetric:
            # ...double also d
            d = numpy.r_[d, d]
            edges = edges2
        # Create sparse matrix
        con = csr_matrix(
            (d, (edges[:, 0], edges[:, 1])), shape=(n_v, n_v))
        if mode != "sparse":
            # Create non-sparse matrix
            con = con.todense()
    return con
def fit(self, attributes0, label0):
    attributes = np.array(attributes0)
    label = np.array(label0)
    self.clustering_model.fit(attributes)
    pred = self.clustering_model.predict(attributes)
    lab = pd.Series(label, name='lab', dtype=int)
    pre = pd.Series(pred, name='pre', dtype=int)
    kmcomp = pd.concat([lab, pre], axis=1)
    kmc1 = kmcomp[kmcomp['lab'] == 1]
    kmc0 = kmcomp[kmcomp['lab'] == 0]
    sta1 = kmc1['pre'].groupby(kmc1['pre']).count()
    sta0 = kmc0['pre'].groupby(kmc0['pre']).count()
    align = pd.concat([sta1 / len(kmc1), sta0 / len(kmc0)], axis=1).fillna(0)
    align.columns = ['sta1a', 'sta0a']
    sta1a = align['sta1a']
    sta0a = align['sta0a']
    dif = sta1a / (sta1a + sta0a)
    result = pd.concat(
        [pd.concat([sta1, sta0, sta1a, sta0a], axis=1).fillna(0), dif],
        axis=1)
    result.columns = ['sta1', 'sta0', 'sta1a', 'sta0a', 'dif']
    resee = result.sort_values('dif', ascending=False)
    resee['cumsta1a'] = np.cumsum(resee['sta1a'])
    resee['cumsta0a'] = np.cumsum(resee['sta0a'])
    resee['cumsta1'] = np.cumsum(resee['sta1'])
    resee['cumsta0'] = np.cumsum(resee['sta0'])
    # Distance of each sample to its assigned cluster center.
    kmcomp['dis'] = pd.Series(
        map(
            lambda x1, x2: paired_distances(
                x1.reshape(1, -1),
                self.clustering_model.cluster_centers_[x2].reshape(1, -1))[0],
            attributes, pred))
    distance_max = kmcomp['dis'].groupby(
        kmcomp['pre']).max().rename('distance_max')
    distance_mean = kmcomp['dis'].groupby(
        kmcomp['pre']).mean().rename('distance_mean')
    distances = pd.concat([distance_max, distance_mean], axis=1)
    resee = pd.merge(resee, pd.DataFrame(distances),
                     left_index=True, right_index=True)
    self.comparison_summary = resee
    self.ordered_centers = pd.DataFrame(
        self.clustering_model.cluster_centers_).iloc[resee.index]
def getdd(self, tx, ty):
    # Calculate distances after padding the shorter array with its last value.
    gap = tx.shape[0] - ty.shape[0]
    if gap > 0:
        ty = np.pad(ty, ((0, gap), (0, 0)), 'edge')
    elif gap < 0:
        tx = np.pad(tx, ((0, -gap), (0, 0)), 'edge')
    # Use any distance metric that you would like.
    return paired_distances(tx, ty, metric='l2').sum()
def statistic(centers, labels, vectors):
    times = dict(zip(*np.unique(labels, return_counts=True)))
    for i in times:
        time = times[i]
        var = 0
        total = vectors[0] * 0
        num = 0
        for index, label in enumerate(labels):
            if label == i:
                num += 1
                total += vectors[index]
        mean = total / num
        for index, label in enumerate(labels):
            if label == i:
                var += paired_distances(vectors[index].reshape(1, -1),
                                        mean.reshape(1, -1))
        var /= num
        norm = paired_distances(centers[i].reshape(1, -1),
                                (centers[i] * 0).reshape(1, -1))
        times[i] = [time, var, norm]
    return times
def getdd(tx, ty):
    # Calculate distances after padding the shorter array with its last value.
    gap = tx.shape[0] - ty.shape[0]
    if gap > 0:
        ty = np.pad(ty, ((0, gap), (0, 0)), 'edge')
    elif gap < 0:
        tx = np.pad(tx, ((0, -gap), (0, 0)), 'edge')
    # Use any distance metric that you would like.
    return paired_distances(tx, ty, metric='l2').sum()
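# Small demo of getdd with illustrative trajectories of unequal length: ty is
# edge-padded to three rows ([0,1], [1,1], [1,1]) before the summed L2
# distance is taken, giving 1 + 1 + sqrt(2) ~= 3.41. Assumes numpy and
# paired_distances are in scope as above.
import numpy as np

tx = np.array([[0.0, 0.0], [1.0, 0.0], [2.0, 0.0]])
ty = np.array([[0.0, 1.0], [1.0, 1.0]])
print(getdd(tx, ty))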
def distance_cluster(image_feats, clusters):
    X = []
    len_clusters = clusters.shape[0]
    # For each row, calculate its distance to each k-cluster, find the cluster
    # with the smallest distance, and build a matrix of those cluster indices.
    for e in image_feats:
        row_matrix = np.tile(e, (len_clusters, 1))
        distances = paired_distances(row_matrix, clusters)
        X.append(np.argmin(distances))
    return X
def urquhart_graph(X, weighted=False):
    '''Urquhart graph: made from the 2 shortest edges of each Delaunay
    triangle.'''
    e1, e2 = _delaunay_edges(X)
    w = paired_distances(X[e1], X[e2])
    # Drop the longest edge of each triangle (edges come in groups of 3).
    mask = np.ones_like(w, dtype=bool)
    bad_inds = w.reshape((-1, 3)).argmax(axis=1) + np.arange(0, len(e1), 3)
    mask[bad_inds] = False
    weights = w[mask] if weighted else None
    pairs = np.column_stack((e1[mask], e2[mask]))
    return Graph.from_edge_pairs(pairs, num_vertices=X.shape[0],
                                 symmetric=True, weights=weights)
def _distance_between_embeddings(embeddings1: np.ndarray,
                                 embeddings2: np.ndarray) -> np.ndarray:
    # if distance_metric == DistanceMetric.EUCLIDEAN_SQUARED:
    #     return np.square(
    #         paired_distances(embeddings1, embeddings2, metric='euclidean'))
    # elif distance_metric == DistanceMetric.ANGULAR_DISTANCE:
    # Angular Distance: https://en.wikipedia.org/wiki/Cosine_similarity
    similarity = 1 - paired_distances(embeddings1, embeddings2,
                                      metric='cosine')
    return np.arccos(similarity) / math.pi
def K_initialize(vectors, k, zero_ini):
    n = vectors.shape[0]
    d = vectors.shape[1]
    labels_dist = np.zeros((n, 2))
    centers = np.zeros((k, d))
    # np.random.seed(0)
    # ini = np.random.randint(n)
    ini = zero_ini
    for i, item in enumerate(labels_dist):
        item[0] = 0
        item[1] = paired_distances(vectors[ini].reshape((1, -1)),
                                   vectors[i].reshape((1, -1)))[0]
    centers[0] = vectors[ini]
    return labels_dist, centers
def evaluate(self, sess, X_test, Y_test, n_test, dropout):
    total_pred = np.zeros(Y_test.shape)
    if self.n_neurons_aleat_unc > 1:
        total_aleat_unc_pred = np.zeros(Y_test.shape)
    else:
        total_aleat_unc_pred = np.zeros(len(Y_test))
    total_loss = []
    for batch_idx, batch in enumerate(range(0, n_test, self.batch_size)):
        start_idx = batch
        end_idx = batch + self.batch_size
        x = X_test[start_idx:end_idx]
        y = Y_test[start_idx:end_idx]
        exclude = 0
        if len(x) < self.batch_size:
            exclude = self.batch_size - len(x)
            x = np.pad(x, ((0, exclude), (0, 0), (0, 0)), 'constant')
            y = np.pad(y, ((0, exclude), (0, 0)), 'constant')
        p, al_un, l = sess.run(
            [self.out_layer, self.noise_out_layer, self.loss],
            feed_dict={
                self.input_pl: x,
                self.output_pl: y,
                self.dropout_pl: dropout
            })
        if self.n_neurons_aleat_unc <= 1:
            al_un = al_un.flatten()
        if exclude > 0:
            total_pred[start_idx:end_idx] = p[:-exclude]
            # total_loss += l
            total_aleat_unc_pred[start_idx:end_idx] = al_un[:-exclude]
        else:
            total_pred[start_idx:end_idx] = p
            total_loss.append(l)
            total_aleat_unc_pred[start_idx:end_idx] = al_un
    mse = np.mean(np.square(total_pred - Y_test))
    print('Test MSE Loss {:5.8f} '.format(mse))
    if not self.use_aleat_unc:
        total_aleat_unc_pred = None
    distances = paired_distances(Y_test, total_pred)
    return total_pred, total_aleat_unc_pred, distances
def reweight_by_distance(self, coords, metric='l2', copy=False):
    '''Replaces existing edge weights by distances between connected vertices.
    The new weight of edge (i,j) is given by: metric(coords[i], coords[j]).

    coords : (num_vertices x d) array of coordinates, in vertex order
    metric : str or callable, see sklearn.metrics.pairwise.paired_distances'''
    if not self.is_weighted():
        warnings.warn('Cannot supply weights for unweighted graph; '
                      'ignoring call to reweight_by_distance')
        return self
    # TODO: take advantage of symmetry of metric function
    ii, jj = self.pairs().T
    if metric == 'precomputed':
        assert coords.ndim == 2 and coords.shape[0] == coords.shape[1]
        d = coords[ii, jj]
    else:
        d = paired_distances(coords[ii], coords[jj], metric=metric)
    return self._update_edges(d, copy=copy)
def extract_feature(net, transformer, ImagePath1, ImagePath2, layer_name,
                    image_as_grey=False):
    """
    Extracts features for a given model and pair of images, and returns the
    cosine similarity between them.

    Input
        net: the loaded network.
        transformer: the data transformer for the 'data' blob.
        ImagePath1, ImagePath2: paths of the two images fed into the network.
        layer_name: the name of the layer whose output is extracted.
    """
    net.blobs['data'].reshape(2, 3, 128, 128)
    img = cv2.imread(ImagePath1)
    img1 = cv2.imread(ImagePath2)
    # Resize (and assign the result) if either image is not 128x128.
    if img.shape[:2] != (128, 128):
        img = cv2.resize(img, (128, 128))
    if img1.shape[:2] != (128, 128):
        img1 = cv2.resize(img1, (128, 128))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) / 256.0
    gray1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY) / 256.0
    net.blobs['data'].data[0] = transformer.preprocess('data', gray)
    net.blobs['data'].data[1] = transformer.preprocess('data', gray1)
    out = net.forward()
    a = net.blobs[layer_name].data[0].copy()
    b = net.blobs[layer_name].data[1].copy()
    # Cosine distance -> similarity.
    dst = pw.paired_distances(a, b, 'cosine')
    return 1 - dst
def test_paired_distances(metric, func):
    # Test the paired_distances helper function.
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((5, 4))

    S = paired_distances(X, Y, metric=metric)
    S2 = func(X, Y)
    assert_array_almost_equal(S, S2)
    S3 = func(csr_matrix(X), csr_matrix(Y))
    assert_array_almost_equal(S, S3)
    if metric in PAIRWISE_DISTANCE_FUNCTIONS:
        # Check that the pairwise_distances implementation
        # gives the same value
        distances = PAIRWISE_DISTANCE_FUNCTIONS[metric](X, Y)
        distances = np.diag(distances)
        assert_array_almost_equal(distances, S)
def _prune_edges(G, X, traj_lengths, pruning_thresh=0.1, verbose=False):
    '''Prune edges in graph G via cosine distance with trajectory edges.'''
    W = G.matrix(dense=True).copy()
    degree = G.degree(kind='out', weighted=False)
    i = 0
    num_bad = 0
    for n in traj_lengths:
        s, t = np.nonzero(W[i:i+n-1])
        graph_edges = X[t] - X[s+i]
        traj_edges = np.diff(X[i:i+n], axis=0)
        traj_edges = np.repeat(traj_edges, degree[i:i+n-1], axis=0)
        theta = paired_distances(graph_edges, traj_edges, 'cosine')
        bad_edges = theta > pruning_thresh
        s, t = s[bad_edges], t[bad_edges]
        if verbose:  # pragma: no cover
            num_bad += np.count_nonzero(W[s, t])
        W[s, t] = 0
        i += n
    if verbose:  # pragma: no cover
        print('removed %d bad edges' % num_bad)
    return Graph.from_adj_matrix(W)
econ = econ.suburb_econ()
geo = geography.suburb_geo()

suburbs = []
for item in geo:
    suburbs.append(item)

cos_values = []
ecld_dist = []
level_dist = []
gnlz_dist = []

for (suburb_one, suburb_two) in combinations(suburbs, 2):
    # economy test
    cos_values.append(1 - paired_distances(econ[suburb_one], econ[suburb_two],
                                           metric="cosine")[0])
    # population test
    # cos_values.append(1 - paired_distances(pop[suburb_one], pop[suburb_two],
    #                                        metric="cosine")[0])

    # euclidean distance: hypot(x2 - x1, y2 - y1)
    (x_one, y_one) = geo[suburb_one][0]
    (x_two, y_two) = geo[suburb_two][0]
    dist = hypot(x_one - x_two, y_one - y_two)
    ecld_dist.append(dist)
    '''
    # generalized distance based on euclidean distance
    # polar distance to GPO
    polar_dist_one = geo[suburb_one][1][0]
def linkage_tree(X, connectivity=None, n_components=None, n_clusters=None,
                 linkage='complete', affinity="euclidean",
                 return_distance=False):
    """Linkage agglomerative clustering based on a Feature matrix.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples.

    Read more in the :ref:`User Guide <hierarchical_clustering>`.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        feature matrix representing n_samples samples to be clustered

    connectivity : sparse matrix (optional).
        connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is None, i.e, the Ward algorithm is unstructured.

    n_components : int (optional)
        Number of connected components. If None the number of connected
        components is estimated from the connectivity matrix.
        NOTE: This parameter is now directly determined from the
        connectivity matrix and will be removed in 0.18

    n_clusters : int (optional)
        Stop early the construction of the tree at n_clusters. This is useful
        to decrease computation time if the number of clusters is not small
        compared to the number of samples. In this case, the complete tree is
        not computed, thus the 'children' output is of limited use, and the
        'parents' output should rather be used. This option is valid only
        when specifying a connectivity matrix.

    linkage : {"average", "complete"}, optional, default: "complete"
        Which linkage criterion to use. The linkage criterion determines
        which distance to use between sets of observation.
            - average uses the average of the distances of each observation
              of the two sets
            - complete or maximum linkage uses the maximum distances between
              all observations of the two sets.

    affinity : string or callable, optional, default: "euclidean".
        which metric to use. Can be "euclidean", "manhattan", or any
        distance known to paired_distances (see metric.pairwise)

    return_distance : bool, default False
        whether or not to return the distances between the clusters.

    Returns
    -------
    children : 2D array, shape (n_nodes-1, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf
        node and has children `children_[i - n_samples]`. Alternatively
        at the i-th iteration, children[i][0] and children[i][1]
        are merged to form node `n_samples + i`

    n_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree.

    parents : 1D array, shape (n_nodes, ) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified, otherwise 'None' is returned.

    distances : ndarray, shape (n_nodes-1,)
        Returned when return_distance is set to True.

        distances[i] refers to the distance between children[i][0] and
        children[i][1] when they are merged.
    See also
    --------
    ward_tree : hierarchical clustering with ward linkage
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    linkage_choices = {'complete': _hierarchical.max_merge,
                       'average': _hierarchical.average_merge}
    try:
        join_func = linkage_choices[linkage]
    except KeyError:
        raise ValueError(
            'Unknown linkage option, linkage should be one '
            'of %s, but %s was given' % (linkage_choices.keys(), linkage))

    if connectivity is None:
        from scipy.cluster import hierarchy  # imports PIL

        if n_clusters is not None:
            warnings.warn('Partial build of the tree is implemented '
                          'only for structured clustering (i.e. with '
                          'explicit connectivity). The algorithm '
                          'will build the full tree and only '
                          'retain the lower branches required '
                          'for the specified number of clusters',
                          stacklevel=2)

        if affinity == 'precomputed':
            # for the linkage function of hierarchy to work on precomputed
            # data, provide as first argument an ndarray of the shape returned
            # by pdist: it is a flat array containing the upper triangular of
            # the distance matrix.
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        elif affinity == 'l2':
            # Translate to something understood by scipy
            affinity = 'euclidean'
        elif affinity in ('l1', 'manhattan'):
            affinity = 'cityblock'
        elif callable(affinity):
            X = affinity(X)
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        out = hierarchy.linkage(X, method=linkage, metric=affinity)
        children_ = out[:, :2].astype(np.int)

        if return_distance:
            distances = out[:, 2]
            return children_, 1, n_samples, None, distances
        return children_, 1, n_samples, None

    if n_components is not None:
        warnings.warn(
            "n_components is now directly calculated from the connectivity "
            "matrix and will be removed in 0.18",
            DeprecationWarning)
    connectivity, n_components = _fix_connectivity(X, connectivity)

    connectivity = connectivity.tocoo()
    # Put the diagonal to zero
    diag_mask = (connectivity.row != connectivity.col)
    connectivity.row = connectivity.row[diag_mask]
    connectivity.col = connectivity.col[diag_mask]
    connectivity.data = connectivity.data[diag_mask]
    del diag_mask

    if affinity == 'precomputed':
        distances = X[connectivity.row, connectivity.col]
    else:
        # FIXME We compute all the distances, while we could have only
        # computed the "interesting" distances
        distances = paired_distances(X[connectivity.row],
                                     X[connectivity.col],
                                     metric=affinity)
    connectivity.data = distances

    if n_clusters is None:
        n_nodes = 2 * n_samples - 1
    else:
        assert n_clusters <= n_samples
        n_nodes = 2 * n_samples - n_clusters

    if return_distance:
        distances = np.empty(n_nodes - n_samples)
    # create inertia heap and connection matrix
    A = np.empty(n_nodes, dtype=object)
    inertia = list()

    # LIL seems to the best format to access the rows quickly,
    # without the numpy overhead of slicing CSR indices and data.
    connectivity = connectivity.tolil()
    # We are storing the graph in a list of IntFloatDict
    for ind, (data, row) in enumerate(zip(connectivity.data,
                                          connectivity.rows)):
        A[ind] = IntFloatDict(np.asarray(row, dtype=np.intp),
                              np.asarray(data, dtype=np.float64))
        # We keep only the upper triangular for the heap
        # Generator expressions are faster than arrays on the following
        inertia.extend(_hierarchical.WeightedEdge(d, ind, r)
                       for r, d in zip(row, data) if r < ind)
    del connectivity

    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    used_node = np.ones(n_nodes, dtype=np.intp)
    children = []

    # recursive merge loop
    for k in xrange(n_samples, n_nodes):
        # identify the merge
        while True:
            edge = heappop(inertia)
            if used_node[edge.a] and used_node[edge.b]:
                break
        i = edge.a
        j = edge.b

        if return_distance:
            # store distances
            distances[k - n_samples] = edge.weight

        parent[i] = parent[j] = k
        children.append((i, j))
        # Keep track of the number of elements per cluster
        n_i = used_node[i]
        n_j = used_node[j]
        used_node[k] = n_i + n_j
        used_node[i] = used_node[j] = False

        # update the structure matrix A and the inertia matrix
        # a clever 'min', or 'max' operation between A[i] and A[j]
        coord_col = join_func(A[i], A[j], used_node, n_i, n_j)
        for l, d in coord_col:
            A[l].append(k, d)
            # Here we use the information from coord_col (containing the
            # distances) to update the heap
            heappush(inertia, _hierarchical.WeightedEdge(d, k, l))
        A[k] = coord_col
        # Clear A[i] and A[j] to save memory
        A[i] = A[j] = 0

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples

    # return numpy array for efficient caching
    children = np.array(children)[:, ::-1]

    if return_distance:
        return children, n_components, n_leaves, parent, distances
    return children, n_components, n_leaves, parent
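# Minimal usage sketch for linkage_tree, assuming the sklearn-internal imports
# used above (_hierarchical, _fix_connectivity, IntFloatDict, heapq helpers)
# are available in this module: cluster six random points with average linkage
# and a kNN connectivity graph.
import numpy as np
from sklearn.neighbors import kneighbors_graph

X = np.random.RandomState(0).rand(6, 2)
connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False)
children, n_components, n_leaves, parents = linkage_tree(
    X, connectivity=connectivity, linkage='average')
print(children)  # (n_samples - 1, 2) merge tree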