def compress(self, X, n_components, n_neighbours):
    """Embed the rows of ``X`` using shortest-path distances on a k-NN graph.

    Parameters
    ----------
    X : array whose rows are the examples to embed
    n_components : unused; the output dimension is read from ``self.k``
    n_neighbours : unused; the neighbour count is read from
        ``self.numNeighbours``

    Returns
    -------
    Z : the minimiser's solution reshaped to (n, self.k)
    """
    n = X.shape[0]
    k = self.k
    numNeighbours = self.numNeighbours

    # Pairwise Euclidean distances between all rows of X.
    euclD = np.sqrt(utils.euclidean_dist_squared(X, X))

    # Weighted nearest-neighbour graph; entries left at zero carry no edge
    # weight.
    knnD = np.zeros((n, n))
    for i in range(n):
        # +1 because a point always selects itself (distance 0) and
        # distances are non-negative.
        minIndexes = np.argsort(euclD[i])[:numNeighbours + 1]
        knnD[i, minIndexes] = euclD[i, minIndexes]
        # Mirror every edge so the graph is undirected; otherwise the path
        # length from i to j can differ from the one from j to i.
        knnD[minIndexes, i] = euclD[minIndexes, i]

    # Path length between every ordered pair, using only graph edges.
    D = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            if i != j:
                D[i, j] = utils.dijkstra(knnD, i, j)

    # If the helper reports unreachable pairs as inf, cap them at the largest
    # finite distance so the objective stays finite. The zero diagonal
    # guarantees at least one finite entry.
    unreachable = np.isinf(D)
    D[unreachable] = D[~unreachable].max()

    # Start the optimisation from a PCA projection.
    Z = AlternativePCA(k).fit(X).compress(X)
    z = find_min(self._fun_obj_z, Z.flatten(), 500, False, D)
    Z = z.reshape(n, k)
    return Z
def compress(self, X):
    """Embed the rows of ``X`` using shortest-path distances on a k-NN graph.

    Parameters
    ----------
    X : array whose rows are the examples to embed

    Returns
    -------
    Z : the minimiser's solution reshaped to (n, self.k)
    """
    n = X.shape[0]
    k = self.k
    K = self.K

    # Compute Euclidean distances
    D = np.sqrt(utils.euclidean_dist_squared(X, X))

    # Columns 1..K of the per-row ordering; column 0 is skipped, presumably
    # because it is the point itself at distance 0.
    nbrs = np.argsort(D, axis=1)[:, 1:K + 1]
    G = np.zeros((n, n))
    for i in range(n):
        # Write each edge in both directions so the graph is symmetric.
        G[i, nbrs[i]] = D[i, nbrs[i]]
        G[nbrs[i], i] = D[nbrs[i], i]

    D = utils.dijkstra(G)

    # Replace infinite entries with the largest finite one. The maximum is
    # taken with the infinite entries masked to -inf, which is what the
    # earlier in-place +inf -> -inf swap achieved, without shadowing the
    # builtin ``max``.
    unreachable = np.isinf(D)
    largest_finite = np.max(np.where(unreachable, -np.inf, D))
    D[unreachable] = largest_finite

    # Initialize low-dimensional representation with PCA
    Z = PCA(k).fit(X).compress(X)

    # Solve for the minimizer
    z = find_min(self._fun_obj_z, Z.flatten(), 500, False, D)
    Z = z.reshape(n, k)
    return Z
def fit(self, X):
    """Fit ``self.k`` cluster means to the rows of ``X``.

    Means start at randomly drawn rows of ``X`` and are refined by
    alternating assignment and averaging until no assignment changes.
    The result is stored in ``self.means``; nothing is returned.
    """
    n_examples, n_features = X.shape
    assignments = np.ones(n_examples)

    means = np.zeros((self.k, n_features))
    for cluster in range(self.k):
        means[cluster] = X[np.random.randint(n_examples)]
    self.means = means

    converged = False
    while not converged:
        previous = assignments

        # Assign every example to its closest mean; NaN distances are
        # treated as infinitely far away.
        dist2 = euclidean_dist_squared(X, means)
        dist2[np.isnan(dist2)] = np.inf
        assignments = np.argmin(dist2, axis=1)

        # Recompute each mean. A mean with no assigned examples is left
        # where it is (one of several possible approaches).
        for cluster in range(self.k):
            members = assignments == cluster
            if np.any(members):
                means[cluster] = X[members].mean(axis=0)

        changes = np.sum(assignments != previous)

        # The return value is discarded here; the call is kept as-is.
        self.error(X)

        # Stop once no example changed cluster.
        converged = changes == 0

    self.means = means
def compress(self, X):
    """Embed the rows of ``X`` from shortest-path distances on a k-NN graph.

    Returns the minimiser's solution reshaped to (n, self.k).
    """
    n = X.shape[0]

    # Pairwise Euclidean distances and, per row, the ordering of the others.
    euclid = np.sqrt(utils.euclidean_dist_squared(X, X))
    order = np.argsort(euclid)

    # Symmetric graph linking each row to its first self.nn + 1 entries in
    # the ordering (the point itself is normally among them at distance 0).
    graph = np.zeros((n, n))
    for i in range(euclid.shape[0]):
        for rank in range(self.nn + 1):
            j = order[i, rank]
            graph[i, j] = euclid[i, j]
            graph[j, i] = euclid[j, i]

    # Infinite shortest paths are capped at the largest finite one.
    dist = utils.dijkstra(graph)
    infinite = np.isinf(dist)
    dist[infinite] = dist[~infinite].max()

    # Initialize low-dimensional representation with PCA
    pca = PCA(self.k)
    pca.fit(X)
    start = pca.compress(X)

    # Solve for the minimizer
    z, _ = findMin(self._fun_obj_z, start.flatten(), 500, dist)
    return z.reshape(n, self.k)
def predict(self, Xtest):
    """Predict 0/1 labels for the rows of ``Xtest`` by nearest-neighbour vote.

    Parameters
    ----------
    Xtest : array whose rows are the examples to classify

    Returns
    -------
    out : float array with one entry per test row; 1 where more than half
        of the neighbours used have label 1, otherwise 0 (ties favour 0)
    """
    # Test rows come first so each row of the result lists distances from
    # one test example to every training example.
    dist_squared = utils.euclidean_dist_squared(Xtest, self.X)

    # Indices of the self.k closest training examples per test row. The
    # slice yields fewer columns when fewer training examples exist.
    nearest = np.argsort(dist_squared)[:, :self.k]

    # Look up all neighbour labels at once; labels are cast to int as the
    # vote is a count of ones.
    labels = np.asarray(self.y).ravel()[nearest].astype(int)
    votes = labels.sum(axis=1)

    # Strict majority of the neighbours actually used, so the threshold
    # stays correct when fewer than self.k neighbours are available.
    return (votes > nearest.shape[1] / 2).astype(float)
def fit(self, X, y):
    """
    Keep only the training examples that the examples kept so far
    misclassify, and store them in ``self.X`` / ``self.y``.

    Parameters
    ----------
    X : an N by D numpy array
    y : an N by 1 numpy array of integers in {1,2,3,...,c}
    """
    # The first example always seeds the condensed set.
    Xcondensed, ycondensed = X[0:1, :], y[0:1]

    for i in range(1, len(X)):
        candidate = X[i:i + 1, :]

        # Rank the kept examples by distance to the candidate.
        dist2 = utils.euclidean_dist_squared(Xcondensed, candidate)
        ranked = np.argsort(dist2[:, 0])
        n_votes = min(self.k, len(Xcondensed))
        yhat = utils.mode(ycondensed[ranked[:n_votes]])

        # Correctly classified candidates are skipped.
        if yhat == y[i]:
            continue
        Xcondensed = np.append(Xcondensed, candidate, 0)
        ycondensed = np.append(ycondensed, y[i])

    self.X = Xcondensed
    self.y = ycondensed
    print(self.y.shape[0])
def compress(self, X):
    """Embed the rows of ``X`` from shortest-path distances on a k-NN graph.

    Returns the minimiser's solution reshaped to (n, self.k).
    """
    n = X.shape[0]

    # Compute Euclidean distances (D is symmetric).
    D = np.sqrt(utils.euclidean_dist_squared(X, X))

    geoD = np.zeros((n, n))
    for i in range(n):
        ranked = np.argsort(D[:, i])
        # The self.nn + 1 closest indices with i itself removed.
        neigh = np.setdiff1d(ranked[:self.nn + 1], i)
        geoD[i, neigh] = D[i, neigh]
        geoD[neigh, i] = D[neigh, i]

    D = utils.dijkstra(geoD)

    # Disconnected vertices (distance Inf) get the largest finite distance
    # in the graph, to encourage them to be far away from each other.
    disconnected = np.isinf(D)
    D[disconnected] = D[~disconnected].max()

    # Initialize low-dimensional representation with PCA
    pca = PCA(self.k)
    pca.fit(X)
    start = pca.compress(X)

    # Solve for the minimizer
    z, _ = findMin(self._fun_obj_z, start.flatten(), 500, D)
    return z.reshape(n, self.k)
def fit(X, y, k):
    """
    Build a condensed k-NN model: an example is kept only if the examples
    kept so far misclassify it.

    Parameters
    ----------
    X : an N by D numpy array
    y : an N by 1 numpy array of integers in {1,2,3,...,c}
    k : the k in k-NN

    Returns
    -------
    model : dict with keys 'X' and 'y' (the condensed set), 'k', and
        'predict'
    """
    N = X.shape[0]

    # The first example seeds the condensed set.
    Xcond = X[0, None]
    ycond = y[0, None]

    # Start at 1: example 0 is already in the set.
    for i in range(1, N):
        # Keep the candidate 2-D so the distance helper receives two
        # matrices, as in its other call sites.
        x_i = X[i, None]
        dist = utils.euclidean_dist_squared(Xcond, x_i)
        order = np.argsort(dist[:, 0])

        # Slicing clamps to the current set size, so no explicit min() is
        # needed while the set is still smaller than k.
        votes = ycond[order[:k]]

        # np.ravel makes this work whether stats.mode returns a scalar or
        # an array for the mode (this differs between SciPy versions).
        y_pred = np.ravel(stats.mode(votes)[0])[0]

        # Keep the example only if the current subset misclassifies it.
        if y_pred != y[i]:
            Xcond = np.append(Xcond, x_i, axis=0)
            ycond = np.append(ycond, y[i, None], axis=0)

    model = dict()
    model['X'] = Xcond
    model['y'] = ycond
    model['k'] = k
    model['predict'] = predict
    return model
def compress(self, X):
    """Embed the rows of ``X`` from graph-based distances.

    Returns the minimiser's solution reshaped to (n, self.k).
    """
    n = X.shape[0]

    # Compute Euclidean distances
    D = np.sqrt(utils.euclidean_dist_squared(X, X))

    #TODO:
    D = self.construct_dist_graph(X, D)

    # If two points are disconnected (distance is Inf), set their distance
    # to the maximum finite distance in the graph, to encourage them to be
    # far apart.
    disconnected = np.isinf(D)
    D[disconnected] = D[~disconnected].max()

    # Initialize low-dimensional representation with PCA
    pca = PCA(self.k)
    pca.fit(X)
    start = pca.compress(X)

    # Solve for the minimizer
    z, _ = findMin(self._fun_obj_z, start.flatten(), 500, D)
    return z.reshape(n, self.k)
def compress(self, X):
    """Embed the rows of ``X`` from shortest-path distances on a k-NN graph.

    Returns the minimiser's solution reshaped to (n, self.k).
    """
    n = X.shape[0]

    # Compute Euclidean distances
    D = np.sqrt(utils.euclidean_dist_squared(X, X))

    # Convert these Euclidean distances into geodesic distances.
    # NOTE(review): self.k is used here as the neighbour count and below as
    # the PCA dimension — confirm that sharing one attribute is intended.
    order = np.argsort(D, axis=1)[:, :self.k + 1]

    # 0/1 mask that keeps an edge if either endpoint lists the other.
    rows = np.arange(n)[:, None]
    distance_mask = np.zeros(D.shape)
    distance_mask[rows, order] = 1
    distance_mask[order, rows] = 1

    D = utils.dijkstra(D * distance_mask)

    # If two points are disconnected (distance is Inf), set their distance
    # to the maximum finite distance in the graph, to encourage them to be
    # far apart.
    disconnected = np.isinf(D)
    D[disconnected] = D[~disconnected].max()

    # Initialize low-dimensional representation with PCA
    pca = PCA(self.k)
    pca.fit(X)
    start = pca.transform(X)

    # Solve for the minimizer
    z, _ = findMin(self._fun_obj_z, start.flatten(), 500, D)
    return z.reshape(n, self.k)
def compress(self, X):
    """Embed the rows of ``X`` from shortest-path distances on a k-NN graph.

    Parameters
    ----------
    X : array whose rows are the examples to embed

    Returns
    -------
    Z : the minimiser's solution reshaped to (n, self.k)
    """
    n = X.shape[0]

    # Compute Euclidean distances
    D = np.sqrt(utils.euclidean_dist_squared(X, X))

    # Convert these Euclidean distances into geodesic distances
    order = np.argsort(D)
    G = np.zeros((n, n))
    for i in range(n):
        # Take self.nn + 1 entries: each row's ordering normally starts with
        # the point itself (distance 0), so taking only self.nn would link
        # just self.nn - 1 real neighbours.
        neighbours = order[i, :self.nn + 1]
        G[i, neighbours] = D[i, neighbours]
        G[neighbours, i] = D[neighbours, i]

    D = utils.dijkstra(G)

    # If two points are disconnected (distance is Inf), set their distance
    # to the maximum finite distance in the graph, to encourage them to be
    # far apart.
    disconnected = np.isinf(D)
    D[disconnected] = D[~disconnected].max()

    # Initialize low-dimensional representation with PCA
    pca = PCA(self.k)
    pca.fit(X)
    Z = pca.transform(X)

    # Solve for the minimizer
    z, f = findMin(self._fun_obj_z, Z.flatten(), 500, D)
    Z = z.reshape(n, self.k)
    return Z
def error(self, X, means=None):
    """Return the summed (non-squared) distance from each row of ``X`` to
    its closest mean.

    ``means`` defaults to ``self.means`` when not given.
    """
    centres = self.means if means is None else means
    dist = np.sqrt(euclidean_dist_squared(X, centres))
    closest = np.amin(dist, axis=1)
    return np.sum(closest)
def fit(self, X):
    """Fit ``self.k`` cluster means to the rows of ``X``.

    Means start at randomly drawn rows of ``X`` and are refined by
    alternating assignment and averaging until no assignment changes.

    Parameters
    ----------
    X : 2-D array whose rows are the examples to cluster

    Returns
    -------
    means : array of shape (self.k, D); also stored in ``self.means``
    """
    N, D = X.shape
    y = np.ones(N)

    means = np.zeros((self.k, D))
    for kk in range(self.k):
        means[kk] = X[np.random.randint(N)]

    while True:
        y_old = y

        # Compute euclidean distance to each mean; NaN distances are
        # treated as infinitely far away.
        dist2 = euclidean_dist_squared(X, means)
        dist2[np.isnan(dist2)] = np.inf
        y = np.argmin(dist2, axis=1)

        # Update means. A cluster with no members keeps its previous mean:
        # averaging an empty selection would turn it into NaN for good.
        for kk in range(self.k):
            members = y == kk
            if np.any(members):
                means[kk] = X[members].mean(axis=0)

        self.means = means

        # Stop if no point changed cluster
        if np.sum(y != y_old) == 0:
            # The result was never used; the call itself is kept unchanged.
            self.error(X)
            break

    return means
def predict(self, Xtest):
    """Predict a 0/1 value for each label of each row of ``Xtest``.

    For every test row the ``self.k`` training rows with the smallest
    score under ``self.method`` vote on each of the first ``self.labels``
    label columns.

    Parameters
    ----------
    Xtest : 2-D array whose rows are the examples to classify

    Returns
    -------
    y_pred : array of shape (T, self.y.shape[1]) holding 0s and 1s

    Raises
    ------
    ValueError
        If ``self.method`` is not "L2", "cosine" or "pearson".
    """
    T, D = Xtest.shape

    if self.method == "L2":
        distance = utils.euclidean_dist_squared(self.X, Xtest)
    elif self.method == "cosine":
        distance = utils.cosine_similarity(self.X, Xtest)
    elif self.method == "pearson":
        distance = utils.pearson_corr(self.X, Xtest)
    else:
        # Fail with a clear message instead of an unbound-variable error
        # further down.
        raise ValueError("unknown method: {!r}".format(self.method))

    y_pred = np.zeros((T, self.y.shape[1]))
    for t in range(T):
        # NOTE(review): argsort is ascending, so the "cosine" and "pearson"
        # branches pick the smallest helper values — confirm those helpers
        # return distances rather than similarities.
        nearest = np.argsort(distance[:, t])[:self.k]

        # Fraction of the selected neighbours with each label set; this
        # estimates P(y_j = 1 | x) for all labels at once.
        p = (1 / self.k) * np.sum(self.y[nearest, :self.labels], axis=0)
        y_pred[t, :self.labels] = p > 0.5

    return y_pred
def compress(self, X):
    """Embed the rows of ``X`` from shortest-path distances on the graph
    built from ``self.knn(X)``.

    Returns the minimiser's solution reshaped to (n, self.k).
    """
    n = X.shape[0]

    # Compute Euclidean distances
    D = np.sqrt(utils.euclidean_dist_squared(X, X))

    # Symmetric weighted graph over the neighbours reported by self.knn.
    graph = np.zeros((n, n))
    neighbour_lists = self.knn(X)
    for i, neighbours in enumerate(neighbour_lists):
        for j in neighbours:
            graph[i, j] = D[i, j]
            graph[j, i] = D[j, i]

    # If two points are disconnected (distance is Inf), set their distance
    # to the maximum finite distance in the graph, to encourage them to be
    # far apart.
    geodesic = utils.dijkstra(graph)
    disconnected = np.isinf(geodesic)
    geodesic[disconnected] = geodesic[~disconnected].max()

    # Initialize low-dimensional representation with PCA
    start = PCA(self.k).fit(X).compress(X)

    # Solve for the minimizer
    z = find_min(self._fun_obj_z, start.flatten(), 500, False, geodesic)
    return z.reshape(n, self.k)
def predict(self, Xtest):
    """Return the most common training label among the ``self.k`` nearest
    training rows of every row of ``Xtest``.

    The result is the mode array from ``stats.mode``, returned as-is.
    """
    dist2 = utils.euclidean_dist_squared(self.X, Xtest)

    # Sort down each column (one column per test row) and keep the indices
    # of the first self.k training rows.
    nearest = np.argsort(dist2, axis=0)[:self.k, :]

    neighbour_labels = self.y[nearest]
    return stats.mode(neighbour_labels)[0]
def predict(self, Xtest):
    """Predict one label per row of ``Xtest`` by k-nearest-neighbour vote.

    Parameters
    ----------
    Xtest : 2-D array whose rows are the examples to classify

    Returns
    -------
    y_pred : float array with exactly one entry per test row, the most
        common label among its ``self.k`` nearest training rows
    """
    # Distances between rows of self.X and rows of Xtest; each column
    # belongs to one test row.
    dist_squared = utils.euclidean_dist_squared(self.X, Xtest)

    # Sort down each column and keep the self.k closest training indices.
    nearest = np.argsort(dist_squared, axis=0)[:self.k]

    n, d = Xtest.shape
    y = np.asarray(self.y)
    y_pred = np.empty(n)
    for i in range(n):
        y_neighbors = np.ravel(y[nearest[:, i]])
        # stats.mode returns a (mode, count) pair. Only the mode is the
        # prediction; appending the whole result used to add the count to
        # the output as well. np.ravel handles a scalar or array mode.
        y_pred[i] = np.ravel(stats.mode(y_neighbors)[0])[0]

    return y_pred
def compress(self, X):
    """Embed the rows of ``X`` from shortest-path distances on a k-NN graph.

    Returns the minimiser's solution reshaped to (n, self.k).
    """
    n = X.shape[0]

    # Compute Euclidean distances
    D = np.sqrt(utils.euclidean_dist_squared(X, X))

    # Construct nearest neighbour graph, written symmetrically.
    G = np.zeros([n, n])
    for i in range(n):
        closest = np.argsort(D[i])[:self.nn + 1]
        G[i, closest] = D[i, closest]
        G[closest, i] = D[closest, i]

    # Compute ISOMAP distances
    D = utils.dijkstra(G)

    # If two points are disconnected (distance is Inf), set their distance
    # to the maximum finite distance in the graph, to encourage them to be
    # far apart.
    disconnected = np.isinf(D)
    D[disconnected] = D[~disconnected].max()

    # Initialize low-dimensional representation with PCA
    pca = PCA(self.k)
    pca.fit(X)
    start = pca.compress(X)

    # Solve for the minimizer
    z, _ = findMin(self._fun_obj_z, start.flatten(), 500, D)
    return z.reshape(n, self.k)
def error(self, X):
    """Return the summed squared distance from each row of ``X`` to the
    mean that ``self.predict`` assigns it to."""
    # Index of the assigned mean for every row.
    assigned = self.predict(X)

    total = 0
    n_means = self.means.shape[0]
    for cluster in range(n_means):
        members = X[assigned == cluster]
        centre = self.means[[cluster]]  # double brackets keep it 2-D
        total += np.sum(euclidean_dist_squared(members, centre))
    return total
def error(self, X):
    """Return the summed squared distance from each row of ``X`` to its
    closest entry of ``self.means``."""
    dist2 = euclidean_dist_squared(X, self.means)
    closest = dist2.min(axis=1)
    return np.sum(closest)
def error(self, X):
    """Return the summed (non-squared) distance from each row of ``X`` to
    the mean that ``self.predict`` assigns it to."""
    assigned = self.predict(X)
    dist2 = euclidean_dist_squared(X, self.means)

    total = 0
    for i, cluster in enumerate(assigned):
        # Square root of the squared distance to the assigned mean.
        total += dist2[i][cluster] ** .5
    return total
def error(self, X):
    """Return the summed squared distance from each row of ``X`` to the
    mean that ``self.predict`` assigns it to."""
    centres = self.means
    labels = self.predict(X)

    total = 0
    for idx in range(centres.shape[0]):
        in_cluster = labels == idx
        # centres[[idx]] keeps the single mean as a one-row 2-D array.
        sq_dists = euclidean_dist_squared(X[in_cluster], centres[[idx]])
        total += np.sum(sq_dists)
    return total
def predict(self, X):
    """
    Return, for every row of ``X``, the index of the entry of
    ``self.medians`` with the smallest squared Euclidean distance.
    """
    dist2 = utils.euclidean_dist_squared(X, self.medians)

    # NaN distances are treated as infinitely far away.
    invalid = np.isnan(dist2)
    dist2[invalid] = np.inf

    closest = np.argmin(dist2, axis=1)
    return closest
def predict(self, Xtest):
    """Predict one label per row of ``Xtest`` by k-nearest-neighbour vote.

    Parameters
    ----------
    Xtest : array whose rows are the examples to classify

    Returns
    -------
    y_pred : float array of length T, the most common label among the
        ``self.k`` nearest training rows of each test row
    """
    T = Xtest.shape[0]

    # Each column of dists belongs to one test row.
    dists = utils.euclidean_dist_squared(self.X, Xtest)
    nearest = np.argsort(dists, axis=0)[:self.k]

    y_pred = np.empty(T)
    for ti in range(T):
        votes = self.y[nearest[:, ti]]
        # np.ravel accepts both a scalar mode (newer SciPy, 1-D input) and
        # an array mode (older SciPy); plain [0][0] fails on the former.
        y_pred[ti] = np.ravel(stats.mode(votes)[0])[0]
    return y_pred
def predict(model, X):
    """Return, for every row of ``X``, the index of the closest entry of
    ``model['means']`` under squared Euclidean distance."""
    dist2 = utils.euclidean_dist_squared(X, model['means'])

    # NaN distances are treated as infinitely far away.
    invalid = np.isnan(dist2)
    dist2[invalid] = np.inf

    closest = np.argmin(dist2, axis=1)
    return closest
def error(self, X):
    """Return the summed squared distance from each row of ``X`` to the
    mean that ``self.predict`` assigns it to."""
    centres = self.means
    assignment = self.predict(X)

    tot_dist_error = 0
    for kk in range(centres.shape[0]):
        members = X[assignment == kk]
        # centres[[kk]] keeps the single mean as a one-row 2-D array.
        per_point = utils.euclidean_dist_squared(members, centres[[kk]])
        tot_dist_error += np.sum(per_point)
    return tot_dist_error
def error(self, X):
    """Return the summed squared distance from each row of ``X`` to the
    mean that ``self.predict`` assigns it to."""
    n_rows, _ = X.shape

    # Squared distances to every mean; NaN entries count as infinite.
    dist2 = euclidean_dist_squared(X, self.means)
    dist2[np.isnan(dist2)] = np.inf

    assigned = self.predict(X)

    total = 0
    for row in range(n_rows):
        total += dist2[row, assigned[row]]
    return total
def predict(self, Xtest):
    """Predict one label per row of ``Xtest`` as the mode of the labels of
    its ``self.k`` nearest training rows."""
    n_test = Xtest.shape[0]
    y_pred = np.zeros(n_test)

    # Row n holds training indices ordered by distance to test row n.
    neighbour_order = np.argsort(
        utils.euclidean_dist_squared(Xtest, self.X))

    for n in range(n_test):
        closest = neighbour_order[n, 0:self.k]
        y_pred[n] = utils.mode(self.y[closest])
    return y_pred
def predict(model, Xtest):
    """Predict one label per row of ``Xtest`` by k-nearest-neighbour vote.

    Parameters
    ----------
    model : dict with keys 'X' (training rows), 'y' (their labels) and
        'k' (number of neighbours)
    Xtest : array whose rows are the examples to classify

    Returns
    -------
    yhat : array with the most common label among the k nearest training
        rows of each test row; ties go to the smallest label
    """
    X = model['X']
    y = model['y']
    k = model['k']

    # Each column of D belongs to one test row; keep the indices of its k
    # closest training rows.
    D = utils.euclidean_dist_squared(X, Xtest)
    nearest = np.argsort(D, axis=0)[0:k, :]
    Y = y[nearest]

    def most_common(labels):
        # np.unique returns sorted values, so argmax picks the smallest
        # label among equally frequent ones.
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]

    # Majority vote per test row. Taking the maximum label instead would
    # only be right for k == 1.
    yhat = np.array(
        [most_common(Y[:, t]) for t in range(Y.shape[1])], dtype=Y.dtype)
    return yhat
def error(self, X):
    """Return the summed squared Euclidean distance from each row of ``X``
    to the median that ``self.predict`` assigns it to."""
    centres = self.medians
    assigned = self.predict(X)

    total = 0
    for idx in range(centres.shape[0]):
        members = X[assigned == idx]
        # centres[[idx]] keeps the single median as a one-row 2-D array.
        sq_dists = utils.euclidean_dist_squared(members, centres[[idx]])
        total += np.sum(sq_dists)
    return total