def get_centroids(picklepath, radius=0.2):
    """Cluster all vdMs by iFG location (backbone rel_vdms) or by sidechain + iFG
    location (sc rel_vdms) and output new pickle files to a directory
    picklepath/clustered."""
    if picklepath[-1] != '/':
        picklepath += '/'
    if os.path.isdir(picklepath):
        for pickletype in listdir(picklepath):
            if pickletype == 'PHI_PSI':
                for phipsi_type in listdir(picklepath + pickletype):
                    for picklefile in listdir(picklepath + pickletype + '/' + phipsi_type + '/pickle/'):
                        with open(picklepath + pickletype + '/' + phipsi_type + '/pickle/' + picklefile, 'rb') as infile:
                            pick = pickle.load(infile)
                        # Make sure the output directory exists before either branch writes to it.
                        outpath = picklepath + 'clustered/' + pickletype + '/' + phipsi_type + '/pickle/'
                        try:
                            os.makedirs(outpath)
                        except OSError:
                            pass
                        if len(pick.shape) == 1:
                            with open(outpath + picklefile, 'wb') as outfile:
                                pickle.dump(pick, outfile)
                        else:
                            ifg_flat = [coords.flatten() for coords in pick[:, -2]]
                            nbrs = NearestNeighbors(metric='euclidean', radius=radius)
                            nbrs.fit(ifg_flat)
                            adj_mat = nbrs.radius_neighbors_graph(ifg_flat)
                            mems, cents = cluster_adj_mat(adj_mat)
                            with open(outpath + picklefile, 'wb') as outfile:
                                pickle.dump(pick[cents, :], outfile)
            else:
                for picklefile in listdir(picklepath + pickletype + '/pickle/'):
                    with open(picklepath + pickletype + '/pickle/' + picklefile, 'rb') as infile:
                        pick = pickle.load(infile)
                    outpath = picklepath + 'clustered/' + pickletype + '/pickle/'
                    try:
                        os.makedirs(outpath)
                    except OSError:
                        pass
                    if len(pick.shape) == 1:
                        with open(outpath + picklefile, 'wb') as outfile:
                            pickle.dump(pick, outfile)
                    else:
                        if pickletype == 'SC':
                            sc_flat = [coords.flatten() for coords in pick[:, -3]]
                            ifg_flat = [coords.flatten() for coords in pick[:, -2]]
                            sc_ifg_flat = np.hstack((sc_flat, ifg_flat))
                            nbrs = NearestNeighbors(metric='euclidean', radius=radius)
                            nbrs.fit(sc_ifg_flat)
                            adj_mat = nbrs.radius_neighbors_graph(sc_ifg_flat)
                        else:
                            ifg_flat = [coords.flatten() for coords in pick[:, -2]]
                            nbrs = NearestNeighbors(metric='euclidean', radius=radius)
                            nbrs.fit(ifg_flat)
                            adj_mat = nbrs.radius_neighbors_graph(ifg_flat)
                        mems, cents = cluster_adj_mat(adj_mat)
                        with open(outpath + picklefile, 'wb') as outfile:
                            pickle.dump(pick[cents, :], outfile)
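# Note: get_centroids above (and make_adj_mat_no_superpose further down) delegate the
# actual grouping of the radius_neighbors_graph adjacency matrix to a cluster_adj_mat
# helper that is not included in this collection. The sketch below is a hypothetical,
# minimal greedy implementation written for illustration only -- it assumes the helper
# returns (cluster member index arrays, centroid indices) and is not the original code.
import numpy as np
from scipy.sparse import csr_matrix


def cluster_adj_mat(adj_mat):
    """Greedily cluster points from a sparse radius-neighbors adjacency matrix.

    Repeatedly pick the unassigned point with the most unassigned neighbors as a
    centroid, assign those neighbors to its cluster, and remove them from the pool.
    """
    adj = csr_matrix(adj_mat).tolil()
    unassigned = set(range(adj.shape[0]))
    mems, cents = [], []
    while unassigned:
        # Candidate centroid: the point covering the most still-unassigned neighbors.
        counts = {i: sum(1 for j in adj.rows[i] if j in unassigned) for i in unassigned}
        cent = max(counts, key=counts.get)
        members = np.array(sorted({j for j in adj.rows[cent] if j in unassigned} | {cent}))
        mems.append(members)
        cents.append(cent)
        unassigned.difference_update(members.tolist())
    return mems, cents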
def RadiusNeighborhoodGraph(X, r):
    neighbor = NearestNeighbors(radius=r)
    neighbor.fit(X)
    adj_matrix = neighbor.radius_neighbors_graph(X)
    dist_matrix = neighbor.radius_neighbors_graph(X, mode='distance')  # symmetric matrix
    adj_matrix = adj_matrix.toarray()
    dist_matrix = dist_matrix.toarray()
    return adj_matrix, dist_matrix
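# A small usage sketch for RadiusNeighborhoodGraph above; the toy data are
# illustrative only. adj[i, j] == 1 where points i and j lie within r of each
# other (each point counts as its own neighbor here, since X is passed to
# radius_neighbors_graph), and dist holds the corresponding Euclidean distances.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.rand(50, 2)             # hypothetical 2-D point cloud

adj, dist = RadiusNeighborhoodGraph(X_demo, r=0.2)
print(adj.shape, dist.shape)         # (50, 50) (50, 50)
print(int(adj[0].sum()))             # neighbor count of point 0, including itself
print(dist[0, adj[0] > 0])           # distances to those neighbors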
def test_cnn_sparse_precomputed_different_eps():
    # test that precomputed neighbors graph is filtered if computed with
    # a radius larger than eps.
    lower_eps = 0.2
    nn = NearestNeighbors(radius=lower_eps).fit(X)
    D_sparse = nn.radius_neighbors_graph(X, mode="distance")
    cnn_lower = commonnn(D_sparse, eps=lower_eps, metric="precomputed")

    higher_eps = lower_eps + 0.7
    nn = NearestNeighbors(radius=higher_eps).fit(X)
    D_sparse = nn.radius_neighbors_graph(X, mode="distance")
    cnn_higher = commonnn(D_sparse, eps=lower_eps, metric="precomputed")

    assert_array_equal(cnn_lower, cnn_higher)
def test_dbscan_sparse_precomputed_different_eps():
    # test that precomputed neighbors graph is filtered if computed with
    # a radius larger than DBSCAN's eps.
    lower_eps = 0.2
    nn = NearestNeighbors(radius=lower_eps).fit(X)
    D_sparse = nn.radius_neighbors_graph(X, mode='distance')
    dbscan_lower = dbscan(D_sparse, eps=lower_eps, metric='precomputed')

    higher_eps = lower_eps + 0.7
    nn = NearestNeighbors(radius=higher_eps).fit(X)
    D_sparse = nn.radius_neighbors_graph(X, mode='distance')
    dbscan_higher = dbscan(D_sparse, eps=lower_eps, metric='precomputed')

    assert_array_equal(dbscan_lower[0], dbscan_higher[0])
    assert_array_equal(dbscan_lower[1], dbscan_higher[1])
def _neigh_internal(hs_reps1, hs_reps2, dist_cst, tol):
    nbrs_high = NearestNeighbors(metric='euclidean', radius=dist_cst + tol)
    nbrs_high.fit(hs_reps2)
    adj_mat_high = nbrs_high.radius_neighbors_graph(hs_reps1, mode='distance')
    lowtol = dist_cst - tol
    wh = (adj_mat_high > lowtol).nonzero()
    return zip(wh[0], wh[1])
def make_adj_mat_no_superpose(self):
    num_atoms = len(self.pdb_coords[0])
    nbrs = NearestNeighbors(radius=self.rmsd_cutoff * np.sqrt(num_atoms))
    nbrs_coords = np.array([s.getCoords().flatten() for s in self.pdb_coords])
    nbrs.fit(nbrs_coords)
    self.adj_mat = nbrs.radius_neighbors_graph(nbrs_coords)
    self._adj_mat = True
def assign_species_names():
    global agents, n_born, color_changer
    X = [agent.dna for agent in agents]
    # for agent in agents:
    #     print(agent)
    # print(X)
    nn = NearestNeighbors(metric='cosine', algorithm='brute')
    nn.fit(X)
    nbrs_graph = nn.radius_neighbors_graph(X, radius=0.001)
    _, connected_components = scipy.sparse.csgraph.connected_components(nbrs_graph)
    # print('HI')
    # print(connected_components)
    for agent, cluster in zip(agents, connected_components):
        agent.update_species_name('species%d' % cluster)
    if n_born == 0:
        color_changer = Pipeline([('pca', KernelPCA(n_components=3, kernel="cosine")),
                                  ('minmax', MinMaxScaler())])
        color_changer.fit(X)
    colors = (MinMaxScaler().fit_transform(color_changer.transform(X) * 255.) * 255.).astype(int)
    # (MinMaxScaler().fit_transform(KernelPCA(n_components=3, kernel="cosine").fit_transform(X)) * 255).astype(int).tolist()
    # print(colors)
    for agent, color in zip(agents, colors):
        print(color)
        agent.color = color
def _build_graph(self, X):
    """Construction of the connectivity graph G."""
    neighbors = NearestNeighbors(algorithm=self.neighbors_algorithm,
                                 metric=self.metric,
                                 n_jobs=self.n_jobs).fit(X)
    if self.neighborhood_method == "knn":
        # TODO: assert n_neighbors < number of points
        G = neighbors.kneighbors_graph(n_neighbors=self.n_neighbors, mode="distance")
    elif self.neighborhood_method == "eps_ball":
        # TODO: assert eps is not None
        G = neighbors.radius_neighbors_graph(radius=self.eps, mode="distance")
    else:
        raise ValueError("Unrecognized method of neighborhood selection='{0}'"
                         "".format(self.neighborhood_method))
    G_sym = csr_matrix.maximum(G, G.T.tocsr())
    G_sym.data = self._kernel(G_sym.data, sigma=self.sigma)
    G.data = self._kernel(G.data, sigma=self.sigma)
    return G_sym
def clusterHMM(self, minMagnitude=10, treeR=22, leafNum=190, neighborR=22,
               timeScale=10, eps=18, minPts=170):
    if os.path.exists(self.clusterDirectory + 'Labels.npy') and not self.rewrite:
        print('Cluster label file already exists. Will not recalculate it unless rewrite flag is True')
        return
    try:
        self.obj
    except AttributeError:
        self.obj = HMMdata(filename=self.hmmFile)
    print('Identifying raw coordinate positions for cluster analysis', file=sys.stderr)
    if os.path.isfile(self.clusterDirectory + 'RawCoords.npy'):
        self.coords = np.load(self.clusterDirectory + 'RawCoords.npy')
    else:
        self.coords = self.obj.retDBScanMatrix(minMagnitude)
        np.save(self.clusterDirectory + 'RawCoords.npy', self.coords)
    print('Calculating nearest neighbors and pairwise distances between clusters', file=sys.stderr)
    if os.path.isfile(self.clusterDirectory + 'PairwiseDistances.npz'):
        # The matrix was written with scipy.sparse.save_npz, so read it back with
        # scipy.sparse.load_npz (np.load would only return an NpzFile wrapper).
        dist = scipy.sparse.load_npz(self.clusterDirectory + 'PairwiseDistances.npz')
    else:
        self.coords[:, 0] = self.coords[:, 0] * timeScale
        X = NearestNeighbors(radius=treeR, metric='minkowski', p=2, algorithm='kd_tree',
                             leaf_size=leafNum, n_jobs=24).fit(self.coords)
        dist = X.radius_neighbors_graph(self.coords, neighborR, 'distance')
        scipy.sparse.save_npz(self.clusterDirectory + 'PairwiseDistances.npz', dist)
    label = DBSCAN(eps=eps, min_samples=minPts, metric='precomputed',
                   n_jobs=24).fit_predict(dist)
    np.save(self.clusterDirectory + 'Labels.npy', label)
def test_dbscan_sparse_precomputed():
    D = pairwise_distances(X)
    nn = NearestNeighbors(radius=0.9).fit(X)
    D_sparse = nn.radius_neighbors_graph(mode="distance")
    # Ensure it is sparse not merely on diagonals:
    assert D_sparse.nnz < D.shape[0] * (D.shape[0] - 1)
    core_sparse, labels_sparse = dbscan(D_sparse, eps=0.8, min_samples=10,
                                        metric="precomputed")
    core_dense, labels_dense = dbscan(D, eps=0.8, min_samples=10,
                                      metric="precomputed")
    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
def test_cnn_sparse_precomputed(include_self):
    D = pairwise_distances(X)
    nn = NearestNeighbors(radius=0.9).fit(X)
    X_ = X if include_self else None
    D_sparse = nn.radius_neighbors_graph(X=X_, mode="distance")
    # Ensure it is sparse not merely on diagonals:
    assert D_sparse.nnz < D.shape[0] * (D.shape[0] - 1)
    labels_sparse = commonnn(D_sparse, eps=0.8, min_samples=5, metric="precomputed")
    labels_dense = commonnn(D, eps=0.8, min_samples=5, metric="precomputed")
    assert_array_equal(labels_dense, labels_sparse)
def add_exclusion(self, object_1, object_2, margin=1, distance_metric='l1'):
    """Add exclusion constraint edges forcing object_1 and object_2 not to overlap."""
    if object_1 == object_2:
        raise ValueError('object_1 and object_2 are the same object.')
    # Get nodeids.
    object_1_nodeids = self.get_nodeids(object_1)
    object_2_nodeids = self.get_nodeids(object_2)
    # Get points.
    object_1_points = object_1.sample_points
    object_2_points = object_2.sample_points
    # TODO Avoid searching for neighbours if possible.
    if margin == 0 and object_1_points.shape == object_2_points.shape and np.all(
            object_1_points == object_2_points):
        # Add exclusion edges.
        self.add_pairwise_terms(object_1_nodeids, object_2_nodeids, 0, 0, 0,
                                self.inf_cap)
    else:
        # Create nearest neighbors tree.
        neigh = NearestNeighbors(radius=margin, metric=distance_metric)
        neigh.fit(object_1_points.reshape(-1, object_1_points.shape[-1]))
        # Create neighbors graph.
        # Get connectivity for all within margin.
        radius_neighbors_graph = neigh.radius_neighbors_graph(
            object_2_points.reshape(-1, object_2_points.shape[-1]))
        # Get indices for all combined graph connections.
        indices_2, indices_1, _ = sparse.find(radius_neighbors_graph)
        if indices_1.size == 0:
            # If there are no neighbors, return.
            return
        # Take the ids to add pairwise terms to.
        object_1_ids = np.take(object_1_nodeids, indices_1)
        object_2_ids = np.take(object_2_nodeids, indices_2)
        # Add exclusion edges.
        self.add_pairwise_terms(object_1_ids, object_2_ids, 0, 0, 0,
                                self.inf_cap)
def add_containment(self, outer_object, inner_object, margin=1, distance_metric='l1'):
    """Add containment constraint edges forcing inner_object to be within outer_object."""
    if outer_object == inner_object:
        raise ValueError('outer_object and inner_object are the same object.')
    if margin is None:
        return
    # Get nodeids.
    outer_nodeids = self.get_nodeids(outer_object)
    inner_nodeids = self.get_nodeids(inner_object)
    # Get points.
    outer_points = outer_object.sample_points
    inner_points = inner_object.sample_points
    # TODO Avoid searching for neighbours if possible.
    if margin == 0 and outer_points.shape == inner_points.shape and np.all(
            outer_points == inner_points):
        # Add containment edges.
        self.add_pairwise_terms(outer_nodeids, inner_nodeids, 0, self.inf_cap, 0, 0)
    else:
        # Create nearest neighbors tree.
        neigh = NearestNeighbors(radius=margin, metric=distance_metric)
        neigh.fit(outer_points.reshape(-1, outer_points.shape[-1]))
        # Create neighbors graph.
        # Get connectivity for all within margin.
        radius_neighbors_graph = neigh.radius_neighbors_graph(
            inner_points.reshape(-1, inner_points.shape[-1]))
        # Get indices for all combined graph connections.
        indices_2, indices_1, _ = sparse.find(radius_neighbors_graph)
        outer_ids = np.take(outer_nodeids, indices_1)
        inner_ids = np.take(inner_nodeids, indices_2)
        # Add containment edges.
        self.add_pairwise_terms(outer_ids, inner_ids, 0, self.inf_cap, 0, 0)
def positional_sparse_matrix(row_size, col_size, radius):
    # p=2 means Euclidean distance
    nn = NearestNeighbors(radius=radius, p=2)
    rows = np.arange(row_size)
    cols = np.arange(col_size)
    # Build a (row_size, col_size, 2) grid of (row, col) coordinates.
    mesh_grid = np.empty((row_size, col_size, 2), dtype=np.intp)
    mesh_grid[..., 0] = rows[:, None]
    mesh_grid[..., 1] = cols
    # print("MESH_GRID : \n", mesh_grid)
    mesh_grid = mesh_grid.reshape(-1, 2)
    # print("MESH_GRID_RESHAPE : \n", mesh_grid)
    nn.fit(mesh_grid)
    pos_sparse_matrix = nn.radius_neighbors_graph(mesh_grid, radius=radius,
                                                  mode='distance')
    return pos_sparse_matrix
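# Quick sanity check for positional_sparse_matrix on a small grid (illustrative
# only): with radius=1 the centre pixel of a 3x3 grid is connected to its four
# 4-neighbours at distance 1.
pos = positional_sparse_matrix(3, 3, radius=1)
print(pos.shape)                 # (9, 9)
print(pos[4].toarray())          # row for pixel (1, 1): ones at its 4-neighbours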
def visualize_graph_partition(points, k=-1):
    """
    Separate visualization of k normalized clusters created by the region growing
    algorithm, where the colors show the best partition of the nearest neighbor
    graph in each cluster.
    """
    # import graph and partition
    import networkx as nx
    import community
    # region growing
    curvatures, normals = estimate_curvature_and_normals(points, n_neighbors=50)
    cluster_mask = region_growing(points, normals, curvatures, min_points=100)
    # get cluster indices of interest, ignoring outliers
    cluster_idx = np.unique(cluster_mask)
    cluster_idx = cluster_idx[(cluster_idx != -1)]
    cluster_idx = cluster_idx[:k] if k > -1 else cluster_idx
    # create visualizer
    vis = Visualizer(background=(1, 1, 1))
    # visualize each cluster
    for i in cluster_idx:
        # get points of current cluster
        cluster = normalize_pc(points[(cluster_mask == i)])
        # build adjacency matrix for neighbor graph
        tree = NearestNeighbors(algorithm='ball_tree', radius=0.03, n_jobs=-1).fit(cluster)
        graph_matrix = tree.radius_neighbors_graph(cluster, mode='distance')
        # build graph from sparse adjacency matrix and compute best partition
        G = nx.from_scipy_sparse_matrix(graph_matrix)
        partition = community.best_partition(G)
        # color each partition in a random color
        community_colors, colors = {}, np.empty_like(cluster)
        for j, c in partition.items():
            # get random color
            if c not in community_colors:
                community_colors[c] = np.random.uniform(0, 1, size=3)
            # set color
            colors[j, :] = community_colors[c]
        # add to visualizer
        vis.add_by_features(cluster, colors)
    # show
    vis.run()
def make_ConformationalNetwork(self):
    neigh = NearestNeighbors(radius=1, metric='chebyshev')
    neigh.fit(self.ijk_centers)
    net_centers = nx.from_scipy_sparse_matrix(neigh.radius_neighbors_graph())
    del neigh
    net_rotations = nx.Graph()
    net_rotations.add_nodes_from(range(self.num_rotations))
    for ii in range(self.num_rotations):
        neighs = hp.get_all_neighbours(self.nside, ii, nest=False)
        neighs[neighs == -1] = 0
        net_rotations.add_edges_from(zip(np.full(neighs.shape[0], ii), neighs))
        del neighs
    net = nx.cartesian_product(net_centers, net_rotations)
    del net_rotations, net_centers
    return net
def createClusters(self, minMagnitude=0, treeR=22, leafNum=190, neighborR=22,
                   timeScale=10, eps=18, minPts=90, delta=1.0):
    # self.loadVideo()
    self.loadHMM()
    self._print('Created ' + self.labeledCoordsFile)
    coords = self.obj.retDBScanMatrix(minMagnitude)
    np.save(self.localClusterDirectory + 'RawCoords.npy', coords)
    # subprocess.call(['rclone', 'copy', self.localClusterDirectory + 'RawCoordsFile.npy', self.cloudClusterDirectory], stderr=self.fnull)
    # Sort data by time for batch processing, throwing out 4th column (magnitude).
    sortData = coords[coords[:, 0].argsort()][:, 0:3]
    # delta is the number of hours to batch together; can be a fraction.
    numBatches = int(sortData[-1, 0] / delta / 3600) + 1
    # Scale time so that time distances between transitions are comparable to spatial differences.
    sortData[:, 0] = sortData[:, 0] * timeScale
    labels = np.zeros(shape=(sortData.shape[0], 1), dtype=sortData.dtype)
    # Calculate clusters in batches to avoid RAM overuse.
    curr_label = 0  # Labels for each batch start from zero - need to offset these
    print('Calculating clusters in ' + str(numBatches) + ' total batches', file=sys.stderr)
    for i in range(numBatches):
        print('Batch: ' + str(i), file=sys.stderr)
        # Have to deal with rescaling of time. 3600 = # seconds in an hour
        min_time, max_time = i * delta * timeScale * 3600, (i + 1) * delta * timeScale * 3600
        hour_range = np.where((sortData[:, 0] > min_time) & (sortData[:, 0] <= max_time))
        min_index, max_index = hour_range[0][0], hour_range[0][-1] + 1
        X = NearestNeighbors(radius=treeR, metric='minkowski', p=2, algorithm='kd_tree',
                             leaf_size=leafNum, n_jobs=24).fit(sortData[min_index:max_index])
        dist = X.radius_neighbors_graph(sortData[min_index:max_index], neighborR, 'distance')
        sub_label = DBSCAN(eps=eps, min_samples=minPts, metric='precomputed',
                           n_jobs=24).fit_predict(dist)
        new_labels = int(sub_label.max()) + 1
        sub_label[sub_label != -1] += curr_label
        labels[min_index:max_index, 0] = sub_label
        curr_label += new_labels
    sortData[:, 0] = sortData[:, 0] / timeScale
    self.labeledCoords = np.concatenate((sortData, labels), axis=1).astype('int64')
    np.save(self.localClusterDirectory + self.labeledCoordsFile, self.labeledCoords)
    subprocess.call(['rclone', 'copy', self.localClusterDirectory + self.labeledCoordsFile,
                     self.cloudClusterDirectory], stderr=self.fnull)
def hcg_cluster( features, timestamps=None, linkage='ward', distance_thr: tuple = (0.5, 0.5), timestamp_thr: float = 0, edge_thr: float = 0.7 ): dist_neigh = NearestNeighbors(radius=distance_thr[0]) dist_neigh.fit(features) dist_graph = dist_neigh.radius_neighbors_graph(mode='connectivity') if timestamps is not None and timestamp_thr > 0: time_neigh = NearestNeighbors(radius=timestamp_thr) time_neigh.fit(timestamps) time_graph = time_neigh.radius_neighbors_graph(mode='connectivity') dist_graph = dist_graph.multiply(time_graph) dist_graph.eliminate_zeros() dist_graph = nx.from_scipy_sparse_matrix(dist_graph) components = nx.connected_components(dist_graph) clusters = [] clustering = None if distance_thr[1] > 0: clustering = AgglomerativeClustering( n_clusters=None, distance_threshold=distance_thr[1], affinity='euclidean', linkage=linkage ) for component in components: n = len(component) if n > 3 and clustering is not None: sub_graph = dist_graph.subgraph(component).copy() component = list(component) if sub_graph.size() >= edge_thr * (n * (n - 1) / 2): clusters.append(component) else: clustering.fit(features[component]) clusters.extend(cluster_labels(clustering.labels_, component)) else: clusters.append(list(component)) return clusters # def cluster_features( # features, # timestamps=None, # distance_thr: float = 0.5, # timestamp_thr: float = 0, # grouped: bool = True # ): # dist_neigh = NearestNeighbors(radius=distance_thr) # dist_neigh.fit(features) # dist_graph = dist_neigh.radius_neighbors_graph(mode='connectivity') # # if timestamps is not None and timestamp_thr > 0: # time_neigh = NearestNeighbors(radius=timestamp_thr) # time_neigh.fit(timestamps) # time_graph = time_neigh.radius_neighbors_graph(mode='connectivity') # dist_graph = dist_graph.multiply(time_graph) # dist_graph.eliminate_zeros() # # clustering = AgglomerativeClustering( # n_clusters=None, # distance_threshold=distance_thr, # affinity='euclidean', # linkage='ward', # connectivity=dist_graph # ) # # clustering.fit(features) # # labels = clustering.labels_ # # if not grouped: # return labels, None # # labels_set = set(labels) - {-1} # # clusters = {label: [] for label in labels_set} # outliers = [] # # for ind, label in enumerate(labels): # if label != -1: # clusters[label].append(ind) # else: # outliers.append([ind]) # # return labels, list(clusters.values()) + outliers # import networkx as nx # from .clustering import cluster_features as aro_cluster # # def cluster_features( # features, # timestamps=None, # distance_thr: float = 0.5, # timestamp_thr: float = 0, # grouped: bool = True # ): # n_features = len(features) # dist_neigh = NearestNeighbors(radius=1.13) # dist_neigh.fit(features) # dist_graph = dist_neigh.radius_neighbors_graph(mode='connectivity') # # if timestamps is not None and timestamp_thr > 0: # time_neigh = NearestNeighbors(radius=timestamp_thr) # time_neigh.fit(timestamps) # time_graph = time_neigh.radius_neighbors_graph(mode='connectivity') # dist_graph = dist_graph.multiply(time_graph) # dist_graph.eliminate_zeros() # # dist_graph = nx.from_scipy_sparse_matrix(dist_graph) # # components = nx.connected_components(dist_graph) # # clusters = [] # # for component in components: # clusters.append(list(component)) # n = len(component) # if n > 3: # sub_graph = dist_graph.subgraph(component).copy() # component = list(component) # if sub_graph.size() == (n * (n - 1) / 2): # clusters.append(component) # else: # sub_clusters = aro_cluster( # features[component], # distance_thr=distance_thr, # n_neighbors=min(n, 
10) # ) # for sub_cluster in sub_clusters: # clusters.append([component[i] for i in sub_cluster]) # else: # clusters.append(list(component)) # # return clusters # def cluster_features( # features, # timestamps=None, # distance_thr: float = 0.5, # timestamp_thr: float = 0, # grouped: bool = True # ): # n_features = len(features) # dist_neigh = NearestNeighbors(radius=distance_thr) # dist_neigh.fit(features) # dist_graph = dist_neigh.radius_neighbors_graph(mode='connectivity') # # if timestamps is not None and timestamp_thr > 0: # time_neigh = NearestNeighbors(radius=timestamp_thr) # time_neigh.fit(timestamps) # time_graph = time_neigh.radius_neighbors_graph(mode='connectivity') # dist_graph = dist_graph.multiply(time_graph) # dist_graph.eliminate_zeros() # # _, labels = connected_components( # csgraph=dist_graph, # directed=False, # return_labels=True # ) # # clusters_high = cluster_labels(labels, range(0, n_features)) # # # Second level clustering # clusters_low = [] # clustering = AgglomerativeClustering( # n_clusters=None, # distance_threshold=1.2, # affinity='euclidean', # linkage='single' # ) # # # clustering = OPTICS( # # max_eps=distance_thr, # # min_samples=3, # # metric='euclidean' # # ) # # for cluster in clusters_high: # if len(cluster) > 3: # clustering.fit(features[cluster]) # clusters_low.extend( # cluster_labels(clustering.labels_, cluster) # ) # else: # clusters_low.append(cluster) # # return clusters_low # import networkx as nx # # def cluster_features( # features, # timestamps=None, # distance_thr: float = 0.5, # timestamp_thr: float = 0, # grouped: bool = True # ): # n_features = len(features) # dist_neigh = NearestNeighbors(radius=distance_thr) # dist_neigh.fit(features) # dist_graph = dist_neigh.radius_neighbors_graph(mode='connectivity') # # if timestamps is not None and timestamp_thr > 0: # time_neigh = NearestNeighbors(radius=timestamp_thr) # time_neigh.fit(timestamps) # time_graph = time_neigh.radius_neighbors_graph(mode='connectivity') # dist_graph = dist_graph.multiply(time_graph) # dist_graph.eliminate_zeros() # # dist_graph = nx.from_scipy_sparse_matrix(dist_graph) # # components = nx.connected_components(dist_graph) # # clusters = [] # # for component in components: # if len(component) > 3: # sub_graph = dist_graph.subgraph(component).copy() # sub_components = nx.k_edge_components(sub_graph, k=2) # clusters.extend([ # list(sub_component) for sub_component in sub_components # ]) # else: # clusters.append(list(component)) # # return clusters
def _createClusters(self): print(' Creating clusters from HMM transitions,,Time: ' + str(datetime.datetime.now())) # Load in HMM data hmmObj = HA(self.videoObj.localHMMFile) # Convert into coords object and save it coords = hmmObj.retDBScanMatrix(self.projFileManager.minMagnitude) np.save(self.videoObj.localRawCoordsFile, coords) # Run data in batches to avoid RAM override sortData = coords[coords[:, 0].argsort( )][:, 0: 3] #sort data by time for batch processing, throwing out 4th column (magnitude) numBatches = int( sortData[-1, 0] / self.projFileManager.delta / 3600 ) + 1 #delta is number of hours to batch together. Can be fraction. sortData[:, 0] = sortData[:, 0] * self.projFileManager.timeScale #scale time so that time distances between transitions are comparable to spatial differences labels = np.zeros(shape=(sortData.shape[0], 1), dtype=sortData.dtype) # Initialize labels #Calculate clusters in batches to avoid RAM overuse curr_label = 0 #Labels for each batch start from zero - need to offset these print(' ' + str(numBatches) + ' total batches. On batch: ', end='', flush=True) for i in range(numBatches): print(str(i) + ',', end='', flush=True) min_time, max_time = i * self.projFileManager.delta * self.projFileManager.timeScale * 3600, ( i + 1 ) * self.projFileManager.delta * self.projFileManager.timeScale * 3600 # Have to deal with rescaling of time. 3600 = # seconds in an hour hour_range = np.where((sortData[:, 0] > min_time) & (sortData[:, 0] <= max_time)) min_index, max_index = hour_range[0][0], hour_range[0][-1] + 1 X = NearestNeighbors(radius=self.projFileManager.treeR, metric='minkowski', p=2, algorithm='kd_tree', leaf_size=self.projFileManager.leafNum, n_jobs=24).fit(sortData[min_index:max_index]) dist = X.radius_neighbors_graph(sortData[min_index:max_index], self.projFileManager.neighborR, 'distance') sub_label = DBSCAN(eps=self.projFileManager.eps, min_samples=self.projFileManager.minPts, metric='precomputed', n_jobs=self.workers).fit_predict(dist) new_labels = int(sub_label.max()) + 1 sub_label[sub_label != -1] += curr_label labels[min_index:max_index, 0] = sub_label curr_label += new_labels print() # Concatenate and save information sortData[:, 0] = sortData[:, 0] / self.projFileManager.timeScale labeledCoords = np.concatenate((sortData, labels), axis=1).astype('int64') np.save(self.videoObj.localLabeledCoordsFile, labeledCoords) print(' Concatenating and summarizing clusters,,Time: ' + str(datetime.datetime.now())) df = pd.DataFrame(labeledCoords, columns=['T', 'X', 'Y', 'LID']) clusterData = df.groupby('LID').apply( lambda x: pd.Series({ 'projectID': self.lp.projectID, 'videoID': self.videoObj.baseName, 'N': x['T'].count(), 't': int(x['T'].mean()), 'X': int(x['X'].mean()), 'Y': int(x['Y'].mean()), 't_span': int(x['T'].max() - x['T'].min()), 'X_span': int(x['X'].max() - x['X'].min()), 'Y_span': int(x['Y'].max() - x['Y'].min()), 'ManualAnnotation': 'No', 'ManualLabel': '', 'ClipCreated': 'No', 'DepthChange': np.nan, })) clusterData['TimeStamp'] = clusterData.apply( lambda row: (self.videoObj.startTime + datetime.timedelta(seconds=int(row.t))), axis=1) clusterData['ClipName'] = clusterData.apply(lambda row: '__'.join([ str(x) for x in [ self.lp.projectID, self.videoObj.baseName, row.name, row.N, row .t, row.X, row.Y ] ]), axis=1) # Identify clusters to make clips for #self._print('Identifying clusters to make clips for', log = False) delta_xy = self.projFileManager.delta_xy delta_t = self.projFileManager.delta_t smallClips, clipsCreated = 0, 0 # keep track of clips with 
small number of pixel changes for row in clusterData.sample(n=clusterData.shape[0]).itertuples( ): # Randomly go through the dataframe LID, N, t, x, y, time = row.Index, row.N, row.t, row.X, row.Y, row.TimeStamp if x - delta_xy < 0 or x + delta_xy >= self.videoObj.height or y - delta_xy < 0 or y + delta_xy >= self.videoObj.width: continue # Check temporal compatability (part a): elif self.videoObj.framerate * t - delta_t < 0 or LID == -1: continue # Check temporal compatability (part b): elif time < self.lightsOnTime or time > self.lightsOffTime: continue else: clusterData.loc[clusterData.index == LID, 'ClipCreated'] = 'Yes' if N < self.projFileManager.smallLimit: if smallClips > self.videoObj.nManualLabelClips / 20: continue smallClips += 1 if clipsCreated < self.videoObj.nManualLabelClips: clusterData.loc[clusterData.index == LID, 'ManualAnnotation'] = 'Yes' clipsCreated += 1 clusterData.to_csv(self.videoObj.localLabeledClustersFile, sep=',') self.clusterData = clusterData
} relvdm_backbone.load_rel_vdms_pickle(sample, subset=no_charge) relvdm_backbone.set_rel_vdm_bb_coords() relvdm_backbone.set_rois_rot_trans(sample) relvdm_backbone.set_rel_vdm_tags(sample) print('moving vdMs') relvdm_backbone.move_rel_vdms(sample) print('removing clashing vdMs') relvdm_backbone.remove_clash(sample) relvdm_backbone.reshape_ifgs() all_ifgs = functools.reduce(lambda a, b: np.vstack((a, b)), [val for val in relvdm_backbone._ifgs.values()]) print('finding hotspots preproline carbonyl') nbrs = NearestNeighbors(metric='euclidean', radius=1.0, algorithm='kd_tree') nbrs.fit(all_ifgs) adj_mat = nbrs.radius_neighbors_graph(all_ifgs) print('clustering...') mems, cents = combs.Cluster.greedy_cluster_pc(adj_mat, pc=0.8) all_resnum_chid = functools.reduce(lambda a, b: np.vstack((a, b)), [ np.array([tuple(key)] * len(val), dtype=object) for key, val in relvdm_backbone._vdm_tags.items() ]) all_vdm_tags = functools.reduce(lambda a, b: np.vstack( (a, b)), [val for val in relvdm_backbone._vdm_tags.values()]) all_resn = functools.reduce(lambda a, b: np.hstack((a, b)), [val for val in relvdm_backbone._resn.values()]) all_type = functools.reduce(lambda a, b: np.hstack((a, b)), [val for val in relvdm_backbone._type.values()]) all_indices = functools.reduce(lambda a, b: np.hstack(
def cc_regions(self, selected_components=None, n_neighbors=None, radius=None, expansion_factor=None): embedding = self.embedding ''' if embedding.shape[1] == 2: selected_components = [0,1] elif selected_components is not None: if type(selected_components) == list: selected_components = selected_components else: raise ValueError("selected_components parameter must be od type list.") else: raise ValueError("'DimensionReductionRegions' was initialized with more than two components. Provide selected_components as list.") ''' #embedding = embedding[:, selected_components] if n_neighbors is None: n_neighbors = 0 if radius is None: radius = 0 if expansion_factor is not None: embedding = embedding**expansion_factor * np.sign(embedding) nn = NearestNeighbors(n_neighbors=n_neighbors, radius=radius) nn.fit(embedding) knn = nn.kneighbors_graph() knn_nr_components, knn_cc_labels = connected_components(knn, directed=False) print("Number of knn components: %i" % (knn_nr_components)) knn_cc = [ tuple(np.where(knn_cc_labels == lbl)[0]) for lbl in range(max(knn_cc_labels) + 1) ] rnn = nn.radius_neighbors_graph() rnn_nr_components, rnn_cc_labels = connected_components(rnn, directed=False) print("Number of rnn components: %i" % (rnn_nr_components)) rnn_cc = [ tuple(np.where(rnn_cc_labels == lbl)[0]) for lbl in range(max(rnn_cc_labels) + 1) ] regions_idx = list(set(knn_cc + rnn_cc)) print("Total number of components: %i" % (len(regions_idx))) regions = [] for idx in regions_idx: idx = list(idx) img = create_empty_img(self.height, self.width, False) img[(self.gy[idx], self.gx[idx])] = 1 regions.append(img) regions = np.array(regions) regions_dict = {} for idx, region in enumerate(regions): regions_dict[idx] = region region_sum = np.sum(regions, axis=0) return region_sum, regions_dict
]).T # data = np.array([rawdata['X'], rawdata['Y'], rawdata['Z']]).T # get the true labels and group names labels_true = np.array(rawdata['row']) groups = np.array(rawdata['group']) # convert data to 32 bit data = np.array(data, dtype=np.float32) # get nearest neighbours printlog('Work out nearest neighbours...') start = time.time() neigh = NearestNeighbors(radius=20, metric='euclidean') neigh.fit(data) neighbours = neigh.radius_neighbors_graph(data, mode='distance') end = time.time() printlog('\t Time taken = {0} s'.format(end - start)) # ---------------------------------------------------------------------- # DBscan example from : # scikit-learn.org/stable/modules/clustering.html#dbscan # http://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan # .html#sphx-glr-auto-examples-cluster-plot-dbscan-py printlog("Calculating clustering using 'DBSCAN'...") start = time.time() sargs = dict(eps=10, min_samples=50, metric='precomputed') db = DBSCAN(**sargs).fit(neighbours) end = time.time() # get mask and labels
def add_layered_exclusion(self, object_1, object_2, margin=1, distance_metric='l1', reduce_redundancy=True): """Add exclsion constraint edges forcing object_1 and object_2 not to overlap. This function assumes a layered boundary cost has been applied to the objects. """ if object_1 == object_2: raise ValueError('object_1 and object_2 is the same object.') # Get nodeids. object_1_nodeids = self.get_nodeids(object_1) object_2_nodeids = self.get_nodeids(object_2) # Get points. object_1_points = object_1.sample_points object_2_points = object_2.sample_points # TODO Avoid searching for neigbours if possible. # Create nearest neighbors tree. neigh = NearestNeighbors(radius=margin, metric=distance_metric) neigh.fit(object_1_points.reshape(-1, object_1_points.shape[-1])) # Create neighbors graph. # Get connectivity for all within margin. radius_neighbors_graph = neigh.radius_neighbors_graph( object_2_points.reshape(-1, object_2_points.shape[-1])) # Get indices for all combined graph connections. indices_2, indices_1, _ = sparse.find(radius_neighbors_graph) if indices_1.size == 0: # If there are no neighbors, return. return # Remove redundany neighbors. if reduce_redundancy: # Find sizes of the columns. column_size_1 = np.product(object_1_nodeids.shape[1:]) column_size_2 = np.product(object_2_nodeids.shape[1:]) # Get the column indices of the node indices. column_indices_2 = indices_2 % column_size_2 # Get first unique combination of comlumns. _, unique_column_indices = np.unique([indices_1, column_indices_2], return_index=True, axis=1) # Filter indices to have only one edge between each column. indices_1 = indices_1[unique_column_indices] indices_2 = indices_2[unique_column_indices] # Get the column indices of the node indices. column_indices_1 = indices_1 % column_size_1 # Get first unique combination of comlumns. _, unique_column_indices = np.unique([column_indices_1, indices_2], return_index=True, axis=1) # Filter indices to have only one edge between each column. indices_1 = indices_1[unique_column_indices] indices_2 = indices_2[unique_column_indices] # Add exclusion terms. self.add_pairwise_terms(object_1_nodeids.flat[indices_1], object_2_nodeids.flat[indices_2], 0, 0, 0, self.inf_cap)
def gcg_cluster(features, timestamps=None, distance_thr: float = 0.5, timestamp_thr: float = 0, edge_thr: float = 0.5): n_features = len(features) dist_neigh = NearestNeighbors(radius=distance_thr) dist_neigh.fit(features) dist_graph = dist_neigh.radius_neighbors_graph(mode='distance') if timestamps is not None and timestamp_thr > 0: time_neigh = NearestNeighbors(radius=timestamp_thr) time_neigh.fit(timestamps) time_graph = time_neigh.radius_neighbors_graph(mode='connectivity') dist_graph = dist_graph.multiply(time_graph) dist_graph.eliminate_zeros() dist_graph = nx.from_scipy_sparse_matrix(dist_graph) # Dict of nodes that have not a cluster label assigned no_labels = {u: u for u in range(n_features)} # Init the clusters list with a random one-element cluster clusters = {0: [0]} # TODO: really do this random? # Remove node 0 form no labels list del no_labels[0] # Current growing cluster cur_label = 0 node_boundary = nx.algorithms.boundary.node_boundary while True: cur_cluster = clusters[cur_label] boundary = node_boundary(dist_graph, cur_cluster, no_labels.keys()) cluster_grow = False if len(boundary): scores = [] for node in boundary: node_neighs = dist_graph[node] inside_edges = [ node_neighs[u]['weight'] for u in node_neighs if u in cur_cluster ] scores.append((len(inside_edges), sum(inside_edges), node)) best_score = sorted(scores, key=lambda s: (-s[0], s[1]))[0] if best_score[0] >= int(edge_thr * len(cur_cluster) + 1): best_node = best_score[2] clusters[cur_label].append(best_node) del no_labels[best_node] cluster_grow = True if not len(no_labels): break if not cluster_grow: # Increase current cluster label cur_label += 1 # Take the next no labeled node and seed a new cluster with it next_node = no_labels.popitem()[0] clusters[cur_label] = [next_node] if not len(no_labels): break return list(clusters.values())
class Kernel(object): """ Class abstracting the evaluation of kernel functions on the dataset. Parameters ---------- type : string, optional Type of kernel to construct. Currently the only option is 'gaussian', but more will be implemented. epsilon : string, optional Method for choosing the epsilon. Currently, the only options are to provide a scalar (epsilon is set to the provided scalar) or 'bgh' (Berry, Giannakis and Harlim). k : int, optional Number of nearest neighbors over which to construct the kernel. neighbor_params : dict or None, optional Optional parameters for the nearest Neighbor search. See scikit-learn NearestNeighbors class for details. metric : string, optional Distance metric to use in constructing the kernel. This can be selected from any of the scipy.spatial.distance metrics, or a callable function returning the distance. metric_params : dict or None, optional Optional parameters required for the metric given. """ def __init__(self, kernel_type='gaussian', epsilon='bgh', k=64, neighbor_params=None, metric='euclidean', metric_params=None, nearest_neighbors_algo='knearest'): self.type = kernel_type self.epsilon = epsilon self.k = k self.metric = metric self.metric_params = metric_params if neighbor_params is None: neighbor_params = {} self.neighbor_params = neighbor_params self.d = None self.epsilon_fitted = None self.nearest_neighbors_algo = nearest_neighbors_algo def fit(self, X): """ Fits the kernel to the data X, constructing the nearest neighbor tree. Parameters ---------- X : array-like, shape (n_query, n_features) Data upon which to fit the nearest neighbor tree. Returns ------- self : the object itself """ self.k0 = min(self.k, np.shape(X)[0]) self.data = X # Construct Nearest Neighbor Tree with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message= "Parameter p is found in metric_params. The corresponding parameter from __init__ is ignored." ) self.neigh = NearestNeighbors(n_neighbors=self.k, radius=1.0, metric=self.metric, metric_params=self.metric_params, **self.neighbor_params) self.neigh.fit(X) self.choose_optimal_epsilon() return self def compute(self, Y=None): """ Computes the sparse kernel matrix. Parameters ---------- Y : array-like, shape (n_query, n_features), optional. Data against which to calculate the kernel values. If not provided, calculates against the data provided in the fit. Returns ------- K : array-like, shape (n_query_X, n_query_Y) Values of the kernel matrix. """ if Y is None: Y = self.data # perform k nearest neighbour search on X and Y and construct sparse matrix if self.nearest_neighbors_algo == 'knearest': K = self.neigh.kneighbors_graph(Y, mode='distance') elif self.nearest_neighbors_algo == 'radius': radius = 10.0 * np.sqrt(self.epsilon_fitted) K = self.neigh.radius_neighbors_graph(Y, radius, mode='distance') else: raise ValueError( 'Did not understand neares neighbors method. Choose from knearest or radius.' ) # retrieve all nonzero elements and apply kernel function to it v = K.data if (self.type == 'gaussian'): K.data = np.exp(-v**2 / self.epsilon_fitted) else: raise ("Error: Kernel type not understood.") return K def choose_optimal_epsilon(self, epsilon=None): """ Chooses the optimal value of epsilon and automatically detects the dimensionality of the data. Parameters ---------- epsilon : string or scalar, optional Method for choosing the epsilon. Currently, the only options are to provide a scalar (epsilon is set to the provided scalar) or 'bgh' (Berry, Giannakis and Harlim). 
Returns ------- self : the object itself """ if epsilon is None: epsilon = self.epsilon # Choose Epsilon according to method provided. if isinstance(epsilon, numbers.Number): # if user provided. self.epsilon_fitted = epsilon return self elif epsilon == 'bgh': # Berry, Giannakis Harlim method. dists = self.neigh.kneighbors_graph(self.data, mode='distance').data sq_distances = dists**2 if (self.metric != 'euclidean'): # TODO : replace with call to scipy metrics. warnings.warn( 'The BGH method for choosing epsilon assumes a euclidean metric. However, the metric being used is %s. Proceed at your own risk...' % self.metric) self.epsilon_fitted, self.d = choose_optimal_epsilon_BGH( sq_distances) else: raise ValueError( "Method for automatically choosing epsilon was given as %s, but this was not recognized" % epsilon) return self
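# A minimal usage sketch for the Kernel class above. The data are illustrative,
# and the sketch assumes the module-level helper choose_optimal_epsilon_BGH used
# by choose_optimal_epsilon is importable exactly as in the original module.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.standard_normal((200, 3))

kernel = Kernel(kernel_type='gaussian', epsilon='bgh', k=32,
                nearest_neighbors_algo='radius')
kernel.fit(X_demo)          # builds the neighbor tree and fits epsilon via 'bgh'
K = kernel.compute()        # sparse Gaussian kernel matrix on the fitted data
print(K.shape, kernel.epsilon_fitted, kernel.d)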
def add_layered_containment(self, outer_object, inner_object, min_margin=0, max_margin=None, distance_metric='l2', reduce_redundancy=True): """Add layered containment.""" if outer_object == inner_object: raise ValueError( 'outer_object and inner_object is the same object.') # Get nodeids. outer_nodeids = self.get_nodeids(outer_object) inner_nodeids = self.get_nodeids(inner_object) # Get points. outer_points = outer_object.sample_points inner_points = inner_object.sample_points if outer_points.ndim != inner_points.ndim or outer_points.shape[ -1] != inner_points.shape[-1]: raise ValueError( 'outer_object points and inner_object points must have the same number of dimensions and the same size last dimension.' ) # Check if the points are identical. if outer_points.shape == inner_points.shape and np.all( outer_points == inner_points): # If shapes and points match, this is the fast way. if max_margin is not None and outer_nodeids.shape[0] > max_margin: # Add max margin edges. if max_margin == 0: self.add_pairwise_terms(inner_nodeids, outer_nodeids, 0, self.inf_cap, 0, 0) else: self.add_pairwise_terms(inner_nodeids[:-max_margin], outer_nodeids[max_margin:], 0, self.inf_cap, 0, 0) self.add_pairwise_terms(inner_nodeids[-max_margin:], outer_nodeids[-1], 0, self.inf_cap, 0, 0) if min_margin is not None and outer_nodeids.shape[0] > min_margin: # Add min margin edges. if min_margin == 0: self.add_pairwise_terms(outer_nodeids, inner_nodeids, 0, self.inf_cap, 0, 0) else: self.add_pairwise_terms(outer_nodeids[min_margin:], inner_nodeids[:-min_margin], 0, self.inf_cap, 0, 0) self.add_pairwise_terms(outer_nodeids[:min_margin], inner_nodeids[0], 0, self.inf_cap, 0, 0) # Else we need to find nodes to connect. else: # Create flattened arrays of points. outer_points_flat = outer_points.reshape(-1, outer_points.shape[-1]) inner_points_flat = inner_points.reshape(-1, inner_points.shape[-1]) # Find sizes of the columns. outer_columns_size = np.product(outer_nodeids.shape[1:]) inner_column_size = np.product(inner_nodeids.shape[1:]) # Create nearest neighbors tree. neigh = NearestNeighbors(metric=distance_metric) neigh.fit(outer_points_flat) if max_margin is not None: # Find direction of points. outer_point_gradients = np.gradient(outer_points, axis=0) inner_point_gradients = np.gradient(inner_points, axis=0) # Move inner points in the direction of the gradient. # The distance moved is the max margin. inner_points_moved = inner_points + \ (max_margin * inner_point_gradients / np.sqrt(np.sum(inner_point_gradients**2, axis=-1)[..., np.newaxis])) inner_points_moved_flat = inner_points_moved.reshape( -1, outer_points.shape[-1]) # Find the 4 nearest neighbours for moved points. This should be enough. radius_neighbors_graph = neigh.kneighbors_graph( inner_points_moved_flat, n_neighbors=4, mode='connectivity') # Get indices for all combined graph connections. inner_indices, outer_indices, _ = sparse.find( radius_neighbors_graph) if outer_indices.size > 0: # The following code filters out redundant terms before adding terms and tries to ensure a meaningful max margin. # Find distances between neighbours. # Create mask for neighbours futher than max margin away. distance_mask = np.sum( (outer_points_flat[outer_indices] - inner_points_flat[inner_indices])**2, axis=-1) > max_margin**2 # Only keep edges longer than max margin. outer_indices = outer_indices[distance_mask] inner_indices = inner_indices[distance_mask] # Find angles between gradients. 
angles = np.einsum( 'ij,ij->i', outer_point_gradients.reshape( -1, outer_points.shape[-1])[outer_indices], inner_point_gradients.reshape( -1, outer_points.shape[-1])[inner_indices]) angle_mask = angles > 0 # Only keep edges where the gradients are points in the same direction. outer_indices = outer_indices[angle_mask] inner_indices = inner_indices[angle_mask] angles = angles[angle_mask] # Reverse indices to get bigger indices first. outer_indices = np.flip(outer_indices) inner_indices = np.flip(inner_indices) angles = np.flip(angles) # Get the column indices of the node indices. inner_column_indices = inner_indices % inner_column_size # Get first unique combination of comlumns. _, unique_column_indices = np.unique( [outer_indices, inner_column_indices], return_index=True, axis=1) # Filter indices to have only one from an outer node to each inner column. outer_indices = outer_indices[unique_column_indices] inner_indices = inner_indices[unique_column_indices] angles = angles[unique_column_indices] # Get sort indices, large angles (dot product) first. angle_sort = np.argsort(-angles) # Sort indices. outer_indices = outer_indices[angle_sort] inner_indices = inner_indices[angle_sort] # Get the outer connection with biggest dot product. _, unique_column_indices = np.unique(outer_indices, return_index=True) # Only keep the connections to the outer node with the largest angle. outer_indices = outer_indices[unique_column_indices] inner_indices = inner_indices[unique_column_indices] outer_ids = np.take(outer_nodeids, outer_indices) inner_ids = np.take(inner_nodeids, inner_indices) # Add containment edges. self.add_pairwise_terms(inner_ids, outer_ids, 0, self.inf_cap, 0, 0) if min_margin is not None: radius_neighbors_graph = neigh.radius_neighbors_graph( inner_points_flat, radius=min_margin) # Removed K-nieghbors search for now. # Adding them may improve stability when resolution is low. # kneighbors_graph = neigh.kneighbors_graph(inner_points_flat, n_neighbors=2, mode='connectivity') # radius_neighbors_graph += kneighbors_graph # Get indices for all combined graph connections. inner_indices, outer_indices, _ = sparse.find( radius_neighbors_graph) if outer_indices.size > 0: # Remove redundany neighbors. if reduce_redundancy: # The following code filters out redundant terms before adding terms. # Get the column indices of the node indices. inner_column_indices = inner_indices % inner_column_size # Get first unique combination of comlumns. _, unique_column_indices = np.unique( [outer_indices, inner_column_indices], return_index=True, axis=1) # Filter indices to have only one edge between each column. outer_indices = outer_indices[unique_column_indices] inner_indices = inner_indices[unique_column_indices] # Reverse indices. outer_indices = np.flip(outer_indices) inner_indices = np.flip(inner_indices) # Get the column indices of the node indices. outer_column_indices = outer_indices % outer_columns_size # Get first unique combination of comlumns. _, unique_column_indices = np.unique( [outer_column_indices, inner_indices], return_index=True, axis=1) # Filter indices to have only one edge between each column. outer_indices = outer_indices[unique_column_indices] inner_indices = inner_indices[unique_column_indices] outer_ids = np.take(outer_nodeids, outer_indices) inner_ids = np.take(inner_nodeids, inner_indices) # Add containment edges. self.add_pairwise_terms(outer_ids, inner_ids, 0, self.inf_cap, 0, 0)
def createClusters(self, minMagnitude=0, treeR=22, leafNum=190, neighborR=22, timeScale=10, eps=18, minPts=90, delta=1.0, Nclips=200, delta_xy=100, delta_t=60, smallLimit=500): self.loadVideo() self.loadHMM() self._print('Clustering HMM transitions using DBScan') coords = self.obj.retDBScanMatrix(minMagnitude) np.save(self.localClusterDirectory + 'RawCoords.npy', coords) #subprocess.call(['rclone', 'copy', self.localClusterDirectory + 'RawCoordsFile.npy', self.cloudClusterDirectory], stderr = self.fnull) sortData = coords[coords[:, 0].argsort( )][:, 0: 3] #sort data by time for batch processing, throwing out 4th column (magnitude) numBatches = int( sortData[-1, 0] / delta / 3600 ) + 1 #delta is number of hours to batch together. Can be fraction. sortData[:, 0] = sortData[:, 0] * timeScale #scale time so that time distances between transitions are comparable to spatial differences labels = np.zeros(shape=(sortData.shape[0], 1), dtype=sortData.dtype) #Calculate clusters in batches to avoid RAM overuse curr_label = 0 #Labels for each batch start from zero - need to offset these print('Calculating clusters in ' + str(numBatches) + ' total batches', file=sys.stderr) for i in range(numBatches): print('Batch: ' + str(i), file=sys.stderr) min_time, max_time = i * delta * timeScale * 3600, ( i + 1 ) * delta * timeScale * 3600 # Have to deal with rescaling of time. 3600 = # seconds in an hour hour_range = np.where((sortData[:, 0] > min_time) & (sortData[:, 0] <= max_time)) min_index, max_index = hour_range[0][0], hour_range[0][-1] + 1 X = NearestNeighbors(radius=treeR, metric='minkowski', p=2, algorithm='kd_tree', leaf_size=leafNum, n_jobs=24).fit(sortData[min_index:max_index]) dist = X.radius_neighbors_graph(sortData[min_index:max_index], neighborR, 'distance') sub_label = DBSCAN(eps=eps, min_samples=minPts, metric='precomputed', n_jobs=24).fit_predict(dist) new_labels = int(sub_label.max()) + 1 sub_label[sub_label != -1] += curr_label labels[min_index:max_index, 0] = sub_label curr_label += new_labels sortData[:, 0] = sortData[:, 0] / timeScale self.labeledCoords = np.concatenate((sortData, labels), axis=1).astype('int64') np.save(self.localClusterDirectory + self.labeledCoordsFile, self.labeledCoords) subprocess.call([ 'rclone', 'copy', self.localClusterDirectory + self.labeledCoordsFile, self.cloudClusterDirectory ], stderr=self.fnull) uniqueLabels = set(self.labeledCoords[:, 3]) uniqueLabels.remove(-1) print( str(self.labeledCoords[self.labeledCoords[:, 3] != -1].shape[0]) + ' HMM transitions assigned to ' + str(len(uniqueLabels)) + ' clusters', file=sys.stderr) df = pd.DataFrame(self.labeledCoords, columns=['T', 'X', 'Y', 'LID']) clusterData = df.groupby('LID').apply( lambda x: pd.Series({ 'projectID': self.projectID, 'videoID': self.baseName, 'N': x['T'].count(), 't': int(x['T'].mean()), 'X': int(x['X'].mean()), 'Y': int(x['Y'].mean()), 't_span': int(x['T'].max() - x['T'].min()), 'X_span': int(x['X'].max() - x['X'].min()), 'Y_span': int(x['Y'].max() - x['Y'].min()), 'ManualAnnotation': 'No', 'ManualLabel': '', 'MLLabel': '' })) clusterData['X_depth'] = df.apply( lambda row: (self.transM[0][0] * row.X + self.transM[0][1] * row.Y + self.transM[0][2]) / (self.transM[2][0] * row.X + self.transM[2][1] * row.Y + self. transM[2][2]), axis=1) clusterData['Y_depth'] = df.apply( lambda row: (self.transM[1][0] * row.X + self.transM[1][1] * row.Y + self.transM[1][2]) / (self.transM[2][0] * row.X + self.transM[2][1] * row.Y + self. 
transM[2][2]), axis=1) clusterData.to_csv(self.localClusterDirectory + self.clusterFile, sep='\t') clusterData = pd.read_csv(self.localClusterDirectory + self.clusterFile, sep='\t', header=0) # Identify rows for manual labeling manualClips = 0 smallClips = 0 cap = cv2.VideoCapture(self.localMasterDirectory + self.videofile) framerate = cap.get(cv2.CAP_PROP_FPS) for row in clusterData.sample(n=clusterData.shape[0]).itertuples(): if manualClips > Nclips: break LID, N, t, x, y = row.LID, row.N, row.t, row.X, row.Y if x - delta_xy < 0 or x + delta_xy >= self.height or y - delta_xy < 0 or y + delta_xy >= self.width or LID == -1 or framerate * t - delta_t < 0 or framerate * t + delta_t >= self.frames: continue if smallClips > Nclips / 20: continue clusterData.loc[clusterData.LID == LID, 'ManualAnnotation'] = 'Yes' manualClips += 1 if N < smallLimit: smallClips += 1 clusterData.to_csv(self.localClusterDirectory + self.clusterFile, sep='\t') subprocess.call([ 'rclone', 'sync', self.localClusterDirectory, self.cloudClusterDirectory ], stderr=self.fnull) self.clusterData = clusterData self.createClusterClips()
def dbscan(adata, basis='tsne', n_comps=2, eps=None, min_samples=None, n_jobs=None, copy=False): """Cluster cells using DBSCAN This wraps sklearn.cluster.DBSCAN and shares most of the parameters. Parameters ---------- eps : float or None, optional The maximum distance between samples for being considered as in the same neighborhood. Clusters are "grown" from samples that have more than min_samples points in their neighborhood. Increasing eps therefore allows clusters to spread over wider regions. min_samples : int or None, optional The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. n_jobs : int (default: None) Number of threads to use. Defaults to sett.n_jobs. copy : bool (default: False) References ---------- Ester et al. (1996), "A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise". In: Proceedings of the 2nd International Conference on Knowledge Discovery and Data Mining, Portland, OR, AAAI Press, pp. 226-231. Pedregosa et al. (2011) ... """ logg.m('starting DBSCAN', r=True) adata = adata.copy() if copy else adata if basis not in {'tsne', 'pca'}: raise ValueError('`basis` needs to be "tsne" or "pca"') if 'X_tsne' in adata.smp and basis == 'tsne': X = adata.smp['X_tsne'][:, :n_comps] elif 'X_pca' in adata.smp and basis == 'pca': X = adata.smp['X_pca'][:, :n_comps] else: raise ValueError('Run {} first.'.format(basis)) n_jobs = sett.n_jobs if n_jobs is None else n_jobs range_1 = np.max(X[:, 0]) - np.min(X[:, 0]) range_2 = np.max(X[:, 1]) - np.min(X[:, 1]) if eps is None: if n_comps == 2: avg_area_per_point = (range_1 * range_2 / X.shape[0]) logg.m('... the "drawing range" is', range_1, '×', range_2, 'with the average area per point', avg_area_per_point) eps = 1.7 * np.sqrt(avg_area_per_point) else: eps = 5 if min_samples is None: min_samples = 30 logg.m('... using eps =', eps, end=', ') logg.m('min_samples =', min_samples, end=', ') logg.m('basis =', basis, end=', ') logg.m('n_comps =', basis, end=', ') logg.m('n_jobs =', n_jobs) #, end=', ') logg.m('increase `min_samples` if you find too many clusters', v='hint') logg.m('reduce eps if "everything is connected"', v='hint') from sklearn.cluster import DBSCAN from sklearn.neighbors import NearestNeighbors nn = NearestNeighbors(n_neighbors=min_samples, n_jobs=n_jobs) nn.fit(X) D = nn.radius_neighbors_graph(radius=eps, mode='distance') db = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=n_jobs, metric='precomputed').fit(D) labels = db.labels_ dont_know = labels == -1 labels = labels.astype(str) labels[dont_know] = '?' # loop_over_labels = (label for label in np.unique(labels) if label >= 0) adata.smp['dbscan_groups'] = labels from natsort import natsorted adata.add['dbscan_groups_order'] = np.array(natsorted(np.unique(labels)))[:-1] logg.m(' finished', t=True, end=' ') logg.m('and found', len(np.unique(labels))-1, 'clusters, added\n' ' "dbscan_groups", the cluster labels (adata.smp)\n' ' "dbscan_groups_order", the unique cluster labels (adata.add)') return adata if copy else None
print "dimension : ", len(dimen) X = np.array(X_tmp) from sklearn.neighbors import NearestNeighbors from sklearn.neighbors import LSHForest for n in range(2, 10): print "[[[[[" + str(n) + "]]]]]" start = time.time() # nbrs = NearestNeighbors(n_neighbors=n, algorithm='ball_tree').fit(X) # print nbrs.kneighbors_graph(X).toarray() neigh = NearestNeighbors(n_neighbors=n) neigh.fit(X) a = neigh.radius_neighbors_graph(X).toarray() print a # a = neigh.kneighbors_graph(X).toarray() pc.dump(a, open("knn" + str(n) + ".txt", "w")) end = time.time() print "NearestNeighbors", end - start start = time.time() lshf = LSHForest(n_neighbors=n, random_state=10000) lshf.fit(X) # distances, indices = lshf.kneighbors(X, n_neighbors=n) # print lshf.kneighbors_graph(X).toarray() a = lshf.radius_neighbors_graph(X).toarray() print a pc.dump(a, open("lsh" + str(n) + ".txt", "w")) end = time.time()
relvdm_amino.rel_vdm_path = '/Users/npolizzi/Projects/combs/results/amino/rel_vdms_hbond/20171025/' relvdm_amino.load_rel_vdms_pickle(sample) relvdm_amino.set_rel_vdm_bb_coords() relvdm_amino.set_rois_rot_trans(sample) relvdm_amino.set_rel_vdm_tags(sample) print('moving vdMs') relvdm_amino.move_rel_vdms(sample) print('removing clashing vdMs') relvdm_amino.remove_clash(sample) relvdm_amino.reshape_ifgs() all_ifgs_amino = functools.reduce(lambda a, b: np.vstack((a, b)), [val for val in relvdm_amino._ifgs.values()]) print('finding hotspots amino') nbrs = NearestNeighbors(metric='euclidean', radius=1.1, algorithm='kd_tree') nbrs.fit(all_ifgs_amino) adj_mat = nbrs.radius_neighbors_graph(all_ifgs_amino) print('clustering...') mems, cents = combs.analysis.cluster.greedy_cluster_pc(adj_mat, pc=0.7) all_resnum_chid = functools.reduce(lambda a, b: np.vstack((a, b)), [ np.array([tuple(key)] * len(val), dtype=object) for key, val in relvdm_amino._vdm_tags.items() ]) all_vdm_tags = functools.reduce(lambda a, b: np.vstack( (a, b)), [val for val in relvdm_amino._vdm_tags.values()]) all_resn = functools.reduce(lambda a, b: np.hstack((a, b)), [val for val in relvdm_amino._resn.values()]) all_type = functools.reduce(lambda a, b: np.hstack((a, b)), [val for val in relvdm_amino._type.values()]) all_indices = functools.reduce(lambda a, b: np.hstack( (a, b)), [val for val in relvdm_amino._indices.values() if len(val) > 0])
class LSAD(object): def __init__(self,name,hop,dim_list,K): assert len(dim_list) >= 2, 'specify input output dimension' self.name = name self.dim_list = dim_list self.data = None self.neigh = None self.r = None with tf.variable_scope(self.name): self.center = tf.placeholder(tf.float32, name="center") self.X = tf.placeholder(tf.float32, [None,dim_list[0] ]) self.expanded_X = tf.expand_dims(self.X,2) #print(self.expanded_X.get_shape()) self.A = tf.sparse_placeholder(tf.float32) self.P_Indices = tf.placeholder(tf.int32, [None]) self.U_Indices = tf.placeholder(tf.int32, [None]) self.Ws =[] self.layers = [] self.layers.append(self.X) with tf.variable_scope("MLP_Configuration"): for idx,dim in enumerate(dim_list): if idx==0: continue W = tf.get_variable('%s'%(idx) ,shape=[dim_list[idx-1],dim],initializer=tf.contrib.layers.xavier_initializer()) layer = tf.matmul(self.layers[idx-1],W) bias= tf.get_variable('b%s'%(idx) ,shape=[dim],initializer=tf.contrib.layers.xavier_initializer()) bias= tf.Variable(tf.zeros([dim])) layer = tf.nn.bias_add(layer,bias) if idx < len(dim_list)-1 and idx >= 1 : layer = tf.nn.relu(layer) self.Ws.append(tf.reduce_sum(tf.square(W))) self.layers.append(layer) self.vectors = [tf.nn.l2_normalize(self.layers[-1],axis=1)] if K <=0: for i in range(hop): self.vectors.append(tf.sparse_tensor_dense_matmul(self.A,self.vectors[i])) for i in range(1,hop+1): self.vectors[i] = tf.nn.l2_normalize(self.vectors[i],axis=1) else: for i in range(hop): self.vectors.append(tf.sparse_tensor_dense_matmul(self.A,self.vectors[i])) # (optional) # for i in range(1,hop+1): # self.vectors[i] = tf.nn.l2_normalize(self.vectors[i],axis=1) self.P = [] self.U = [] for i in range(hop+1): self.P.append( tf.nn.embedding_lookup(self.vectors[i], self.P_Indices) ) self.U.append( tf.nn.embedding_lookup(self.vectors[i], self.U_Indices) ) self.expanded_P = [] self.expanded_U = [] for i in range(hop+1): self.expanded_P.append( tf.expand_dims(self.P[i],1)) self.expanded_U.append( tf.expand_dims(self.U[i],1)) assert hop >= 1, "hop is less then 1" self.P_ref = self.expanded_P[1] self.U_ref = self.expanded_U[1] for h in range(2,hop+1): self.P_ref = tf.concat((self.P_ref,self.expanded_P[h]),1) self.U_ref = tf.concat((self.U_ref,self.expanded_U[h]),1) self.P_loss = tf.reduce_mean( ( tf.reduce_mean( (tf.reduce_sum(self.expanded_P[0]*self.P_ref,2)),1) ) ) self.U_loss = tf.reduce_mean( -tf.reduce_mean( (tf.reduce_sum(self.expanded_U[0]*self.U_ref,2)),1)) self.scores = -tf.reduce_mean(tf.reduce_sum(self.expanded_U[0]*self.U_ref,2),1)#1d def build_optimizer(self,learning_rate): self.loss = self.P_loss + self.U_loss self.optimizer = tf.train.AdamOptimizer(learning_rate) self.trainStep = self.optimizer.minimize(self.loss) def TPA(self,data,params,method='closest-K'): print("Converting a set of data points to simplicial complexes") if method=='persistent-homology': return self.rGraph(X=data,params=params) elif method=='closest-K': return self.kGraph(X=data,params=params) def rGraph(self,X,params): X= np.array(X) TRAIN = params['TRAIN'] if TRAIN ==True: rip = ripser(X) zero_dimensional_homology = rip['dgms'][0][:,1][:-1] one_dimensional_homology = rip['dgms'][1][:,1][:-1]#optional mu = np.mean(zero_dimensional_homology) sigma = np.std(zero_dimensional_homology) self.r = mu+2.0*sigma self.data = X self.neigh = NearestNeighbors(radius=self.r, n_jobs=-1) self.neigh.fit(X) train_graph = self.neigh.radius_neighbors_graph(X,self.r) coo = train_graph.tocoo() return np.mat([coo.row, coo.col]).transpose() else: coo = 
self.neigh.radius_neighbors_graph(X,self.r).tocoo() return np.mat([coo.row+len(self.data), coo.col]).transpose() def kGraph(self,X,params): X= np.array(X) k = params['K'] TRAIN = params['TRAIN'] if TRAIN == True: self.data = X self.neigh = NearestNeighbors(n_neighbors=(k+1),n_jobs=-1) self.neigh.fit(X) train_graph = self.neigh.kneighbors_graph(X) coo = train_graph.tocoo() return np.mat([coo.row, coo.col]).transpose() else: coo = self.neigh.kneighbors_graph(X).tocoo() return np.mat([coo.row+len(self.data), coo.col ]).transpose()