def obj(wb, solver):
    ''' The objective function used by fmin
    '''
    # obtain w and b
    Khidden = solver._Khidden
    dim = solver._dim
    whidden = wb[:Khidden*dim].reshape((dim, Khidden))
    tree = solver._regargs['tree']
    w = mathutil.dot(whidden, tree)
    b = wb[Khidden*dim:]
    # pred is a matrix of size [num_datalocal, K]
    mathutil.dot(solver._X, w, out = solver._pred)
    solver._pred += b
    # compute the loss function
    flocal, gpred = solver.loss(solver._Y, solver._pred, solver._weight,
                                **solver._lossargs)
    mathutil.dot(mathutil.dot(solver._X.T, gpred), tree.T,
                 out = solver._glocal[:Khidden*dim].reshape(dim, Khidden))
    solver._glocal[Khidden*dim:] = gpred.sum(axis=0)
    # add regularization term, but keep in mind that we have multiple nodes
    freg, greg = solver.reg(whidden, **solver._regargs)
    flocal += solver._num_data * solver._gamma * freg / mpi.SIZE
    solver._glocal[:Khidden*dim] += solver._num_data * solver._gamma \
            * greg.ravel() / mpi.SIZE
    # do mpi reduction
    mpi.barrier()
    f = mpi.COMM.allreduce(flocal)
    mpi.COMM.Allreduce(solver._glocal, solver._g)
    return f, solver._g
def obj(wb, solver):
    """ The objective function used by fmin
    """
    # obtain w and b
    K = solver._K
    dim = solver._dim
    w = wb[:K * dim].reshape((dim, K))
    b = wb[K * dim:]
    # pred is a matrix of size [num_datalocal, K]
    pred = mathutil.dot(solver._X, w)
    pred += b
    # compute the loss function
    flocal, gpred = solver.loss(solver._Y, pred, solver._weight,
                                **solver._lossargs)
    glocal = np.empty(wb.shape)
    glocal[:K * dim] = mathutil.dot(solver._X.T, gpred).flat
    glocal[K * dim:] = gpred.sum(axis=0)
    # add regularization term, but keep in mind that we have multiple nodes
    freg, greg = solver.reg(w, **solver._regargs)
    flocal += solver._num_data * solver._gamma * freg / mpi.SIZE
    glocal[:K * dim] += solver._num_data * solver._gamma / mpi.SIZE \
            * greg.ravel()
    # do mpi reduction
    mpi.barrier()
    f = mpi.COMM.allreduce(flocal)
    g = np.empty(glocal.shape, dtype=glocal.dtype)
    mpi.COMM.Allreduce(glocal, g)
    return f, g
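# A minimal usage sketch (not part of the original code base): an objective of
# this form returns (f, g), which is exactly what scipy's L-BFGS driver
# expects, with the solver instance passed through `args`.  The names
# `wb_init` and `solver` are placeholders for an initialized parameter vector
# and a configured solver object.  In the MPI setting every rank must run the
# same optimization loop so that the collective calls inside obj() match up.
from scipy.optimize import fmin_l_bfgs_b

wb_opt, f_opt, info = fmin_l_bfgs_b(obj, wb_init, args=(solver,), maxfun=1000)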
def testBarrier(self):
    import time
    # sleep for a while, and resume
    time.sleep(mpi.RANK)
    mpi.barrier()
    self.assertTrue(True)
def train(self, dataset, num_patches,
          exhaustive = False, ratio_per_image = 0.1):
    """ train the convolutional layer

    Note that we do not train the first element (patch extractor), and stop
    when we see the spatial pooler. There might be some post processing
    components after the pooler, but they should not require any training
    (if they do, you may want to move them to the next layer).
    """
    if len(self) == 0:
        return
    logging.debug("Training convolutional layer...")
    if not isinstance(self[0], Extractor):
        raise ValueError, \
                "The first component should be a patch extractor!"
    patches = self[0].sample(dataset, num_patches, self._previous_layer,
                             exhaustive, ratio_per_image)
    if len(self) == 1 or isinstance(self[1], Pooler):
        logging.debug('Nothing to be trained in this layer.')
        return
    # actually train the model
    for i in range(1, len(self)):
        component = self[i]
        mpi.barrier()
        logging.debug("Training %s..." % (component.__class__.__name__))
        component.train(patches)
        if i == len(self) - 1 or isinstance(self[i+1], Pooler):
            # if we've reached a pooler, stop training
            break
        else:
            # prepare the next component's input
            patches = component.process(patches)
    logging.debug("Training convolutional layer done.")
def __init__(self, root_folder, extensions, prefetch = False,
             target_size = None, max_size = None, min_size = None,
             center_crop = None):
    """ Initialize from a two-layer storage

    Input:
        root_folder: the root that contains the data. Under root_folder
            there should be a list of folders, under which there should be
            a list of files.
        extensions: the list of extensions that should be used to filter the
            files. Should be like ['png', 'jpg']. It's case insensitive.
        prefetch: if True, the images are prefetched to avoid disk read. If
            you have a large number of images, prefetch would require a lot
            of memory.
        target_size, max_size, min_size, center_crop: see manipulate() for
            details.
    """
    super(TwoLayerDataset, self).__init__()
    if mpi.agree(not os.path.exists(root_folder)):
        raise OSError, "The specified folder does not exist."
    logging.debug('Loading from %s' % (root_folder,))
    if type(extensions) is str:
        extensions = [extensions]
    extensions = set(extensions)
    if mpi.is_root():
        # get files first
        files = glob.glob(os.path.join(root_folder, '*', '*'))
        # select those that fit the extensions
        files = [f for f in files if any([f.lower().endswith(ext)
                                          for ext in extensions])]
        logging.debug("A total of %d images." % (len(files)))
        # get raw labels
        labels = [os.path.split(os.path.split(f)[0])[1] for f in files]
        classnames = list(set(labels))
        # sort so we get a reasonable class order
        classnames.sort()
        name2val = dict(zip(classnames, range(len(classnames))))
        labels = [name2val[label] for label in labels]
    else:
        files = None
        classnames = None
        labels = None
    mpi.barrier()
    self._rawdata = mpi.distribute_list(files)
    self._data = self._rawdata
    self._prefetch = prefetch
    self._target_size = target_size
    self._max_size = max_size
    self._min_size = min_size
    self._center_crop = center_crop
    if target_size is not None:
        self._dim = tuple(target_size) + (3,)
    else:
        self._dim = False
    self._channels = 3
    if prefetch:
        self._data = [self._read(idx) for idx in range(len(self._data))]
    self._label = mpi.distribute_list(labels)
    self._classnames = mpi.COMM.bcast(classnames)
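# A hedged usage sketch, assuming the images are laid out as
# root_folder/classname/image.jpg; the path is only a placeholder, and the
# size() call assumes the usual dataset interface seen elsewhere in this
# code base.
dataset = TwoLayerDataset('/path/to/images', ['jpg', 'png'],
                          prefetch=False, target_size=(32, 32))
logging.debug('local shard size: %d' % dataset.size())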
def average_precision(Y, pred):
    """Average Precision for binary classification
    """
    # since we need to compute the precision recall curve, we have to
    # compute this on the root node.
    Y = mpi.COMM.gather(Y)
    pred = mpi.COMM.gather(pred)
    if mpi.is_root():
        Y = np.hstack(Y)
        pred = np.hstack(pred)
        precision, recall, _ = metrics.precision_recall_curve(Y == 1, pred)
        ap = metrics.auc(recall, precision)
    else:
        ap = None
    mpi.barrier()
    return mpi.COMM.bcast(ap)
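# Minimal usage sketch: `Y_local` and `pred_local` are placeholders for the
# binary labels (positives encoded as 1) and prediction scores hosted on each
# MPI node.  The call is collective, and every rank receives the broadcast
# result.
ap = average_precision(Y_local, pred_local)
if mpi.is_root():
    print 'average precision: %f' % ap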
def obj(wb, solver):
    ''' The objective function used by fmin
    '''
    # obtain w and b
    K = solver._K
    dim = solver._dim
    w = wb[:K*dim].reshape((dim, K))
    b = wb[K*dim:]
    # pred is a matrix of size [num_datalocal, K]
    mathutil.dot(solver._X, w, out = solver._pred)
    solver._pred += b
    # compute the loss function
    if solver.gpredcache:
        flocal, gpred = solver.loss(solver._Y, solver._pred, solver._weight,
                                    solver._gpred, solver._gpredcache,
                                    **solver._lossargs)
    else:
        flocal, gpred = solver.loss(solver._Y, solver._pred, solver._weight,
                                    **solver._lossargs)
    mathutil.dot(solver._X.T, gpred,
                 out = solver._glocal[:K*dim].reshape(dim, K))
    solver._glocal[K*dim:] = gpred.sum(axis=0)
    # we should normalize them with the number of data
    flocal /= solver._num_data
    solver._glocal /= solver._num_data
    # add the regularization term, but keep in mind that we have multiple
    # nodes, so we only carry it out on root to make sure we only add one
    # regularization term
    if mpi.is_root():
        freg, greg = solver.reg(w, **solver._regargs)
        flocal += solver._gamma * freg
        solver._glocal[:K*dim] += solver._gamma * greg.ravel()
    # do mpi reduction
    mpi.barrier()
    f = mpi.COMM.allreduce(flocal)
    mpi.COMM.Allreduce(solver._glocal, solver._g)
    ######### DEBUG PART ##############
    if np.isnan(f):
        # check all the components to see what went wrong.
        print 'rank %s: isnan X: %d' % (mpi.RANK, np.any(np.isnan(solver._X)))
        print 'rank %s: isnan Y: %d' % (mpi.RANK, np.any(np.isnan(solver._Y)))
        print 'rank %s: isnan flocal: %d' % (mpi.RANK, np.any(np.isnan(flocal)))
        print 'rank %s: isnan pred: %d' % (mpi.RANK, np.any(np.isnan(solver._pred)))
        print 'rank %s: isnan w: %d' % (mpi.RANK, np.any(np.isnan(w)))
        print 'rank %s: isnan b: %d' % (mpi.RANK, np.any(np.isnan(b)))
    return f, solver._g
def demo_kmeans():
    """A simple kmeans demo
    """
    print 'Running kmeans demo'
    data = np.vstack((np.random.randn(500, 2) + 1,
                      np.random.randn(500, 2) - 1))
    centers, labels, inertia = kmeans(data, 8, n_init=1, max_iter=5)
    print 'inertia =', inertia
    print 'centers = \n', centers
    try:
        from matplotlib import pyplot
        if mpi.is_root():
            pyplot.scatter(data[:, 0], data[:, 1], c=labels)
            pyplot.show()
        mpi.barrier()
    except Exception:
        print 'cannot show figure. will simply pass'
        pass
def omp1_maximize(X, labels, val, k):
    '''Learn the new OMP dictionary from the given activations

    Input:
        X: the data matrix, each row being a datum. Note that X is the local
            data hosted in each MPI node.
        labels: a vector of size X.shape[0], containing the indices of the
            dictionary entry that is active, one for each datum.
        val: a vector of size X.shape[0], the activation value of the
            corresponding entry.
        k: an int specifying the dictionary size.
    Output:
        centroids: a matrix of size [k, X.shape[1]] containing the new
            dictionary.
    '''
    dim = X.shape[1]
    centroids_local = np.zeros((k, dim))
    centroids_local_nonempty = np.zeros(k, dtype=np.int)
    # loop over the classes
    for q in range(k):
        center_mask = (labels == q)
        if np.any(center_mask):
            centroids_local[q] = np.dot(val[center_mask], X[center_mask])
            centroids_local_nonempty[q] = 1
    centroids_nonempty = np.zeros(k, dtype=np.int)
    mpi.barrier()
    mpi.COMM.Allreduce(centroids_local_nonempty, centroids_nonempty)
    # now, for those empty centroids, we need to randomly restart them
    for q in range(k):
        if centroids_nonempty[q] == 0 and mpi.is_president():
            centroids_local[q] = X[np.random.randint(X.shape[0])]
    # collect all centroids
    centroids = np.zeros((k, dim))
    mpi.COMM.Reduce(centroids_local, centroids)
    centroids /= (np.sqrt(np.sum(centroids**2, axis=1))
                  + np.finfo(np.float64).eps)[:, np.newaxis]
    # broadcast to remove any numerical instability
    mpi.COMM.Bcast(centroids)
    return centroids
def train(self, dataset, num_patches):
    """ train the convolutional layer

    Note that we do not train the first element (patch extractor), and stop
    when we see the spatial pooler. There might be some post processing
    components after the pooler, but they should not require any training
    (if they do, you may want to move them to the next layer).
    """
    logging.debug("Training convolutional layer...")
    if not isinstance(self[0], Extractor):
        raise ValueError, \
                "The first component should be a patch extractor!"
    patches = self[0].sample(dataset, num_patches, self._previous_layer)
    for component in self[1:]:
        mpi.barrier()
        logging.debug("Training %s..." % (component.__class__.__name__))
        if isinstance(component, Pooler):
            # if we've reached the pooler, stop training
            break
        patches = component.train(patches)
    logging.debug("Training convolutional layer done.")
def process_dataset(self, dataset, as_list=False, as_2d=False):
    """Processes a whole dataset and returns a numpy ndarray

    Input:
        dataset: the input dataset.
        as_list: if True, return a list. This applies when the output has
            different sizes for each image. Default False.
        as_2d: if True, return a matrix where each image corresponds to a
            row in the matrix. Default False.
    """
    # check if we want to use buffer
    if self._fixed_size:
        convbuffer = [None] * (len(self) + 1)
    else:
        convbuffer = None
    total = dataset.size_total()
    logging.debug("Processing a total of %s images" % (total,))
    timer = util.Timer()
    if as_list:
        data = [self.process(dataset.image(i), convbuffer=convbuffer)
                for i in range(dataset.size())]
    else:
        # we assume that each image leads to the same feature size
        temp = self.process(dataset.image(0), as_vector=as_2d)
        logging.debug("Output feature shape: %s" % (str(temp.shape)))
        data = np.empty((dataset.size(),) + temp.shape)
        data[0] = temp
        size = dataset.size()
        timer = util.Timer()
        for i in range(1, size):
            data[i] = self.process(dataset.image(i), as_vector=as_2d,
                                   convbuffer=convbuffer)
            # report local progress
            if (i * 10 / size) != ((i - 1) * 10 / size):
                logging.debug("rank %d: %d percent. elapsed %s" %
                              (mpi.RANK, i * 100 / size, timer.total()))
    mpi.barrier()
    logging.debug("Feature extraction took %s" % timer.total())
    return data
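# A hedged usage sketch: `conv` stands for a trained convolutional pipeline
# (the name mirrors its use further below) and `dataset` for any dataset
# object exposing size(), size_total() and image(i); both names are
# placeholders.  Each rank obtains the features of its local shard.
features = conv.process_dataset(dataset, as_2d=True)
logging.debug('local feature matrix shape: %s' % (str(features.shape),))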
def omp1_predict(X, centroids):
    ''' omp1 prediction

    This function does one-dimensional orthogonal matching pursuit.
    The returned values are simply going to be the indices and inner products.
    '''
    idx = np.empty(X.shape[0], dtype=np.int)
    val = np.empty(X.shape[0])
    # in case we are going to deal with a large matrix, we buffer dots to
    # avoid multiple memory new / deletes.
    dots = np.empty((min(_MINIBATCH, X.shape[0]), centroids.shape[0]),
                    dtype=X.dtype)
    for start in range(0, X.shape[0], _MINIBATCH):
        end = min(start + _MINIBATCH, X.shape[0])
        batchsize = end - start
        mathutil.dot(X[start:end], centroids.T, out=dots[:batchsize])
        np.abs(dots, out=dots)
        idx[start:end] = np.argmax(dots[:batchsize], axis=1)
        val[start:end] = dots[range(batchsize), idx[start:end]]
    mpi.barrier()
    return idx, val
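# A minimal sketch of how the two OMP-1 steps are usually alternated
# (assumptions: `X` is the local data matrix on each MPI node and `centroids`
# has been initialized, e.g. with unit-normalized random rows; the iteration
# count of 10 is arbitrary).
for _ in range(10):
    idx, val = omp1_predict(X, centroids)
    centroids = omp1_maximize(X, idx, val, centroids.shape[0])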
def demo_read(root):
    from iceberk import visualize
    vis = visualize.PatchVisualizer()
    print 'Loading training data...'
    traindata = STL10Dataset(root, 'train')
    print 'My training data size:', traindata.size()
    print 'Loading testing data...'
    testdata = STL10Dataset(root, 'test')
    print 'My testing data size:', testdata.size()
    print 'Loading unlabeled data...'
    unlabeleddata = STL10Dataset(root, 'unlabeled')
    print 'My unlabeled data size:', unlabeleddata.size()
    if mpi.is_root():
        vis.pyplot.figure()
        vis.show_multiple(traindata.raw_data()[:25])
        vis.pyplot.title('Sample training images.')
        vis.pyplot.figure()
        vis.show_multiple(testdata.raw_data()[:25])
        vis.pyplot.title('Sample testing images.')
        vis.pyplot.figure()
        vis.show_multiple(unlabeleddata.raw_data()[:25])
        vis.pyplot.title('Sample unlabeled images.')
        vis.pyplot.show()
    mpi.barrier()
def obj(param, solver):
    """The objective function used by fmin
    """
    w = param[:-1]
    b = param[-1]
    # prediction is a vector
    pred = np.dot(solver._X, w) + b
    # call the loss
    flocal, gpred = solver.loss(solver._Y, pred, solver._weight,
                                **solver._lossargs)
    # get the gradient for both w and b
    glocal = np.empty(param.shape)
    glocal[:-1] = np.dot(gpred, solver._X)
    glocal[-1] = gpred.sum()
    # add the regularization term
    freg, greg = solver.reg(w, **solver._regargs)
    flocal += solver._num_data * solver._gamma / mpi.SIZE * freg
    glocal[:-1] += solver._num_data * solver._gamma / mpi.SIZE * greg
    # do mpi reduction
    mpi.barrier()
    f = mpi.COMM.allreduce(flocal)
    g = np.empty(glocal.shape)
    mpi.COMM.Allreduce(glocal, g)
    return f, g
from iceberk import mpi
import logging
import numpy as np
import time

mpi.root_log_level(logging.INFO)
# just a large matrix
a = np.random.rand(1000, 12800)
a_local = np.random.rand(1000, 12800)
rank = mpi.RANK

logging.info('Testing mpi size %d' % mpi.SIZE)
mpi.barrier()
start = time.time()
mpi.COMM.Allreduce(a_local, a)
logging.info('Allreduce big speed: %f s' % (time.time() - start))

mpi.barrier()
start = time.time()
for i in xrange(a.shape[0]):
    mpi.COMM.Allreduce(a_local[i], a[i])
logging.info('Allreduce small speed: %f s' % (time.time() - start))
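# A speculative third measurement (not in the original script): reduce the
# matrix in a handful of contiguous row chunks, which usually sits between the
# two extremes above because the per-call latency is amortized over larger
# messages.  The chunk count of 10 is an arbitrary choice.
mpi.barrier()
start = time.time()
step = a.shape[0] // 10
for i in xrange(0, a.shape[0], step):
    mpi.COMM.Allreduce(a_local[i:i + step], a[i:i + step])
logging.info('Allreduce chunked speed: %f s' % (time.time() - start))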
std.resize(np.prod(regions_pooled.shape[1:-1]), regions_pooled.shape[-1])
std = std.mean(axis=0)
std_order = np.argsort(std)
# now, compute the within-class std
regions_pooled_view = regions_pooled.reshape(
        regions_pooled.shape[0],
        np.prod(regions_pooled.shape[1:-1]),
        regions_pooled.shape[-1])
within_std_local = regions_pooled_view.var(axis=1)
print within_std_local.shape
within_std = np.sqrt(mathutil.mpi_mean(within_std_local))
within_std_order = np.argsort(within_std)
std_comparison = within_std / (std + 1e-10)
std_comparison_order = np.argsort(std_comparison)
if mpi.is_root():
    pyplot.figure()
    visualize.show_multiple(conv[-2].dictionary[std_order])
    pyplot.savefig("codes_std_ordered.pdf")
    pyplot.figure()
    visualize.show_multiple(conv[-2].dictionary[within_std_order])
    pyplot.savefig("codes_within_std_ordered.pdf")
    pyplot.figure()
    visualize.show_multiple(conv[-2].dictionary[std_comparison_order])
    pyplot.savefig("codes_std_comparison_ordered.pdf")
    pyplot.figure()
    pyplot.plot(std)
    pyplot.show()
mpi.barrier()
def apcluster_k(feature, num_centers, corr=True, tol=0):
    """perform the affinity propagation algorithm for the input codes.
    """
    logging.debug("ap: preparing similarity matrix")
    covmat = mathutil.mpi_cov(feature)
    std = np.diag(covmat)
    # normalize
    std = np.sqrt(std**2 + 0.01)
    if corr:
        # compute correlation. If corr is False, we will use the covariance
        # directly.
        covmat /= std
        covmat /= std[:, np.newaxis]
    # compute the similarity matrix
    norm = np.diag(covmat) / 2
    covmat -= norm
    covmat -= norm[:, np.newaxis]
    # add a small noise to covmat
    noise = (covmat + np.finfo(np.float64).eps) * \
            np.random.rand(covmat.shape[0], covmat.shape[1])
    mpi.COMM.Bcast(noise)
    covmat += noise
    # The remaining part can just be carried out on root
    if mpi.is_root():
        # set preference
        pmax = covmat.max()
        #af = AffinityPropagation().fit(covmat, pmax)
        #num_max = len(af.cluster_centers_indices_)
        # in fact, num_max would always be covmat.shape[0], so we don't
        # really run ap
        num_max = covmat.shape[0]
        logging.debug("ap: pmax = %s, num = %d" % (pmax, num_max))
        pmin = covmat.min()
        af = AffinityPropagation().fit(covmat, pmin)
        # num_min is the theoretical min, but the python code seems to have
        # bugs...
        num_min = len(af.cluster_centers_indices_)
        logging.debug("ap: pmin = %s, num = %d" % (pmin, num_min))
        if num_centers < num_min:
            logging.warning("num_centers too small, will return %d centers"
                            % (num_min,))
            return af.cluster_centers_indices_, af.labels_, covmat
        if num_centers > num_max:
            logging.warning("num_centers too large, will return everything.")
            return np.arange(covmat.shape[0], dtype=np.int), \
                   np.arange(covmat.shape[0], dtype=np.int)
        logging.debug("ap: start affinity propagation")
        # We will simply use bisection search to find the right number of
        # centroids.
        for i in range(_AP_MAX_ITERATION):
            pref = (pmax + pmin) / 2
            af = AffinityPropagation().fit(covmat, pref)
            num = len(af.cluster_centers_indices_)
            logging.debug("ap try %d: pref = %s, num = %s" % (i + 1, pref, num))
            if num >= num_centers - tol and num <= num_centers + tol:
                break
            elif num < num_centers:
                pmin = pref
                num_min = num
            else:
                pmax = pref
                num_max = num
    else:
        af = None
    mpi.barrier()
    af = mpi.COMM.bcast(af)
    return af.cluster_centers_indices_, af.labels_, covmat
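# A hedged usage sketch: `feature` stands for the local feature matrix on each
# MPI node and `dictionary` for a code dictionary to subsample; both names are
# placeholders.  With tol=2, the bisection stops once the number of exemplars
# is within 2 of the request.  All ranks must call this collectively.
centers, labels, covmat = apcluster_k(feature, 64, corr=True, tol=2)
selected_codes = dictionary[centers]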