Example #1
 def __init__(self, root_folder, extensions, prefetch = False, 
              target_size = None, max_size = None, min_size = None,
              center_crop = None):
     """ Initialize from a two-layer storage
     Input:
         root_folder: the root that contains the data. Under root_folder
             there should be a list of folders, under which there should be
             a list of files
         extensions: the list of extensions that should be used to filter the
             files. Should be like ['png', 'jpg']. It's case insensitive.
         prefetch: if True, the images are prefetched to avoid disk read. If
             you have a large number of images, prefetch would require a lot
             of memory.
         target_size, max_size, min_size, center_crop: see manipulate() for
             details.
     """
     super(TwoLayerDataset, self).__init__()
     if mpi.agree(not os.path.exists(root_folder)):
         raise OSError("The specified folder does not exist.")
     logging.debug('Loading from %s' % (root_folder,))
     if isinstance(extensions, str):
         extensions = [extensions]
     # normalize to lowercase so the match below is truly case-insensitive
     extensions = set(ext.lower() for ext in extensions)
     if mpi.is_root():
         # get files first
         files = glob.glob(os.path.join(root_folder, '*', '*'))
         # select those that fit the extensions
         files = [f for f in files  if any([
                         f.lower().endswith(ext) for ext in extensions])]
         logging.debug("A total of %d images." % (len(files)))
         # get raw labels
         labels = [os.path.split(os.path.split(f)[0])[1] for f in files]
         classnames = list(set(labels))
         # sort so we get a reasonable class order
         classnames.sort()
         name2val = dict(zip(classnames, range(len(classnames))))
         labels = [name2val[label] for label in labels]
     else:
         files = None
         classnames = None
         labels = None
     mpi.barrier()
     self._rawdata = mpi.distribute_list(files)
     self._data = self._rawdata
     self._prefetch = prefetch
     self._target_size = target_size
     self._max_size = max_size
     self._min_size = min_size
     self._center_crop = center_crop
     if target_size is not None:
         self._dim = tuple(target_size) + (3,)
     else:
         self._dim = False
     self._channels = 3
     if prefetch:
         self._data = [self._read(idx) for idx in range(len(self._data))]
     self._label = mpi.distribute_list(labels)
     self._classnames = mpi.COMM.bcast(classnames)
Example #2
 def __init__(self,
              root_folder,
              extensions,
              prefetch=False,
              target_size=None,
              max_size=None,
              min_size=None,
              center_crop=None):
     """ Initialize from a two-layer storage
     Input:
         root_folder: the root that contains the data. Under root_folder
             there should be a list of folders, under which there should be
             a list of files
         extensions: the list of extensions that should be used to filter the
             files. Should be like ['png', 'jpg']. It's case insensitive.
         prefetch: if True, the images are prefetched to avoid disk read. If
             you have a large number of images, prefetch would require a lot
             of memory.
         target_size, max_size, min_size, center_crop: see manipulate() for
             details.
     """
     super(TwoLayerDataset, self).__init__()
     if mpi.agree(not os.path.exists(root_folder)):
         raise OSError("The specified folder does not exist.")
     logging.debug('Loading from %s' % (root_folder, ))
     if isinstance(extensions, str):
         extensions = [extensions]
     # normalize to lowercase so the match below is truly case-insensitive
     extensions = set(ext.lower() for ext in extensions)
     if mpi.is_root():
         # get files first
         files = glob.glob(os.path.join(root_folder, '*', '*'))
         # select those that fit the extensions
         files = [
             f for f in files
             if any([f.lower().endswith(ext) for ext in extensions])
         ]
         logging.debug("A total of %d images." % (len(files)))
         # get raw labels
         labels = [os.path.split(os.path.split(f)[0])[1] for f in files]
         classnames = list(set(labels))
         # sort so we get a reasonable class order
         classnames.sort()
         name2val = dict(zip(classnames, range(len(classnames))))
         labels = [name2val[label] for label in labels]
     else:
         files = None
         classnames = None
         labels = None
     mpi.barrier()
     self._rawdata = mpi.distribute_list(files)
     self._data = self._rawdata
     self._prefetch = prefetch
     self._target_size = target_size
     self._max_size = max_size
     self._min_size = min_size
     self._center_crop = center_crop
     if target_size is not None:
         self._dim = tuple(target_size) + (3, )
     else:
         self._dim = False
     self._channels = 3
     if prefetch:
         self._data = [self._read(idx) for idx in range(len(self._data))]
     self._label = mpi.distribute_list(labels)
     self._classnames = mpi.COMM.bcast(classnames)
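
Examples #1 and #2 above are the same constructor; they differ only in code layout. For orientation, here is a minimal, hypothetical usage sketch. The directory layout and argument values are assumptions for illustration; TwoLayerDataset and the mpi helper module come from the surrounding library, which the excerpt does not show.

 # Run under MPI, e.g.: mpirun -n 4 python demo.py
 # Assumed two-layer layout (folder name = class label):
 #   data/cats/001.jpg, data/cats/002.jpg, ...
 #   data/dogs/001.jpg, data/dogs/002.jpg, ...
 dataset = TwoLayerDataset('data', ['jpg', 'png'],
                           prefetch=False, target_size=(256, 256))
 # Each MPI rank now holds its own shard of the file list and labels,
 # produced by mpi.distribute_list() in the constructor.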
Example #3
 def testAgree(self):
     self.assertTrue(mpi.agree(True))
     self.assertFalse(mpi.agree(False))
     self.assertTrue(mpi.agree(mpi.RANK == 0))
     self.assertFalse(mpi.agree(mpi.RANK != 0))
     self.assertFalse(mpi.agree(mpi.RANK))
Example #4
 def testAgree(self):
     self.assertTrue(mpi.agree(True))
     self.assertFalse(mpi.agree(False))
     self.assertTrue(mpi.agree(mpi.RANK == 0))
     self.assertFalse(mpi.agree(mpi.RANK != 0))
     self.assertFalse(mpi.agree(mpi.RANK))
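
The test above (repeated verbatim in Examples #3 and #4) pins down what mpi.agree() must do: when run with more than one rank, mpi.agree(mpi.RANK == 0) is asserted True on every rank, so agree() cannot be a logical AND across ranks; it has to propagate the root's value to everyone. A minimal sketch of that behavior, written against mpi4py as an assumption (the library's own implementation is not shown):

 from mpi4py import MPI

 COMM = MPI.COMM_WORLD
 RANK = COMM.Get_rank()

 def agree(decision):
     # Broadcast the root's decision so every rank takes the same branch.
     # agree(RANK == 0) then returns True everywhere, and agree(RANK)
     # returns the root's 0, which is falsy -- matching the assertions.
     return COMM.bcast(decision, root=0)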
Example #5
def kmeans(X, k, n_init=1, max_iter=300, tol=1e-4):
    """ K-means clustering algorithm.

    Parameters
    ----------
    X: ndarray
        An M by N array of M observations in N dimensions. X in every MPI node
        is the local data points it is responsible for.

    k: int
        The number of clusters to form.

    n_init: int, optional, default: 1
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.

    max_iter: int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.

    tol: float, optional
        Relative tolerance, with respect to the variance of the data, on the
        change of the centers used to declare convergence.

    Returns
    -------
    centroid: ndarray
        A k by N array of centroids found at the last iteration of
        k-means.

    label: ndarray
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia: float
        The final value of the inertia criterion.

    """
    # do k-means training
    # vdata helps the stop criterion
    vdata = mpi.COMM.allreduce(np.mean(np.var(X, 0))) / mpi.SIZE
    best_inertia = np.inf

    if k <= 0:
        raise ValueError("The number of centers (%d) should be positive." % k)
    if mpi.COMM.allreduce(X.shape[0], op=mpi.MPI.MIN) == 0:
        raise RuntimeError("Some nodes have zero data.")

    logging.debug("Kmeans: A total of %d data points." % \
                  mpi.COMM.allreduce(X.shape[0]))
    # pre-compute squared norms of data points
    x_squared_norms = (X**2).sum(axis=1)
    for init_count in range(n_init):
        logging.debug("Kmeans trial %d" % (init_count,))
        # initialization
        centers = X[np.random.randint(X.shape[0], size = k)]
        centers_all = mpi.COMM.gather(centers)
        if mpi.is_root():
            centers_all = np.vstack(centers_all)
            centers[:] = centers_all[
                    np.random.permutation(centers_all.shape[0])[:k]]
        mpi.COMM.Bcast(centers)
        
        # iterations
        for iter_id in range(max_iter):
            logging.debug("Kmeans iter %d" % (iter_id))
            centers_old = centers.copy()
            labels, inertia = _e_step(X, centers,
                                      x_squared_norms=x_squared_norms)
            inertia = mpi.COMM.allreduce(inertia)
            logging.debug("Inertia %f" % (inertia),)
            centers = _m_step(X, labels, k)
            # test convergence
            converged = (np.sum((centers_old - centers) ** 2) < tol * vdata)
            if mpi.agree(converged):
                break

        if inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia
    return best_centers, best_labels, best_inertia
Example #6
def kmeans(X, k, n_init=1, max_iter=300, tol=1e-4):
    """ K-means clustering algorithm.

    Parameters
    ----------
    X: ndarray
        An M by N array of M observations in N dimensions. X in every MPI node
        is the local data points it is responsible for.

    k: int
        The number of clusters to form.

    n_init: int, optional, default: 1
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.

    max_iter: int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.

    tol: float, optional
        Relative tolerance, with respect to the variance of the data, on the
        change of the centers used to declare convergence.

    Returns
    -------
    centroid: ndarray
        A k by N array of centroids found at the last iteration of
        k-means.

    label: ndarray
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia: float
        The final value of the inertia criterion.

    """
    # do k-means training
    # vdata helps the stop criterion
    vdata = mpi.COMM.allreduce(np.mean(np.var(X, 0))) / mpi.SIZE
    best_inertia = np.inf

    if k <= 0:
        raise ValueError("The number of centers (%d) should be positive." % k)
    if mpi.COMM.allreduce(X.shape[0], op=mpi.MPI.MIN) == 0:
        raise RuntimeError("Some nodes have zero data.")

    logging.debug("Kmeans: A total of %d data points." % \
                  mpi.COMM.allreduce(X.shape[0]))
    # pre-compute squared norms of data points
    x_squared_norms = (X**2).sum(axis=1)
    for init_count in range(n_init):
        logging.debug("Kmeans trial %d" % (init_count, ))
        # initialization
        centers = X[np.random.randint(X.shape[0], size=k)]
        centers_all = mpi.COMM.gather(centers)
        if mpi.is_root():
            centers_all = np.vstack(centers_all)
            centers[:] = centers_all[np.random.permutation(
                centers_all.shape[0])[:k]]
        mpi.COMM.Bcast(centers)

        # iterations
        for iter_id in range(max_iter):
            logging.debug("Kmeans iter %d" % (iter_id))
            centers_old = centers.copy()
            labels, inertia = _e_step(X,
                                      centers,
                                      x_squared_norms=x_squared_norms)
            inertia = mpi.COMM.allreduce(inertia)
            logging.debug("Inertia %f" % (inertia), )
            centers = _m_step(X, labels, k)
            # test convergence
            converged = (np.sum((centers_old - centers)**2) < tol * vdata)
            if mpi.agree(converged):
                break

        if inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia
    return best_centers, best_labels, best_inertia
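
kmeans() above delegates to two helpers that the excerpt does not include: _e_step assigns each local point to its nearest center, and _m_step recomputes the centers from those assignments. The sketch below is a plausible minimal version of both, written against NumPy and mpi4py; it is an assumption about their contents, and the real helpers may differ (for instance in how empty clusters are re-seeded).

 import numpy as np
 from mpi4py import MPI

 COMM = MPI.COMM_WORLD

 def _e_step(X, centers, x_squared_norms=None):
     # Assign every local point to its nearest center; return the labels
     # and the local inertia (sum of squared distances to those centers).
     if x_squared_norms is None:
         x_squared_norms = (X ** 2).sum(axis=1)
     # Squared distances via ||x||^2 - 2 x.c + ||c||^2.
     distances = (x_squared_norms[:, np.newaxis]
                  - 2.0 * np.dot(X, centers.T)
                  + (centers ** 2).sum(axis=1))
     labels = distances.argmin(axis=1)
     inertia = distances[np.arange(X.shape[0]), labels].sum()
     return labels, inertia

 def _m_step(X, labels, k):
     # Recompute centers from the local assignments, then allreduce the
     # per-cluster sums and counts so all ranks get identical centers.
     sums = np.zeros((k, X.shape[1]))
     counts = np.zeros(k)
     for i in range(k):
         members = (labels == i)
         sums[i] = X[members].sum(axis=0)
         counts[i] = members.sum()
     sums = COMM.allreduce(sums)      # elementwise sum across ranks
     counts = COMM.allreduce(counts)
     counts = np.maximum(counts, 1)   # crude guard against empty clusters
     return sums / counts[:, np.newaxis]

Only the M-step (plus the inertia reduction in kmeans itself) needs communication; the E-step is embarrassingly parallel over each rank's shard. A hypothetical call on each rank's local data: centers, labels, inertia = kmeans(X_local, 16, n_init=3).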