Example #1
0
def test_improve_memory_error_message():
    """
    Tests that the MemoryError's message is improved correctly
    """

    try:
        improve_memory_error_message(MemoryError(), "test")
    except MemoryError as e:
        # message has been "improved"
        assert len(str(e))

    try:
        improve_memory_error_message(MemoryError("test"), "should not")
    except MemoryError as e:
        assert str(e) == "test"
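For context, a minimal sketch of what improve_memory_error_message could look like, inferred only from the behaviour the test above checks (the real pylearn2 helper may differ): the supplied text is attached only when the original MemoryError carried no message, and the error is then re-raised.

def improve_memory_error_message(err, msg):
    # Hedged sketch: attach msg only when the original error has no message,
    # then re-raise, matching the two cases exercised by the test above.
    if len(str(err)) == 0:
        err = MemoryError(msg)
    raise err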
Example #2
0
    def train_all(self, dataset, mu=None):
        """
        Run the k-means algorithm on the input dataset to locate clusters.

        Parameters
        ----------
        dataset : object
            Object providing a get_design_matrix() method; its (n, m)
            design matrix is clustered.
        mu : ndarray, optional
            Initial (k, m) array of cluster means. Defaults to k randomly
            chosen examples from the dataset.

        Returns
        -------
        rval : bool
            True once the algorithm has converged (the design matrix X is
            returned instead if NaNs appear in the means; see the TODO below).
        """

        # TODO-- why does this sometimes return X and sometimes return nothing?

        X = dataset.get_design_matrix()

        n, m = X.shape
        k = self.k

        if milk is not None:
            # use the milk implementation of k-means if it's available
            cluster_ids, mu = milk.kmeans(X, k)
        else:
            # our own implementation

            # taking random inputs as initial clusters if user does not provide
            # them.
            if mu is not None:
                if not len(mu) == k:
                    raise Exception("You gave %i clusters"
                                    ", but k=%i were expected" % (len(mu), k))
            else:
                indices = numpy.random.randint(X.shape[0], size=k)
                mu = X[indices]

            try:
                dists = numpy.zeros((n, k))
            except MemoryError as e:
                improve_memory_error_message(
                    e, "dying trying to allocate "
                    "dists matrix for {0} "
                    "examples and {1} "
                    "means".format(n, k))

            old_kills = {}

            iter = 0
            mmd = prev_mmd = float('inf')
            while True:
                if self.verbose:
                    logger.info('kmeans iter {0}'.format(iter))

                # print 'iter:',iter,' conv crit:',abs(mmd-prev_mmd)
                # if numpy.sum(numpy.isnan(mu)) > 0:
                if contains_nan(mu):
                    logger.info('nan found')
                    return X

                # computing distances
                for i in xrange(k):
                    dists[:, i] = numpy.square((X - mu[i, :])).sum(axis=1)

                if iter > 0:
                    prev_mmd = mmd

                min_dists = dists.min(axis=1)

                # mean minimum distance:
                mmd = min_dists.mean()

                logger.info('cost: {0}'.format(mmd))

                if iter > 0 and (iter >= self.max_iter
                                 or abs(mmd - prev_mmd) < self.convergence_th):
                    # converged
                    break

                # finding minimum distances
                min_dist_inds = dists.argmin(axis=1)

                # computing means
                i = 0
                blacklist = []
                new_kills = {}
                while i < k:
                    b = min_dist_inds == i
                    if not numpy.any(b):
                        killed_on_prev_iter = True
                        # initializes empty cluster to be the mean of the d
                        # data points farthest from their corresponding means
                        if i in old_kills:
                            d = old_kills[i] - 1
                            if d == 0:
                                d = 50
                            new_kills[i] = d
                        else:
                            d = 5
                        mu[i, :] = 0
                        for j in xrange(d):
                            idx = numpy.argmax(min_dists)
                            min_dists[idx] = 0
                            # chose point idx
                            mu[i, :] += X[idx, :]
                            blacklist.append(idx)
                        mu[i, :] /= float(d)
                        # cluster i was empty, reset it to d far out data
                        # points recomputing distances for this cluster
                        dists[:, i] = numpy.square((X - mu[i, :])).sum(axis=1)
                        min_dists = dists.min(axis=1)
                        for idx in blacklist:
                            min_dists[idx] = 0
                        min_dist_inds = dists.argmin(axis=1)
                        # done
                        i += 1
                    else:
                        mu[i, :] = numpy.mean(X[b, :], axis=0)
                        if contains_nan(mu):
                            logger.info('nan found at {0}'.format(i))
                            return X
                        i += 1

                old_kills = new_kills

                iter += 1

        self.mu = sharedX(mu)
        self._params = [self.mu]
        return True
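To make the core of the loop above easier to follow, here is a self-contained sketch of a single k-means assignment-and-update step in plain NumPy; the function name and the simplified empty-cluster handling are illustrative, not part of the class.

import numpy

def kmeans_step(X, mu):
    # X: (n, m) design matrix, mu: (k, m) current means.
    # Squared Euclidean distance from every example to every mean.
    dists = numpy.square(X[:, None, :] - mu[None, :, :]).sum(axis=2)
    assignments = dists.argmin(axis=1)
    new_mu = mu.copy()
    for i in range(mu.shape[0]):
        members = assignments == i
        # Empty clusters are left untouched here; the loop above instead
        # re-seeds them from the points farthest from their current means.
        if numpy.any(members):
            new_mu[i] = X[members].mean(axis=0)
    return new_mu, assignments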
Example #3
0
def _load(filepath, recurse_depth=0, retry=True):
    """
    Recursively tries to load a file until success or maximum number of
    attempts.

    Parameters
    ----------
    filepath : str
        A path to a file to load. Should be a pickle, Matlab, or NumPy
        file; or a .txt or .amat file that numpy.loadtxt can load.
    recurse_depth : int, optional
        End users should not use this argument. It is used by the function
        itself to implement the `retry` option recursively.
    retry : bool, optional
        If True, will make a handful of attempts to load the file before
        giving up. This can be useful if you are for example calling
        show_weights.py on a file that is actively being written to by a
        training script--sometimes the load attempt might fail if the
        training script writes at the same time show_weights tries to
        read, but if you try again after a few seconds you should be able
        to open the file.

    Returns
    -------
    loaded_object : object
        The object that was stored in the file.
    """
    try:
        import joblib
        joblib_available = True
    except ImportError:
        joblib_available = False
    if recurse_depth == 0:
        filepath = preprocess(filepath)

    if filepath.endswith('.npy') or filepath.endswith('.npz'):
        return np.load(filepath)

    if filepath.endswith('.amat') or filepath.endswith('txt'):
        try:
            return np.loadtxt(filepath)
        except Exception:
            reraise_as("{0} cannot be loaded by serial.load (trying "
                       "to use np.loadtxt)".format(filepath))

    if filepath.endswith('.mat'):
        global io
        if io is None:
            import scipy.io
            io = scipy.io
        try:
            return io.loadmat(filepath)
        except NotImplementedError as nei:
            if str(nei).find('HDF reader') != -1:
                global hdf_reader
                if hdf_reader is None:
                    import h5py
                    hdf_reader = h5py
                return hdf_reader.File(filepath, 'r')
            else:
                raise
        # this code should never be reached
        assert False

    # for loading PY2 pickle in PY3
    encoding = {'encoding': 'latin-1'} if six.PY3 else {}

    def exponential_backoff():
        if recurse_depth > 9:
            logger.info('Max number of tries exceeded while trying to open '
                        '{0}'.format(filepath))
            logger.info('attempting to open via reading string')
            with open(filepath, 'rb') as f:
                content = f.read()
            return cPickle.loads(content, **encoding)
        else:
            nsec = 0.5 * (2.0 ** float(recurse_depth))
            logger.info("Waiting {0} seconds and trying again".format(nsec))
            time.sleep(nsec)
            return _load(filepath, recurse_depth + 1, retry)

    try:
        if not joblib_available:
            with open(filepath, 'rb') as f:
                obj = cPickle.load(f, **encoding)
        else:
            try:
                obj = joblib.load(filepath)
            except Exception as e:
                if os.path.exists(filepath) and not os.path.isdir(filepath):
                    raise
                raise_cannot_open(filepath)
    except MemoryError as e:
        # We want to explicitly catch this exception because for MemoryError
        # __str__ returns the empty string, so some of our default printouts
        # below don't make a lot of sense.
        # Also, a lot of users assume any exception is a bug in the library,
        # so we can cut down on mail to pylearn-users by adding a message
        # that makes it clear this exception is caused by their machine not
        # meeting requirements.
        if os.path.splitext(filepath)[1] == ".pkl":
            improve_memory_error_message(e,
                                         ("You do not have enough memory to "
                                          "open %s \n"
                                          " + Try using numpy.{save,load} "
                                          "(file with extension '.npy') "
                                          "to save your file. It uses less "
                                          "memory when reading and "
                                          "writing files than pickled files.")
                                         % filepath)
        else:
            improve_memory_error_message(e,
                                         "You do not have enough memory to "
                                         "open %s" % filepath)

    except (BadPickleGet, EOFError, KeyError) as e:
        if not retry:
            reraise_as(e.__class__('Failed to open {0}'.format(filepath)))
        obj = exponential_backoff()
    except ValueError:
        logger.exception('Failed to open {0}'.format(filepath))

        if not retry:
            reraise_as(ValueError('Failed to open {0}'.format(filepath)))
        obj = exponential_backoff()
    except Exception:
        # assert False
        reraise_as("Couldn't open {0}".format(filepath))

    # if the object has no yaml_src, we give it one that just says it
    # came from this file. could cause trouble if you save obj again
    # to a different location
    if not hasattr(obj, 'yaml_src'):
        try:
            obj.yaml_src = '!pkl: "' + os.path.abspath(filepath) + '"'
        except Exception:
            pass

    return obj
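The retry schedule implemented by exponential_backoff above doubles the wait on every attempt (nsec = 0.5 * 2**recurse_depth) and falls back to reading the raw bytes once recurse_depth exceeds 9; a quick way to see the delays it produces:

# Sleep times (seconds) between successive retries, attempts 0 through 9.
delays = [0.5 * (2.0 ** depth) for depth in range(10)]
print(delays)       # [0.5, 1.0, 2.0, 4.0, ..., 256.0]
print(sum(delays))  # 511.5 seconds of waiting in the worst case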
Example #4
0
def compute_log_z(rbm, free_energy_fn, max_bits=15):
    """
    Compute the log partition function of a (binary-binary) RBM.

    Parameters
    ----------
    rbm : object
        An RBM object from `pylearn2.models`.
    free_energy_fn : callable
        A callable object (e.g. Theano function) that computes the
        free energy of a stack of configurations for this RBM.
    max_bits : int, optional
        The (base-2) log of the number of states to enumerate (and
        compute free energy for) at a time.

    Notes
    -----
    This function enumerates a sum with exponentially many terms, and
    should not be used with more than a small, toy model.
    """
    # Pick whether to iterate over visible or hidden states.
    if rbm.nvis < rbm.nhid:
        width = rbm.nvis
        type = "vis"
    else:
        width = rbm.nhid
        type = "hid"

    # Determine in how many steps to compute Z.
    block_bits = width if (not max_bits or width < max_bits) else max_bits
    block_size = 2 ** block_bits

    # Allocate storage for 2**block_bits of the 2**width possible
    # configurations.
    try:
        logz_data_c = numpy.zeros((block_size, width), order="C", dtype=config.floatX)
    except MemoryError:
        reraise_as(
            MemoryError(
                "failed to allocate (%d, %d) matrix of "
                "type %s in compute_log_z; try a smaller "
                "value of max_bits" % (block_size, width, str(config.floatX))
            )
        )

    # fill in the first block_bits, which will remain fixed for all
    # 2**width configs
    tensor_10D_idx = numpy.ndindex(*([2] * block_bits))
    for i, j in enumerate(tensor_10D_idx):
        logz_data_c[i, -block_bits:] = j
    try:
        logz_data = numpy.array(logz_data_c, order="F", dtype=config.floatX)
    except MemoryError:
        reraise_as(
            MemoryError(
                "failed to allocate (%d, %d) matrix of "
                "type %s in compute_log_z; try a smaller "
                "value of max_bits" % (block_size, width, str(config.floatX))
            )
        )

    # Allocate storage for (negative) free-energy of all 2**width
    # configurations.
    try:
        nFE = numpy.zeros(2 ** width, dtype=config.floatX)
    except MemoryError as e:
        improve_memory_error_message(
            e,
            "failed to allocate free energy storage "
            "array in compute_log_z; your model is too "
            "big to use with this function",
        )

    # now loop 2**(width - block_bits) times, filling in the
    # most-significant bits
    for bi, up_bits in enumerate(numpy.ndindex(*([2] * (width - block_bits)))):
        logz_data[:, : width - block_bits] = up_bits
        nFE[bi * block_size : (bi + 1) * block_size] = -free_energy_fn(logz_data)
    alpha = nFE.max()
    # Do the subtraction and exponentiation in-place so as to not incur a copy.
    nFE -= alpha
    numpy.exp(nFE, nFE)
    log_z = numpy.log(nFE.sum()) + alpha
    return log_z
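As a cross-check on the blockwise enumeration above, the same quantity can be written directly for a tiny binary-binary RBM given its weights and biases; a hedged reference sketch (the parameter names are assumptions, not the pylearn2 model attributes):

import numpy

def brute_force_log_z(W, bvis, bhid):
    # W: (nvis, nhid) weights, bvis: (nvis,), bhid: (nhid,) biases.
    # Enumerate every visible configuration and apply the identity
    # log Z = logsumexp_v(-F(v)), with
    # -F(v) = v.bvis + sum_j log(1 + exp(v.W[:, j] + bhid[j])).
    nvis = W.shape[0]
    configs = numpy.array(list(numpy.ndindex(*([2] * nvis))), dtype=float)
    neg_fe = configs.dot(bvis) + \
        numpy.log1p(numpy.exp(configs.dot(W) + bhid)).sum(axis=1)
    alpha = neg_fe.max()  # same max-subtraction trick as above
    return numpy.log(numpy.exp(neg_fe - alpha).sum()) + alpha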
Example #5
0
            except Exception as e:
                if os.path.exists(filepath) and not os.path.isdir(filepath):
                    raise
                raise_cannot_open(filepath)
    except MemoryError as e:
        # We want to explicitly catch this exception because for MemoryError
        # __str__ returns the empty string, so some of our default printouts
        # below don't make a lot of sense.
        # Also, a lot of users assume any exception is a bug in the library,
        # so we can cut down on mail to pylearn-users by adding a message
        # that makes it clear this exception is caused by their machine not
        # meeting requirements.
        if os.path.splitext(filepath)[1] == ".pkl":
            improve_memory_error_message(e, 
                "You do not have enough memory to open %s \n"
                " + Try using numpy.{save,load} (file with extension '.npy') "
                "to save your file. It uses less memory when reading and "
                "writing files than pickled files." % filepath)
        else:
            improve_memory_error_message(e, 
                "You do not have enough memory to open %s" % filepath)

    except BadPickleGet as e:
        logger.exception('Failed to open {0} due to BadPickleGet '
                         'with exception string {1}'.format(filepath, e))

        if not retry:
            raise
        obj = exponential_backoff()
    except EOFError as e:
Example #6
0
            except Exception as e:
                if os.path.exists(filepath) and not os.path.isdir(filepath):
                    raise
                raise_cannot_open(filepath)
    except MemoryError as e:
        # We want to explicitly catch this exception because for MemoryError
        # __str__ returns the empty string, so some of our default printouts
        # below don't make a lot of sense.
        # Also, a lot of users assume any exception is a bug in the library,
        # so we can cut down on mail to pylearn-users by adding a message
        # that makes it clear this exception is caused by their machine not
        # meeting requirements.
        if os.path.splitext(filepath)[1] == ".pkl":
            improve_memory_error_message(e, 
                "You do not have enough memory to open %s \n"
                " + Try using numpy.{save,load} (file with extension '.npy') "
                "to save your file. It uses less memory when reading and "
                "writing files than pickled files." % filepath)
        else:
            improve_memory_error_message(e, 
                "You do not have enough memory to open %s" % filepath)

    except BadPickleGet:
        if not retry:
            reraise_as(BadPickleGet('Failed to open {0}'.format(filepath)))
        obj = exponential_backoff()
    except EOFError:
        if not retry:
            reraise_as(EOFError("Failed to open {0}".format(filepath)))
        obj = exponential_backoff()
    except ValueError:
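The fragment above, like _load earlier, relies on reraise_as to swap in a more informative exception; a hedged sketch of what such a helper typically does (Python 3 form, preserving the original traceback; the real pylearn2 implementation may differ in detail):

import sys

def reraise_as(new_exc):
    # Replace the exception currently being handled with new_exc while
    # keeping the original traceback, so the failure site stays visible.
    orig_type, orig_value, orig_tb = sys.exc_info()
    if isinstance(new_exc, str):
        # In this sketch, bare strings are wrapped in the original type.
        new_exc = (orig_type or RuntimeError)(new_exc)
    raise new_exc.with_traceback(orig_tb) from orig_value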
Example #7
0
    def train_all(self, dataset, mu=None):
        """
        Run the k-means algorithm on the input dataset to locate clusters.

        Parameters
        ----------
        dataset : object
            Object providing a get_design_matrix() method; its (n, m)
            design matrix is clustered.
        mu : ndarray, optional
            Initial (k, m) array of cluster means. Defaults to k randomly
            chosen examples from the dataset.

        Returns
        -------
        rval : bool
            True once the algorithm has converged (the design matrix X is
            returned instead if NaNs appear in the means; see the TODO below).
        """

        # TODO-- why does this sometimes return X and sometimes return nothing?

        X = dataset.get_design_matrix()

        n, m = X.shape
        k = self.k

        if milk is not None:
            # use the milk implementation of k-means if it's available
            cluster_ids, mu = milk.kmeans(X, k)
        else:
            # our own implementation

            # taking random inputs as initial clusters if user does not provide
            # them.
            if mu is not None:
                if not len(mu) == k:
                    raise Exception("You gave %i clusters"
                                    ", but k=%i were expected"
                                    % (len(mu), k))
            else:
                indices = numpy.random.randint(X.shape[0], size=k)
                mu = X[indices]

            try:
                dists = numpy.zeros((n, k))
            except MemoryError as e:
                improve_memory_error_message(e, "dying trying to allocate "
                                                "dists matrix for {0} "
                                                "examples and {1} "
                                                "means".format(n, k))

            old_kills = {}

            iter = 0
            mmd = prev_mmd = float('inf')
            while True:
                if self.verbose:
                    logger.info('kmeans iter {0}'.format(iter))

                # print 'iter:',iter,' conv crit:',abs(mmd-prev_mmd)
                # if numpy.sum(numpy.isnan(mu)) > 0:
                if contains_nan(mu):
                    logger.info('nan found')
                    return X

                # computing distances
                for i in xrange(k):
                    dists[:, i] = numpy.square((X - mu[i, :])).sum(axis=1)

                if iter > 0:
                    prev_mmd = mmd

                min_dists = dists.min(axis=1)

                # mean minimum distance:
                mmd = min_dists.mean()

                logger.info('cost: {0}'.format(mmd))

                if iter > 0 and (iter >= self.max_iter or
                                 abs(mmd - prev_mmd) < self.convergence_th):
                    # converged
                    break

                # finding minimum distances
                min_dist_inds = dists.argmin(axis=1)

                # computing means
                i = 0
                blacklist = []
                new_kills = {}
                while i < k:
                    b = min_dist_inds == i
                    if not numpy.any(b):
                        killed_on_prev_iter = True
                        # initializes empty cluster to be the mean of the d
                        # data points farthest from their corresponding means
                        if i in old_kills:
                            d = old_kills[i] - 1
                            if d == 0:
                                d = 50
                            new_kills[i] = d
                        else:
                            d = 5
                        mu[i, :] = 0
                        for j in xrange(d):
                            idx = numpy.argmax(min_dists)
                            min_dists[idx] = 0
                            # chose point idx
                            mu[i, :] += X[idx, :]
                            blacklist.append(idx)
                        mu[i, :] /= float(d)
                        # cluster i was empty, reset it to d far out data
                        # points recomputing distances for this cluster
                        dists[:, i] = numpy.square((X - mu[i, :])).sum(axis=1)
                        min_dists = dists.min(axis=1)
                        for idx in blacklist:
                            min_dists[idx] = 0
                        min_dist_inds = dists.argmin(axis=1)
                        # done
                        i += 1
                    else:
                        mu[i, :] = numpy.mean(X[b, :], axis=0)
                        if contains_nan(mu):
                            logger.info('nan found at {0}'.format(i))
                            return X
                        i += 1

                old_kills = new_kills

                iter += 1

        self.mu = sharedX(mu)
        self._params = [self.mu]
        return True
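A hedged sketch of the call contract train_all expects from its dataset argument; the stub class below is illustrative, not the real pylearn2 dataset interface.

import numpy

class StubDataset(object):
    # Hypothetical stand-in exposing the one method train_all relies on.
    def __init__(self, X):
        self.X = X

    def get_design_matrix(self):
        return self.X

# Assuming `model` is an already-constructed KMeans instance:
#     data = StubDataset(numpy.random.randn(500, 10))
#     result = model.train_all(data)  # True on convergence, X if NaNs appear
#     means = model.mu                # learned (k, m) means (shared variable)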
Example #8
0
def compute_log_z(rbm, free_energy_fn, max_bits=15):
    """
    Compute the log partition function of a (binary-binary) RBM.

    Parameters
    ----------
    rbm : object
        An RBM object from `pylearn2.models`.
    free_energy_fn : callable
        A callable object (e.g. Theano function) that computes the
        free energy of a stack of configurations for this RBM.
    max_bits : int, optional
        The (base-2) log of the number of states to enumerate (and
        compute free energy for) at a time.

    Notes
    -----
    This function enumerates a sum with exponentially many terms, and
    should not be used with more than a small, toy model.
    """
    # Pick whether to iterate over visible or hidden states.
    if rbm.nvis < rbm.nhid:
        width = rbm.nvis
        type = 'vis'
    else:
        width = rbm.nhid
        type = 'hid'

    # Determine in how many steps to compute Z.
    block_bits = width if (not max_bits or width < max_bits) else max_bits
    block_size = 2**block_bits

    # Allocate storage for 2**block_bits of the 2**width possible
    # configurations.
    try:
        logz_data_c = numpy.zeros((block_size, width),
                                  order='C',
                                  dtype=config.floatX)
    except MemoryError:
        raise MemoryError("failed to allocate (%d, %d) matrix of "
                          "type %s in compute_log_z; try a smaller "
                          "value of max_bits" %
                          (block_size, width, str(config.floatX)))

    # fill in the first block_bits, which will remain fixed for all
    # 2**width configs
    tensor_10D_idx = numpy.ndindex(*([2] * block_bits))
    for i, j in enumerate(tensor_10D_idx):
        logz_data_c[i, -block_bits:] = j
    try:
        logz_data = numpy.array(logz_data_c, order='F', dtype=config.floatX)
    except MemoryError:
        raise MemoryError("failed to allocate (%d, %d) matrix of "
                          "type %s in compute_log_z; try a smaller "
                          "value of max_bits" %
                          (block_size, width, str(config.floatX)))

    # Allocate storage for (negative) free-energy of all 2**width
    # configurations.
    try:
        nFE = numpy.zeros(2**width, dtype=config.floatX)
    except MemoryError as e:
        improve_memory_error_message(
            e, "failed to allocate free energy storage "
            "array in compute_log_z; your model is too "
            "big to use with this function")

    # now loop 2**(width - block_bits) times, filling in the
    # most-significant bits
    for bi, up_bits in enumerate(numpy.ndindex(*([2] * (width - block_bits)))):
        logz_data[:, :width - block_bits] = up_bits
        nFE[bi * block_size:(bi + 1) * block_size] = -free_energy_fn(logz_data)
    alpha = nFE.max()
    # Do the subtraction and exponentiation in-place so as to not incur a copy.
    nFE -= alpha
    numpy.exp(nFE, nFE)
    log_z = numpy.log(nFE.sum()) + alpha
    return log_z
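Once log Z is known it turns free energies into exact log-likelihoods, since log p(v) = -F(v) - log Z for an RBM; a hedged sketch, assuming free_energy_fn accepts a design matrix of visible configurations as above:

import numpy

def average_log_likelihood(free_energy_fn, X, log_z):
    # Exact mean log-likelihood of the rows of X under the model whose
    # partition function gave log_z (only feasible for toy-sized RBMs).
    return float(numpy.mean(-free_energy_fn(X) - log_z))

# Typical use (names assumed):
#     log_z = compute_log_z(rbm, free_energy_fn)
#     nll = -average_log_likelihood(free_energy_fn, X_test, log_z)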