def test_improve_memory_error_message():
    """
    Tests that the MemoryError's message is improved correctly.
    """
    try:
        improve_memory_error_message(MemoryError(), "test")
    except MemoryError as e:
        # message has been "improved"
        assert len(str(e))

    try:
        improve_memory_error_message(MemoryError("test"), "should not")
    except MemoryError as e:
        assert str(e) == "test"
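# The helper under test is defined elsewhere; the following is a minimal
# sketch consistent with the test above, NOT the library's actual
# implementation. The behavior the test pins down: re-raise the MemoryError,
# substituting `msg` only when the original message is empty.
def improve_memory_error_message_sketch(e, msg):
    # hypothetical name; illustrates the contract, not pylearn2's code
    if not str(e):
        raise MemoryError(msg)
    raise e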
def train_all(self, dataset, mu=None):
    """
    Runs the k-means algorithm on the input to locate clusters.

    Parameters
    ----------
    dataset : WRITEME
    mu : WRITEME

    Returns
    -------
    rval : bool
        WRITEME
    """
    # TODO-- why does this sometimes return X and sometimes return nothing?
    X = dataset.get_design_matrix()

    n, m = X.shape
    k = self.k

    if milk is not None:
        # use the milk implementation of k-means if it's available
        cluster_ids, mu = milk.kmeans(X, k)
    else:
        # our own implementation

        # taking random inputs as initial clusters if user does not
        # provide them.
        if mu is not None:
            if not len(mu) == k:
                raise Exception("You gave %i clusters"
                                ", but k=%i were expected"
                                % (len(mu), k))
        else:
            indices = numpy.random.randint(X.shape[0], size=k)
            mu = X[indices]

        try:
            dists = numpy.zeros((n, k))
        except MemoryError as e:
            improve_memory_error_message(e, "dying trying to allocate "
                                            "dists matrix for {0} "
                                            "examples and {1} "
                                            "means".format(n, k))

        old_kills = {}

        iter = 0
        mmd = prev_mmd = float('inf')
        while True:
            if self.verbose:
                logger.info('kmeans iter {0}'.format(iter))

            # print 'iter:',iter,' conv crit:',abs(mmd-prev_mmd)
            # if numpy.sum(numpy.isnan(mu)) > 0:
            if contains_nan(mu):
                logger.info('nan found')
                return X

            # computing distances
            for i in xrange(k):
                dists[:, i] = numpy.square((X - mu[i, :])).sum(axis=1)

            if iter > 0:
                prev_mmd = mmd

            min_dists = dists.min(axis=1)

            # mean minimum distance:
            mmd = min_dists.mean()

            logger.info('cost: {0}'.format(mmd))

            if iter > 0 and (iter >= self.max_iter or
                             abs(mmd - prev_mmd) < self.convergence_th):
                # converged
                break

            # finding minimum distances
            min_dist_inds = dists.argmin(axis=1)

            # computing means
            i = 0
            blacklist = []
            new_kills = {}
            while i < k:
                b = min_dist_inds == i
                if not numpy.any(b):
                    killed_on_prev_iter = True
                    # initializes empty cluster to be the mean of the d
                    # data points farthest from their corresponding means
                    if i in old_kills:
                        d = old_kills[i] - 1
                        if d == 0:
                            d = 50
                        new_kills[i] = d
                    else:
                        d = 5
                    mu[i, :] = 0
                    for j in xrange(d):
                        idx = numpy.argmax(min_dists)
                        min_dists[idx] = 0
                        # chose point idx
                        mu[i, :] += X[idx, :]
                        blacklist.append(idx)
                    mu[i, :] /= float(d)
                    # cluster i was empty, reset it to d far out data points
                    # recomputing distances for this cluster
                    dists[:, i] = numpy.square((X - mu[i, :])).sum(axis=1)
                    min_dists = dists.min(axis=1)
                    for idx in blacklist:
                        min_dists[idx] = 0
                    min_dist_inds = dists.argmin(axis=1)
                    # done
                    i += 1
                else:
                    mu[i, :] = numpy.mean(X[b, :], axis=0)
                    if contains_nan(mu):
                        logger.info('nan found at {0}'.format(i))
                        return X
                    i += 1

            old_kills = new_kills
            iter += 1

    self.mu = sharedX(mu)
    self._params = [self.mu]
    return True
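# Hypothetical usage sketch (constructor signature and module paths are
# assumptions, not shown in this snippet): train_all() only needs `dataset`
# to expose get_design_matrix(), which pylearn2's DenseDesignMatrix provides.
# The learned means end up in the shared variable self.mu.
import numpy
from pylearn2.datasets.dense_design_matrix import DenseDesignMatrix
from pylearn2.models.kmeans import KMeans

X = numpy.random.randn(500, 10)
km = KMeans(k=5, nvis=10, convergence_th=1e-6, max_iter=100)
km.train_all(DenseDesignMatrix(X=X))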
def _load(filepath, recurse_depth=0, retry=True):
    """
    Recursively tries to load a file until success or maximum number of
    attempts.

    Parameters
    ----------
    filepath : str
        A path to a file to load. Should be a pickle, Matlab, or NumPy
        file; or a .txt or .amat file that numpy.loadtxt can load.
    recurse_depth : int, optional
        End users should not use this argument. It is used by the function
        itself to implement the `retry` option recursively.
    retry : bool, optional
        If True, will make a handful of attempts to load the file before
        giving up. This can be useful if you are for example calling
        show_weights.py on a file that is actively being written to by a
        training script--sometimes the load attempt might fail if the
        training script writes at the same time show_weights tries to
        read, but if you try again after a few seconds you should be able
        to open the file.

    Returns
    -------
    loaded_object : object
        The object that was stored in the file.
    """
    try:
        import joblib
        joblib_available = True
    except ImportError:
        joblib_available = False
    if recurse_depth == 0:
        filepath = preprocess(filepath)
    if filepath.endswith('.npy') or filepath.endswith('.npz'):
        return np.load(filepath)
    if filepath.endswith('.amat') or filepath.endswith('txt'):
        try:
            return np.loadtxt(filepath)
        except Exception:
            reraise_as("{0} cannot be loaded by serial.load (trying "
                       "to use np.loadtxt)".format(filepath))
    if filepath.endswith('.mat'):
        global io
        if io is None:
            import scipy.io
            io = scipy.io
        try:
            return io.loadmat(filepath)
        except NotImplementedError as nei:
            if str(nei).find('HDF reader') != -1:
                global hdf_reader
                if hdf_reader is None:
                    import h5py
                    hdf_reader = h5py
                return hdf_reader.File(filepath, 'r')
            else:
                raise
        # this code should never be reached
        assert False

    # for loading PY2 pickle in PY3
    encoding = {'encoding': 'latin-1'} if six.PY3 else {}

    def exponential_backoff():
        if recurse_depth > 9:
            logger.info('Max number of tries exceeded while trying to open '
                        '{0}'.format(filepath))
            logger.info('attempting to open via reading string')
            with open(filepath, 'rb') as f:
                content = f.read()
            return cPickle.loads(content, **encoding)
        else:
            nsec = 0.5 * (2.0 ** float(recurse_depth))
            logger.info("Waiting {0} seconds and trying again".format(nsec))
            time.sleep(nsec)
            return _load(filepath, recurse_depth + 1, retry)

    try:
        if not joblib_available:
            with open(filepath, 'rb') as f:
                obj = cPickle.load(f, **encoding)
        else:
            try:
                obj = joblib.load(filepath)
            except Exception as e:
                if os.path.exists(filepath) and not os.path.isdir(filepath):
                    raise
                raise_cannot_open(filepath)
    except MemoryError as e:
        # We want to explicitly catch this exception because for MemoryError
        # __str__ returns the empty string, so some of our default printouts
        # below don't make a lot of sense.
        # Also, a lot of users assume any exception is a bug in the library,
        # so we can cut down on mail to pylearn-users by adding a message
        # that makes it clear this exception is caused by their machine not
        # meeting requirements.
        if os.path.splitext(filepath)[1] == ".pkl":
            improve_memory_error_message(e,
                ("You do not have enough memory to open %s \n"
                 " + Try using numpy.{save,load} "
                 "(file with extension '.npy') "
                 "to save your file. It uses less memory when reading "
                 "and writing files than pickled files.") % filepath)
        else:
            improve_memory_error_message(e,
                "You do not have enough memory to open %s" % filepath)
    except (BadPickleGet, EOFError, KeyError) as e:
        if not retry:
            reraise_as(e.__class__('Failed to open {0}'.format(filepath)))
        obj = exponential_backoff()
    except ValueError:
        logger.exception('Failed to open {0}'.format(filepath))
        if not retry:
            reraise_as(ValueError('Failed to open {0}'.format(filepath)))
        obj = exponential_backoff()
    except Exception:
        reraise_as("Couldn't open {0}".format(filepath))

    # if the object has no yaml_src, we give it one that just says it
    # came from this file. could cause trouble if you save obj again
    # to a different location
    if not hasattr(obj, 'yaml_src'):
        try:
            obj.yaml_src = '!pkl: "' + os.path.abspath(filepath) + '"'
        except Exception:
            pass

    return obj
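# Hypothetical usage (assuming this is pylearn2.utils.serial, whose public
# load() wraps _load): the retry/backoff path makes it reasonably robust to
# open a pickle that a training job may still be writing.
from pylearn2.utils import serial

model = serial.load("model_best.pkl")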
def compute_log_z(rbm, free_energy_fn, max_bits=15):
    """
    Compute the log partition function of a (binary-binary) RBM.

    Parameters
    ----------
    rbm : object
        An RBM object from `pylearn2.models`.
    free_energy_fn : callable
        A callable object (e.g. Theano function) that computes the free
        energy of a stack of configurations for this RBM.
    max_bits : int, optional
        The (base-2) log of the number of states to enumerate (and
        compute free energy for) at a time.

    Notes
    -----
    This function enumerates a sum with exponentially many terms, and
    should not be used with more than a small, toy model.
    """
    # Pick whether to iterate over visible or hidden states.
    if rbm.nvis < rbm.nhid:
        width = rbm.nvis
        type = "vis"
    else:
        width = rbm.nhid
        type = "hid"

    # Determine in how many steps to compute Z.
    block_bits = width if (not max_bits or width < max_bits) else max_bits
    block_size = 2 ** block_bits

    # Allocate storage for 2**block_bits of the 2**width possible
    # configurations.
    try:
        logz_data_c = numpy.zeros((block_size, width),
                                  order="C", dtype=config.floatX)
    except MemoryError:
        reraise_as(MemoryError("failed to allocate (%d, %d) matrix of "
                               "type %s in compute_log_z; try a smaller "
                               "value of max_bits" %
                               (block_size, width, str(config.floatX))))

    # fill in the first block_bits, which will remain fixed for all
    # 2**width configs
    tensor_10D_idx = numpy.ndindex(*([2] * block_bits))
    for i, j in enumerate(tensor_10D_idx):
        logz_data_c[i, -block_bits:] = j
    try:
        logz_data = numpy.array(logz_data_c, order="F", dtype=config.floatX)
    except MemoryError:
        reraise_as(MemoryError("failed to allocate (%d, %d) matrix of "
                               "type %s in compute_log_z; try a smaller "
                               "value of max_bits" %
                               (block_size, width, str(config.floatX))))

    # Allocate storage for (negative) free-energy of all 2**width
    # configurations.
    try:
        nFE = numpy.zeros(2 ** width, dtype=config.floatX)
    except MemoryError as e:
        improve_memory_error_message(e,
                                     "failed to allocate free energy "
                                     "storage array in compute_log_z; "
                                     "your model is too big to use with "
                                     "this function")

    # now loop 2**(width - block_bits) times, filling in the
    # most-significant bits
    for bi, up_bits in enumerate(
            numpy.ndindex(*([2] * (width - block_bits)))):
        logz_data[:, :width - block_bits] = up_bits
        nFE[bi * block_size:(bi + 1) * block_size] = \
            -free_energy_fn(logz_data)

    alpha = nFE.max()
    # Do the subtraction and exponentiation in-place so as to not incur a
    # copy.
    nFE -= alpha
    numpy.exp(nFE, nFE)
    log_z = numpy.log(nFE.sum()) + alpha
    return log_z
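# A tiny pure-numpy cross-check of the quantity compute_log_z enumerates,
# assuming the standard binary-binary RBM parameterization with weights W
# (nvis x nhid) and biases bv, bh (names here are illustrative, not taken
# from pylearn2). The free energy of a visible configuration v is
#     F(v) = -bv.v - sum_j softplus(bh_j + (v W)_j),
# and log Z = logsumexp(-F(v)) over all 2**nvis configurations, which is the
# same max-shifted sum the function above computes block by block.
import numpy
from scipy.special import logsumexp

rng = numpy.random.RandomState(0)
nvis, nhid = 3, 4
W = rng.randn(nvis, nhid) * 0.1
bv = rng.randn(nvis) * 0.1
bh = rng.randn(nhid) * 0.1

# Enumerate all 2**nvis visible configurations as rows of V.
V = numpy.array(list(numpy.ndindex(*([2] * nvis))), dtype=float)

# -F(v); softplus(x) == numpy.logaddexp(0, x)
neg_F = V.dot(bv) + numpy.logaddexp(0.0, V.dot(W) + bh).sum(axis=1)

log_z = logsumexp(neg_F)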
            except Exception as e:
                if os.path.exists(filepath) and not os.path.isdir(filepath):
                    raise
                raise_cannot_open(filepath)
    except MemoryError as e:
        # We want to explicitly catch this exception because for MemoryError
        # __str__ returns the empty string, so some of our default printouts
        # below don't make a lot of sense.
        # Also, a lot of users assume any exception is a bug in the library,
        # so we can cut down on mail to pylearn-users by adding a message
        # that makes it clear this exception is caused by their machine not
        # meeting requirements.
        if os.path.splitext(filepath)[1] == ".pkl":
            improve_memory_error_message(e,
                "You do not have enough memory to open %s \n"
                " + Try using numpy.{save,load} (file with extension '.npy') "
                "to save your file. It uses less memory when reading and "
                "writing files than pickled files." % filepath)
        else:
            improve_memory_error_message(e,
                "You do not have enough memory to open %s" % filepath)
    except BadPickleGet as e:
        logger.exception('Failed to open {0} due to BadPickleGet '
                         'with exception string {1}'.format(filepath, e))
        if not retry:
            raise
        obj = exponential_backoff()
    except EOFError as e:
            except Exception as e:
                if os.path.exists(filepath) and not os.path.isdir(filepath):
                    raise
                raise_cannot_open(filepath)
    except MemoryError as e:
        # We want to explicitly catch this exception because for MemoryError
        # __str__ returns the empty string, so some of our default printouts
        # below don't make a lot of sense.
        # Also, a lot of users assume any exception is a bug in the library,
        # so we can cut down on mail to pylearn-users by adding a message
        # that makes it clear this exception is caused by their machine not
        # meeting requirements.
        if os.path.splitext(filepath)[1] == ".pkl":
            improve_memory_error_message(e,
                "You do not have enough memory to open %s \n"
                " + Try using numpy.{save,load} (file with extension '.npy') "
                "to save your file. It uses less memory when reading and "
                "writing files than pickled files." % filepath)
        else:
            improve_memory_error_message(e,
                "You do not have enough memory to open %s" % filepath)
    except BadPickleGet:
        if not retry:
            reraise_as(BadPickleGet('Failed to open {0}'.format(filepath)))
        obj = exponential_backoff()
    except EOFError:
        if not retry:
            reraise_as(EOFError("Failed to open {0}".format(filepath)))
        obj = exponential_backoff()
    except ValueError:
def compute_log_z(rbm, free_energy_fn, max_bits=15):
    """
    Compute the log partition function of a (binary-binary) RBM.

    Parameters
    ----------
    rbm : object
        An RBM object from `pylearn2.models`.
    free_energy_fn : callable
        A callable object (e.g. Theano function) that computes the free
        energy of a stack of configurations for this RBM.
    max_bits : int, optional
        The (base-2) log of the number of states to enumerate (and
        compute free energy for) at a time.

    Notes
    -----
    This function enumerates a sum with exponentially many terms, and
    should not be used with more than a small, toy model.
    """
    # Pick whether to iterate over visible or hidden states.
    if rbm.nvis < rbm.nhid:
        width = rbm.nvis
        type = 'vis'
    else:
        width = rbm.nhid
        type = 'hid'

    # Determine in how many steps to compute Z.
    block_bits = width if (not max_bits or width < max_bits) else max_bits
    block_size = 2 ** block_bits

    # Allocate storage for 2**block_bits of the 2**width possible
    # configurations.
    try:
        logz_data_c = numpy.zeros((block_size, width),
                                  order='C', dtype=config.floatX)
    except MemoryError:
        raise MemoryError("failed to allocate (%d, %d) matrix of "
                          "type %s in compute_log_z; try a smaller "
                          "value of max_bits" %
                          (block_size, width, str(config.floatX)))

    # fill in the first block_bits, which will remain fixed for all
    # 2**width configs
    tensor_10D_idx = numpy.ndindex(*([2] * block_bits))
    for i, j in enumerate(tensor_10D_idx):
        logz_data_c[i, -block_bits:] = j
    try:
        logz_data = numpy.array(logz_data_c, order='F', dtype=config.floatX)
    except MemoryError:
        raise MemoryError("failed to allocate (%d, %d) matrix of "
                          "type %s in compute_log_z; try a smaller "
                          "value of max_bits" %
                          (block_size, width, str(config.floatX)))

    # Allocate storage for (negative) free-energy of all 2**width
    # configurations.
    try:
        nFE = numpy.zeros(2 ** width, dtype=config.floatX)
    except MemoryError as e:
        improve_memory_error_message(e,
                                     "failed to allocate free energy "
                                     "storage array in compute_log_z; "
                                     "your model is too big to use with "
                                     "this function")

    # now loop 2**(width - block_bits) times, filling in the
    # most-significant bits
    for bi, up_bits in enumerate(
            numpy.ndindex(*([2] * (width - block_bits)))):
        logz_data[:, :width - block_bits] = up_bits
        nFE[bi * block_size:(bi + 1) * block_size] = \
            -free_energy_fn(logz_data)

    alpha = nFE.max()
    # Do the subtraction and exponentiation in-place so as to not incur a
    # copy.
    nFE -= alpha
    numpy.exp(nFE, nFE)
    log_z = numpy.log(nFE.sum()) + alpha
    return log_z