def fprop(self):
    if self.phase == 'train':
        # Calculate batch mean
        tmp = ca.mean(self.x.out, axis=0, keepdims=True)
        # Center input
        ca.subtract(self.x.out, tmp, self._tmp_batch_centered)
        # Update running mean
        tmp *= 1 - self.momentum
        self.running_mean *= self.momentum
        self.running_mean += tmp
        # Calculate batch variance
        ca.power(self._tmp_batch_centered, 2, self.out)
        ca.mean(self.out, axis=0, keepdims=True,
                out=self._tmp_batch_inv_std)
        # Calculate 1 / sqrt(E([x - E(x)]^2) + eps)
        self._tmp_batch_inv_std += self.eps
        ca.sqrt(self._tmp_batch_inv_std, self._tmp_batch_inv_std)
        ca.power(self._tmp_batch_inv_std, -1, self._tmp_batch_inv_std)
        # Normalize input
        ca.multiply(self._tmp_batch_centered, self._tmp_batch_inv_std,
                    self.out)
        # Update running std
        self.running_std *= self.momentum
        ca.multiply(self._tmp_batch_inv_std, 1 - self.momentum, tmp)
        self.running_std += tmp
    elif self.phase == 'test':
        ca.subtract(self.x.out, self.running_mean, self.out)
        self.out *= self.running_std
    else:
        raise ValueError('Invalid phase: %s' % self.phase)
    if self.affine:
        self.out *= self.gamma.array
        self.out += self.beta.array
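# The fprop above is the train-phase forward pass of batch normalization,
# written with in-place cudarray ops. As a rough reference, a minimal NumPy
# sketch of the same computation for the fully connected case (function and
# argument names are assumptions for illustration, not the layer's API; note
# that the running statistic tracks the *inverse* std, as in the layer):
import numpy as np

def batchnorm_train_sketch(x, running_mean, running_inv_std, momentum=0.9,
                           eps=1e-5):
    mean = np.mean(x, axis=0, keepdims=True)
    centered = x - mean
    inv_std = 1.0 / np.sqrt(np.mean(centered**2, axis=0, keepdims=True) + eps)
    running_mean = momentum*running_mean + (1 - momentum)*mean
    running_inv_std = momentum*running_inv_std + (1 - momentum)*inv_std
    return centered*inv_std, running_mean, running_inv_std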
def monitor(self):
    if not self._monitor:
        return
    val_mean_abs = np.array(ca.mean(ca.fabs(self._array)))
    grad_mean_abs = np.array(ca.mean(ca.fabs(self._tmp_grad_array)))
    step_mean_abs = np.array(ca.mean(ca.fabs(self._tmp_last_step)))
    logger.info("%s:\t%.1e [%.1e, %.1e]" % (self.name, val_mean_abs,
                                            grad_mean_abs, step_mean_abs))
def bprop(self):
    # dL/dx = inv_std * (dout - mean(dout) - x_hat * mean(x_hat * dout)),
    # scaled by gamma below if the layer is affine
    ca.multiply(self._tmp_batch_centered, self.out_grad, self.x.out_grad)
    tmp = ca.mean(self.x.out_grad, axis=0, keepdims=True)
    ca.multiply(self._tmp_batch_centered, tmp, self.x.out_grad)
    self.x.out_grad *= -1
    self.x.out_grad *= self._tmp_batch_inv_std
    self.x.out_grad *= self._tmp_batch_inv_std
    ca.mean(self.out_grad, axis=0, keepdims=True, out=tmp)
    self.x.out_grad += self.out_grad
    self.x.out_grad -= tmp
    self.x.out_grad *= self._tmp_batch_inv_std
    if self.affine:
        self.x.out_grad *= self.gamma.array
        # Normalized input
        self._tmp_batch_centered *= self._tmp_batch_inv_std
        self._tmp_batch_centered *= self.out_grad
        ca.sum(self._tmp_batch_centered, axis=0, keepdims=True,
               out=self.gamma.grad_array)
        ca.sum(self.out_grad, axis=0, keepdims=True,
               out=self.beta.grad_array)
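# For reference, the bprop above computes the standard batch-norm input
# gradient. A minimal NumPy sketch of the same formula, where x_hat is the
# normalized input and dout the incoming gradient (names and the separate
# gamma argument are illustrative assumptions, not the layer's attributes):
import numpy as np

def batchnorm_input_grad_sketch(x_hat, inv_std, dout, gamma=1.0):
    # dL/dx = gamma * inv_std * (dout - mean(dout) - x_hat * mean(x_hat*dout))
    dout_mean = np.mean(dout, axis=0, keepdims=True)
    proj = x_hat * np.mean(x_hat*dout, axis=0, keepdims=True)
    return gamma * inv_std * (dout - dout_mean - proj)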
def bprop(self):
    ca.multiply(self._tmp_batch_centered, self.out_grad, self.x.out_grad)
    tmp = ca.mean(ca.mean(self.x.out_grad, axis=0, keepdims=True),
                  axis=(2, 3), keepdims=True)
    ca.multiply(self._tmp_batch_centered, tmp, self.x.out_grad)
    self.x.out_grad *= -1
    self.x.out_grad *= self._tmp_batch_inv_std
    self.x.out_grad *= self._tmp_batch_inv_std
    tmp = ca.mean(ca.mean(self.out_grad, axis=0, keepdims=True),
                  axis=(2, 3), keepdims=True)
    self.x.out_grad += self.out_grad
    self.x.out_grad -= tmp
    self.x.out_grad *= self._tmp_batch_inv_std
    if self.affine:
        self.x.out_grad *= self.gamma.array
        # Normalized input
        self._tmp_batch_centered *= self._tmp_batch_inv_std
        self._tmp_batch_centered *= self.out_grad
        ca.sum(ca.sum(self._tmp_batch_centered, axis=(2, 3), keepdims=True),
               axis=0, keepdims=True, out=self.gamma.grad_array)
        ca.sum(ca.sum(self.out_grad, axis=(2, 3), keepdims=True),
               axis=0, keepdims=True, out=self.beta.grad_array)
def fprop(self):
    if self.phase == 'train':
        # Calculate batch mean
        tmp = ca.mean(self.x.out, axis=0, keepdims=True)
        # Center input
        ca.subtract(self.x.out, tmp, self._tmp_batch_centered)
        # Update running mean
        tmp *= 1 - self.momentum
        self.running_mean *= self.momentum
        self.running_mean += tmp
        # Calculate batch variance
        ca.power(self._tmp_batch_centered, 2, self.out)
        ca.mean(self.out, axis=0, keepdims=True,
                out=self._tmp_batch_inv_std)
        # Calculate 1 / sqrt(E([x - E(x)]^2) + eps)
        self._tmp_batch_inv_std += self.eps
        ca.sqrt(self._tmp_batch_inv_std, self._tmp_batch_inv_std)
        ca.power(self._tmp_batch_inv_std, -1, self._tmp_batch_inv_std)
        # Normalize input
        ca.multiply(self._tmp_batch_centered, self._tmp_batch_inv_std,
                    self.out)
        # Update running std
        self.running_std *= self.momentum
        ca.multiply(self._tmp_batch_inv_std, 1 - self.momentum, tmp)
        self.running_std += tmp
    elif self.phase == 'test':
        ca.subtract(self.x.out, self.running_mean, self.out)
        self.out *= self.running_std
    else:
        raise ValueError('Invalid phase: %s' % self.phase)
    if self.affine:
        self.out *= self.gamma.array
        self.out += self.beta.array
def bprop(self):
    ca.multiply(self._tmp_batch_centered, self.grad_array, self.x.grad_array)
    tmp = ca.mean(ca.mean(self.x.grad_array, axis=0, keepdims=True),
                  axis=(2, 3), keepdims=True)
    ca.multiply(self._tmp_batch_centered, tmp, self.x.grad_array)
    self.x.grad_array *= -1
    self.x.grad_array *= self._tmp_batch_inv_std
    self.x.grad_array *= self._tmp_batch_inv_std
    tmp = ca.mean(ca.mean(self.grad_array, axis=0, keepdims=True),
                  axis=(2, 3), keepdims=True)
    self.x.grad_array += self.grad_array
    self.x.grad_array -= tmp
    self.x.grad_array *= self._tmp_batch_inv_std
    if self.affine:
        self.x.grad_array *= self.gamma.array
        # Normalized input
        self._tmp_batch_centered *= self._tmp_batch_inv_std
        self._tmp_batch_centered *= self.grad_array
        ca.sum(ca.sum(self._tmp_batch_centered, axis=(2, 3), keepdims=True),
               axis=0, keepdims=True, out=self.gamma.grad_array)
        ca.sum(ca.sum(self.grad_array, axis=(2, 3), keepdims=True),
               axis=0, keepdims=True, out=self.beta.grad_array)
def monitor(self):
    if not self._monitor:
        return
    val_mean_abs = np.array(ca.mean(ca.fabs(self._array)))
    grad_mean_abs = np.array(ca.mean(ca.fabs(self._tmp_grad_array)))
    step_mean_abs = np.array(ca.mean(ca.fabs(self._tmp_step)))
    log.info('%s:\t%.1e [%.1e, %.1e]', self.name, val_mean_abs,
             grad_mean_abs, step_mean_abs)
def monitor(self):
    for param, step in zip(self.params, self.steps):
        if param.monitor:
            val_mean_abs = np.array(ca.mean(ca.fabs(param.values)))
            grad_mean_abs = np.array(ca.mean(ca.fabs(param.grad())))
            step_mean_abs = np.array(ca.mean(ca.fabs(step)))
            logger.info('%s:\t%.1e [%.1e, %.1e]' % (param.name, val_mean_abs,
                                                    grad_mean_abs,
                                                    step_mean_abs))
def monitor(self):
    for param, step in zip(self.params, self.steps):
        if param.monitor:
            val_mean_abs = np.array(ca.mean(ca.fabs(param.values)))
            grad_mean_abs = np.array(ca.mean(ca.fabs(param.grad())))
            step_mean_abs = np.array(ca.mean(ca.fabs(step)))
            logger.info(
                '%s:\t%.1e [%.1e, %.1e]' %
                (param.name, val_mean_abs, grad_mean_abs, step_mean_abs)
            )
def train_epoch(self):
    batch_losses = []
    for batch in self.feed.batches():
        loss = np.array(ca.mean(self.model.update(*batch)))
        for param, state in zip(self.params, self.learn_rule_states):
            self.learn_rule.step(param, state)
        batch_losses.append(loss)
    epoch_loss = np.mean(batch_losses)
    return epoch_loss
def fprop(self):
    if self.phase == 'train':
        # Calculate batch mean
        tmp = ca.mean(ca.mean(self.x.array, axis=0, keepdims=True),
                      axis=(2, 3), keepdims=True)
        # Center input
        ca.subtract(self.x.array, tmp, self._tmp_batch_centered)
        # Update running mean
        tmp *= 1 - self.momentum
        self.running_mean *= self.momentum
        self.running_mean += tmp
        # Calculate batch variance
        ca.power(self._tmp_batch_centered, 2, self.array)
        ca.mean(ca.mean(self.array, axis=0, keepdims=True), axis=(2, 3),
                keepdims=True, out=self._tmp_batch_inv_std)
        # Calculate 1 / sqrt(E([x - E(x)]^2) + eps)
        self._tmp_batch_inv_std += self.eps
        ca.sqrt(self._tmp_batch_inv_std, self._tmp_batch_inv_std)
        ca.power(self._tmp_batch_inv_std, -1, self._tmp_batch_inv_std)
        # Normalize input
        ca.multiply(self._tmp_batch_centered, self._tmp_batch_inv_std,
                    self.array)
        # Update running std
        self.running_std *= self.momentum
        ca.multiply(self._tmp_batch_inv_std, 1 - self.momentum, tmp)
        self.running_std += tmp
        if self.noise_std > 0.0:
            noise = ca.random.normal(scale=self.noise_std, size=self.shape)
            ca.add(self.array, noise, self.array)
    elif self.phase == 'test':
        ca.subtract(self.x.array, self.running_mean, self.array)
        self.array *= self.running_std
    else:
        raise ValueError('Invalid phase: %s' % self.phase)
    if self.affine:
        self.array *= self.gamma.array
        self.array += self.beta.array
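# In the spatial variant above, per-channel statistics are taken by nesting
# two ca.mean calls. For NCHW data this is equivalent to a single mean over
# axes (0, 2, 3); a quick NumPy check (the array shape is an illustrative
# assumption):
import numpy as np

x = np.random.normal(size=(8, 3, 5, 5))  # (batch, channels, height, width)
nested = np.mean(np.mean(x, axis=0, keepdims=True), axis=(2, 3),
                 keepdims=True)
direct = np.mean(x, axis=(0, 2, 3), keepdims=True)
assert np.allclose(nested, direct)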
def bprop(self):
    ca.multiply(self._tmp_batch_centered, self.grad_array, self.x.grad_array)
    tmp = ca.mean(self.x.grad_array, axis=0, keepdims=True)
    ca.multiply(self._tmp_batch_centered, tmp, self.x.grad_array)
    self.x.grad_array *= -1
    self.x.grad_array *= self._tmp_batch_inv_std
    self.x.grad_array *= self._tmp_batch_inv_std
    ca.mean(self.grad_array, axis=0, keepdims=True, out=tmp)
    self.x.grad_array += self.grad_array
    self.x.grad_array -= tmp
    self.x.grad_array *= self._tmp_batch_inv_std
    if self.affine:
        self.x.grad_array *= self.gamma.array
        # Normalized input
        self._tmp_batch_centered *= self._tmp_batch_inv_std
        self._tmp_batch_centered *= self.grad_array
        ca.sum(self._tmp_batch_centered, axis=0, keepdims=True,
               out=self.gamma.grad_array)
        ca.sum(self.grad_array, axis=0, keepdims=True,
               out=self.beta.grad_array)
def test_reduce():
    a_np = np.random.normal(size=(1024,))
    a_ca = ca.array(a_np)
    c_np = np.sum(a_np)
    c_ca = ca.sum(a_ca)
    print(np.allclose(c_np, np.array(c_ca)))
    c_np = np.mean(a_np)
    c_ca = ca.mean(a_ca)
    print(np.allclose(c_np, np.array(c_ca)))

    a_np = np.random.normal(size=(5, 5))
    a_ca = ca.array(a_np)
    c_np = np.sum(a_np)
    c_ca = ca.sum(a_ca)
    print(np.allclose(c_np, np.array(c_ca)))
    c_np = np.sum(a_np, axis=0)
    c_ca = ca.sum(a_ca, axis=0)
    print(np.allclose(c_np, np.array(c_ca)))
    c_np = np.sum(a_np, axis=1)
    c_ca = ca.sum(a_ca, axis=1)
    print(np.allclose(c_np, np.array(c_ca)))

    a_np = np.random.normal(size=(5, 7, 11))
    a_ca = ca.array(a_np)
    c_np = np.sum(a_np, axis=0)
    c_ca = ca.sum(a_ca, axis=0)
    print(np.allclose(c_np, np.array(c_ca)))
    c_np = np.sum(a_np, axis=2)
    c_ca = ca.sum(a_ca, axis=2)
    print(np.allclose(c_np, np.array(c_ca)))
    c_np = np.sum(a_np, axis=(0, 1))
    c_ca = ca.sum(a_ca, axis=(0, 1))
    print(np.allclose(c_np, np.array(c_ca)))
    c_np = np.sum(a_np, axis=(1, 2))
    c_ca = ca.sum(a_ca, axis=(1, 2))
    print(np.allclose(c_np, np.array(c_ca)))
    c_np = np.argmin(a_np, axis=0)
    c_ca = ca.argmin(a_ca, axis=0)
    print(np.allclose(c_np, np.array(c_ca)))
    c_np = np.argmin(a_np, axis=2)
    c_ca = ca.argmin(a_ca, axis=2)
    print(np.allclose(c_np, np.array(c_ca)))
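# The reduction test above only prints the result of each np.allclose call.
# A hedged sketch of the same checks in assert form so a test runner fails
# loudly; check_reduce is a hypothetical helper, not part of the test suite
# (assumes the module's usual imports, i.e. numpy as np and cudarray as ca):
def check_reduce(np_fun, ca_fun, a_np, **kwargs):
    a_ca = ca.array(a_np)
    c_np = np_fun(a_np, **kwargs)
    c_ca = ca_fun(a_ca, **kwargs)
    assert np.allclose(c_np, np.array(c_ca))

a_np = np.random.normal(size=(5, 7, 11))
check_reduce(np.sum, ca.sum, a_np, axis=(1, 2))
check_reduce(np.mean, ca.mean, a_np)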
def test_reduce():
    a_np = np.random.normal(size=(1024,))
    a_ca = ca.array(a_np)
    c_np = np.sum(a_np)
    c_ca = ca.sum(a_ca)
    print(np.allclose(c_np, np.array(c_ca)))
    c_np = np.mean(a_np)
    c_ca = ca.mean(a_ca)
    print(np.allclose(c_np, np.array(c_ca)))

    a_np = np.random.normal(size=(5, 5))
    a_ca = ca.array(a_np)
    c_np = np.sum(a_np)
    c_ca = ca.sum(a_ca)
    print(np.allclose(c_np, np.array(c_ca)))
    c_np = np.sum(a_np, axis=0)
    c_ca = ca.sum(a_ca, axis=0)
    print(np.allclose(c_np, np.array(c_ca)))
    c_np = np.sum(a_np, axis=1)
    c_ca = ca.sum(a_ca, axis=1)
    print(np.allclose(c_np, np.array(c_ca)))

    a_np = np.random.normal(size=(5, 7, 11))
    a_ca = ca.array(a_np)
    c_np = np.sum(a_np, axis=0)
    c_ca = ca.sum(a_ca, axis=0)
    print(np.allclose(c_np, np.array(c_ca)))
    c_np = np.sum(a_np, axis=2)
    c_ca = ca.sum(a_ca, axis=2)
    print(np.allclose(c_np, np.array(c_ca)))
    c_np = np.sum(a_np, axis=(0, 1))
    c_ca = ca.sum(a_ca, axis=(0, 1))
    print(np.allclose(c_np, np.array(c_ca)))
    c_np = np.sum(a_np, axis=(1, 2))
    c_ca = ca.sum(a_ca, axis=(1, 2))
    print(np.allclose(c_np, np.array(c_ca)))
    c_np = np.argmin(a_np, axis=0)
    c_ca = ca.argmin(a_ca, axis=0)
    print(np.allclose(c_np, np.array(c_ca)))
    c_np = np.argmin(a_np, axis=2)
    c_ca = ca.argmin(a_ca, axis=2)
    print(np.allclose(c_np, np.array(c_ca)))
def train(self, model, input, error_fun=None):
    input = Input.from_any(input)
    model.setup(**input.shapes)
    params = [p for p in model.params
              if not isinstance(p, SharedParameter)]
    self.learn_rule.learn_rate /= input.batch_size
    learn_rule_states = [self.learn_rule.init_state(p) for p in params]
    n_params = np.sum([p.array.size for p in params])
    log.info('SGD: Model contains %i parameters.', n_params)
    log.info('SGD: %d gradient updates per epoch.', input.epoch_size)
    epoch = 0
    converged = False
    patience = self.min_epochs
    best_score = np.inf
    start_time = time.clock()
    while epoch < self.max_epochs and not converged:
        epoch += 1
        batch_losses = []
        for batch in input.batches():
            loss = np.array(ca.mean(model.update(**batch)))
            batch_losses.append(loss)
            # Update gradient
            for param, state in zip(params, learn_rule_states):
                self.learn_rule.step(param, state)
        epoch_loss = np.mean(batch_losses)
        if error_fun is not None:
            error = error_fun()
            if error < best_score:
                improvement = error / best_score
                if improvement < self.improvement_thresh:
                    # increase patience on significant improvement
                    patience = max(patience, epoch*self.patience_incr)
                best_score = error
            log.info('epoch %d/%d, loss %f, error %.4f', epoch, patience,
                     epoch_loss, error)
            for param in params:
                param.monitor()
            if patience <= epoch:
                log.info('SGD: Converged on validation set.')
                converged = True
        else:
            if epoch_loss < best_score:
                improvement = epoch_loss / best_score
                if improvement < self.improvement_thresh:
                    # increase patience on significant improvement
                    patience = max(patience, epoch*self.patience_incr)
                best_score = epoch_loss
            log.info('epoch %d/%d, loss %f', epoch, patience, epoch_loss)
            for param in params:
                param.monitor()
            if patience <= epoch:
                log.info('SGD: Converged on training set.')
                converged = True
    end_time = time.clock()
    if not converged:
        log.info('SGD: Stopped by max_epochs.')
    duration = float(end_time - start_time)
    log.info('SGD: Optimization ran for %.2f minutes (%d epochs, '
             '%.1f s/epoch)', duration/60, epoch, duration/epoch)
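# The training loop above stops once the patience counter is exhausted.
# A small standalone sketch of the same patience rule (the default values
# for improvement_thresh and patience_incr are illustrative assumptions):
def update_patience(patience, epoch, score, best_score,
                    improvement_thresh=0.995, patience_incr=1.5):
    if score < best_score:
        if score / best_score < improvement_thresh:
            # significant improvement: train at least epoch*patience_incr
            patience = max(patience, epoch*patience_incr)
        best_score = score
    return patience, best_score

# E.g. at epoch 10 with patience 15, a validation error dropping from 0.50
# to 0.45 (ratio 0.9 < 0.995) leaves patience at max(15, 10*1.5) = 15.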
def fprop(self):
    ca.mean(self.x.out, axis=self.axis, out=self.out,
            keepdims=self.keepdims)
def train(self, model, input, valid_error_fun=None):
    input = Input.from_any(input)
    model._setup(input)
    params = model._params
    self.learn_rule._setup(params, input.batch_size)
    n_params = np.sum([p.array.size for p in params])
    logger.info("SGD: Model contains %i parameters." % n_params)
    logger.info("SGD: %d mini-batch gradient updates per epoch."
                % input.n_batches)
    epoch = 0
    converged = False
    patience = self.min_epochs
    best_score = np.inf
    start_time = time.clock()
    while epoch < self.max_epochs and not converged:
        epoch += 1
        batch_costs = []
        for batch in input.batches("train"):
            cost = np.array(ca.mean(model._update(batch)))
            batch_costs.append(cost)
            # Update gradient
            self.learn_rule.step()
        epoch_cost = np.mean(batch_costs)
        if valid_error_fun is not None:
            val_error = valid_error_fun()
            if val_error < best_score:
                improvement = val_error / best_score
                if improvement < self.improvement_thresh:
                    # increase patience on significant improvement
                    patience = max(patience, epoch * self.patience_incr)
                best_score = val_error
            logger.info("epoch %d/%d" % (epoch, patience)
                        + ", cost %f" % epoch_cost
                        + ", val_error %.4f" % val_error)
            for p in params:
                p.monitor()
            if patience <= epoch:
                logger.info("SGD: Converged on validation set.")
                converged = True
        else:
            if epoch_cost < best_score:
                improvement = epoch_cost / best_score
                if improvement < self.improvement_thresh:
                    # increase patience on significant improvement
                    patience = max(patience, epoch * self.patience_incr)
                best_score = epoch_cost
            logger.info("epoch %d/%d" % (epoch, patience)
                        + ", cost %f" % epoch_cost)
            for p in params:
                p.monitor()
            if patience <= epoch:
                logger.info("SGD: Converged on training set.")
                converged = True
    end_time = time.clock()
    if not converged:
        logger.info("SGD: Stopped by max_epochs.")
    duration = float(end_time - start_time)
    logger.info("SGD: Optimization ran for %.2f minutes " % (duration / 60)
                + "(%d epochs, %.1f s/epoch)" % (epoch, duration / epoch))
def loss(self, y, y_pred):
    y_pred = ca.maximum(y_pred, _FLT_MIN)
    return -ca.mean(y*ca.log(y_pred) + (1 - y)*ca.log(1 - y_pred), axis=1)
def loss(self, pred, target):
    return ca.mean((target - pred) ** 2, axis=1)
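# The loss above is the per-sample mean squared error over the feature
# axis. A quick NumPy equivalent for reference (the example arrays are
# illustrative assumptions):
import numpy as np

pred = np.array([[0.1, 0.9], [0.4, 0.6]])
target = np.array([[0.0, 1.0], [0.5, 0.5]])
per_sample_mse = np.mean((target - pred)**2, axis=1)  # shape (2,)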
def loss(self, y, y_pred):
    return ca.mean((y - y_pred)**2, axis=1)
def loss(self, y, y_pred):
    return ca.mean((y - y_pred)**2, axis=1)
def loss(self, y, y_pred):
    return ca.mean(-ca.sum(y*ca.log(y_pred + self.eps)
                           + (1 - y)*ca.log(1 - y_pred + self.eps), axis=1))
def loss(self, pred, target):
    return ca.mean((target - pred)**2, axis=1)
def train(self, model, input, valid_error_fun=None):
    input = to_input(input)
    model._setup(input)
    params = model._params()
    self.learn_rule._setup(params, input.batch_size)
    n_params = np.sum([p.array.size for p in params])
    logger.info('SGD: Model contains %i parameters.' % n_params)
    logger.info('SGD: %d mini-batch gradient updates per epoch.'
                % input.n_batches)
    epoch = 0
    converged = False
    patience = self.min_epochs
    best_score = np.inf
    start_time = time.clock()
    while epoch < self.max_epochs and not converged:
        epoch += 1
        batch_costs = []
        for batch in input.supervised_batches():
            cost = np.array(ca.mean(model._update(batch)))
            batch_costs.append(cost)
            # Update gradient
            self.learn_rule.step()
        epoch_cost = np.mean(batch_costs)
        if valid_error_fun is not None:
            val_error = valid_error_fun()
            if val_error < best_score:
                improvement = val_error / best_score
                if improvement < self.improvement_thresh:
                    # increase patience on significant improvement
                    patience = max(patience, epoch*self.patience_incr)
                best_score = val_error
            logger.info('epoch %d/%d' % (epoch, patience)
                        + ', cost %f' % epoch_cost
                        + ', val_error %.4f' % val_error)
            self.learn_rule.monitor()
            if patience <= epoch:
                logger.info('SGD: Converged on validation set.')
                converged = True
        else:
            if epoch_cost < best_score:
                improvement = epoch_cost / best_score
                if improvement < self.improvement_thresh:
                    # increase patience on significant improvement
                    patience = max(patience, epoch*self.patience_incr)
                best_score = epoch_cost
            logger.info('epoch %d/%d' % (epoch, patience)
                        + ', cost %f' % epoch_cost)
            self.learn_rule.monitor()
            if patience <= epoch:
                logger.info('SGD: Converged on training set.')
                converged = True
    end_time = time.clock()
    if not converged:
        logger.info('SGD: Stopped by max_epochs.')
    duration = float(end_time - start_time)
    logger.info('SGD: Optimization ran for %.2f minutes ' % (duration/60)
                + '(%d epochs, %.1f s/epoch)' % (epoch, duration/epoch))
def loss(self, pred, target):
    pred = ca.maximum(pred, _FLT_MIN)
    return -ca.mean(target*ca.log(pred) + (1 - target)*ca.log(1 - pred),
                    axis=1)
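# The binary cross-entropy above clamps predictions at _FLT_MIN before
# taking the log, which protects log(pred) but, like the original, not
# log(1 - pred) when pred == 1. A NumPy sketch of the same loss (eps stands
# in for _FLT_MIN and is an assumption for illustration):
import numpy as np

def bce_sketch(pred, target, eps=1e-37):
    pred = np.maximum(pred, eps)
    return -np.mean(target*np.log(pred) + (1 - target)*np.log(1 - pred),
                    axis=1)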
def fprop(self):
    ca.mean(self.x.array, axis=self.axis, out=self.array,
            keepdims=self.keepdims)
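# The Mean op's fprop is a single reduction; keepdims decides whether the
# reduced axes survive as size-1 dimensions. A NumPy reminder of the
# difference (the shape is an illustrative assumption):
import numpy as np

x = np.ones((4, 3, 2))
print(np.mean(x, axis=1).shape)                 # (4, 2)
print(np.mean(x, axis=1, keepdims=True).shape)  # (4, 1, 2)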