def compute_step(self, parameter, previous_step):
    """Rescale `previous_step` by a ratio of two running RMS estimates.

    Two buffers are created for `parameter` and tagged with the
    ``ALGORITHM_BUFFER`` role: a running average of squared incoming
    steps and a running average of squared outgoing steps.

    Parameters
    ----------
    parameter
        Variable handed to ``shared_floatx_zeros_matching`` to create
        the buffers (presumably they match its shape -- confirm against
        that helper).
    previous_step
        Incoming step expression to be rescaled.

    Returns
    -------
    step
        ``previous_step`` multiplied by the RMS of past outgoing steps
        and divided by the RMS of incoming steps; ``self.epsilon`` is
        added under both square roots.
    updates : list of tuple
        ``(buffer, new_value)`` pairs for the two running averages.

    """
    avg_sq_in_prev = shared_floatx_zeros_matching(
        parameter, "mean_square_step_tm1")
    add_role(avg_sq_in_prev, ALGORITHM_BUFFER)
    avg_sq_out_prev = shared_floatx_zeros_matching(
        parameter, "mean_square_delta_x_tm1")
    add_role(avg_sq_out_prev, ALGORITHM_BUFFER)

    # Exponential moving average of the squared incoming step.
    decayed_in = self.decay_rate * avg_sq_in_prev
    fresh_in = (1 - self.decay_rate) * tensor.sqr(previous_step)
    avg_sq_in = decayed_in + fresh_in

    # The numerator uses the *previous* outgoing-step average, the
    # denominator the freshly updated incoming-step average.
    rms_out_prev = tensor.sqrt(avg_sq_out_prev + self.epsilon)
    rms_in = tensor.sqrt(avg_sq_in + self.epsilon)
    scale = rms_out_prev / rms_in
    step = scale * previous_step

    # Exponential moving average of the squared outgoing step.
    decayed_out = self.decay_rate * avg_sq_out_prev
    fresh_out = (1 - self.decay_rate) * tensor.sqr(step)
    avg_sq_out = decayed_out + fresh_out

    return step, [(avg_sq_in_prev, avg_sq_in),
                  (avg_sq_out_prev, avg_sq_out)]
def compute_step(self, parameter, previous_step):
    """Scale the incoming step by RMS(previous outputs) / RMS(inputs).

    Parameters
    ----------
    parameter
        Variable the two state buffers are created for via
        ``shared_floatx_zeros_matching``.
    previous_step
        Step expression produced upstream.

    Returns
    -------
    step
        The rescaled step expression.
    updates : list of tuple
        Updates for the ``mean_square_step_tm1`` and
        ``mean_square_delta_x_tm1`` buffers, in that order.

    """
    # Create and tag each buffer in turn (creation immediately followed
    # by the role assignment, one buffer at a time).
    buffers = []
    for buffer_name in ("mean_square_step_tm1", "mean_square_delta_x_tm1"):
        buf = shared_floatx_zeros_matching(parameter, buffer_name)
        add_role(buf, ALGORITHM_BUFFER)
        buffers.append(buf)
    sq_step_prev, sq_delta_prev = buffers

    sq_step_now = (self.decay_rate * sq_step_prev +
                   (1 - self.decay_rate) * tensor.sqr(previous_step))

    # epsilon sits inside both square roots.
    numerator = tensor.sqrt(sq_delta_prev + self.epsilon)
    denominator = tensor.sqrt(sq_step_now + self.epsilon)
    delta = numerator / denominator * previous_step

    sq_delta_now = (self.decay_rate * sq_delta_prev +
                    (1 - self.decay_rate) * tensor.sqr(delta))

    updates = [(sq_step_prev, sq_step_now), (sq_delta_prev, sq_delta_now)]
    return delta, updates
def compute_step(self, parameter, previous_step):
    """Divide `previous_step` by a running RMS of past steps.

    Parameters
    ----------
    parameter
        Variable the running-average buffer is created for.
    previous_step
        Incoming step expression.

    Returns
    -------
    step
        ``previous_step`` divided by the square root of the updated
        running mean of squared steps, where the divisor is never
        smaller than ``self.epsilon``.
    updates : list of tuple
        A single ``(buffer, new_value)`` pair for the running mean.

    """
    running_sq_prev = shared_floatx_zeros_matching(
        parameter, "mean_square_step_tm1")
    add_role(running_sq_prev, ALGORITHM_BUFFER)

    decayed_history = self.decay_rate * running_sq_prev
    new_contribution = (1 - self.decay_rate) * tensor.sqr(previous_step)
    running_sq = decayed_history + new_contribution
    # NOTE(review): this tags the symbolic update expression (not a
    # buffer created above) with the buffer role -- confirm it is
    # intentional.
    add_role(running_sq, ALGORITHM_BUFFER)

    # Floor the divisor at epsilon instead of adding epsilon to it.
    divisor = tensor.maximum(tensor.sqrt(running_sq), self.epsilon)
    return previous_step / divisor, [(running_sq_prev, running_sq)]
def compute_step(self, parameter, previous_step):
    """Build a step from running first/second moment estimates.

    Three buffers are created and tagged with ``ALGORITHM_BUFFER``:
    ``mean`` and ``variance`` (created from `parameter`) and a scalar
    ``time`` counter initialised with ``0.0``.

    Parameters
    ----------
    parameter
        Variable the ``mean`` and ``variance`` buffers are created for.
    previous_step
        Incoming step expression.

    Returns
    -------
    step
        ``rate * mean_t / (sqrt(variance_t) + self.epsilon)`` where
        ``rate`` is ``self.learning_rate`` rescaled using the counter.
    updates : list of tuple
        Updates for ``mean``, ``variance`` and ``time``, in that order.

    Notes
    -----
    ``self.beta1`` and ``self.beta2`` weight the *new* observation
    here; the old buffer value is weighted by one minus the coefficient.

    """
    first_moment = shared_floatx_zeros_matching(parameter, "mean")
    add_role(first_moment, ALGORITHM_BUFFER)
    second_moment = shared_floatx_zeros_matching(parameter, "variance")
    add_role(second_moment, ALGORITHM_BUFFER)
    counter = shared_floatx(0.0, "time")
    add_role(counter, ALGORITHM_BUFFER)

    next_count = counter + 1

    # Learning rate rescaled by terms that depend on the step count.
    rate_numerator = self.learning_rate * tensor.sqrt(
        1.0 - (1.0 - self.beta2) ** next_count)
    rate_denominator = 1.0 - (1.0 - self.beta1) ** next_count
    effective_rate = rate_numerator / rate_denominator

    # Weight of the new observation in the first-moment average; it is
    # modulated by decay_factor raised to the previous count.
    mix = 1 - (1 - self.beta1) * self.decay_factor ** (next_count - 1)

    first_moment_new = mix * previous_step + (1.0 - mix) * first_moment
    second_moment_new = (self.beta2 * tensor.sqr(previous_step) +
                         (1.0 - self.beta2) * second_moment)

    scaled_mean = effective_rate * first_moment_new
    spread = tensor.sqrt(second_moment_new) + self.epsilon
    step = scaled_mean / spread

    updates = [
        (first_moment, first_moment_new),
        (second_moment, second_moment_new),
        (counter, next_count),
    ]
    return step, updates
def compute_step(self, parameter, previous_step):
    """Compute a moment-normalised step and the buffer updates.

    Parameters
    ----------
    parameter
        Variable used to create the ``mean`` and ``variance`` buffers.
    previous_step
        Incoming step expression.

    Returns
    -------
    step
        Rescaled learning rate times the new ``mean`` estimate, divided
        by ``sqrt(new variance estimate) + self.epsilon``.
    updates : list of tuple
        ``(buffer, new_value)`` pairs for ``mean``, ``variance`` and
        the ``time`` counter.

    """
    def _as_buffer(variable):
        # Tag a freshly created variable with the buffer role.
        add_role(variable, ALGORITHM_BUFFER)
        return variable

    mean_prev = _as_buffer(shared_floatx_zeros_matching(parameter, 'mean'))
    var_prev = _as_buffer(
        shared_floatx_zeros_matching(parameter, 'variance'))
    tick_prev = _as_buffer(shared_floatx(0., 'time'))

    tick = tick_prev + 1

    # (learning_rate * sqrt(...)) / (...): multiplication binds first.
    rate = (self.learning_rate *
            tensor.sqrt(1. - (1. - self.beta2) ** tick) /
            (1. - (1. - self.beta1) ** tick))

    # beta1/beta2 are the weights of the new observation.
    weight = 1 - (1 - self.beta1) * self.decay_factor ** (tick - 1)
    mean_now = weight * previous_step + (1. - weight) * mean_prev
    var_now = (self.beta2 * tensor.sqr(previous_step) +
               (1. - self.beta2) * var_prev)

    step = rate * mean_now / (tensor.sqrt(var_now) + self.epsilon)
    return step, [(mean_prev, mean_now),
                  (var_prev, var_now),
                  (tick_prev, tick)]
def compute_step(self, parameter, previous_step):
    """Scale `previous_step` by the root of accumulated squared steps.

    Parameters
    ----------
    parameter
        Variable the accumulator is created for. When it has a truthy
        ``name``, that name is appended to the accumulator's name.
    previous_step
        Incoming step expression.

    Returns
    -------
    step
        ``self.learning_rate * previous_step`` divided by
        ``sqrt(accumulator) + self.epsilon``, using the accumulator
        value that already includes the current squared step.
    updates : list of tuple
        A single ``(accumulator, new_value)`` pair.

    """
    suffix = "_" + parameter.name if parameter.name else ""
    buffer_name = "adagrad_sqs" + suffix

    accumulator = shared_floatx_zeros_matching(parameter, name=buffer_name)
    add_role(accumulator, ALGORITHM_BUFFER)

    # The sum is never decayed: every squared step is simply added.
    accumulated = tensor.sqr(previous_step) + accumulator

    scaled = self.learning_rate * previous_step
    normaliser = tensor.sqrt(accumulated) + self.epsilon
    return scaled / normaliser, [(accumulator, accumulated)]
def compute_step(self, parameter, previous_step):
    """Normalise the incoming step by a running root-mean-square.

    Parameters
    ----------
    parameter
        Variable the ``mean_square_step_tm1`` buffer is created for.
    previous_step
        Incoming step expression.

    Returns
    -------
    step
        ``previous_step / max(sqrt(mean_square), self.epsilon)``.
    updates : list of tuple
        One pair updating the buffer to the new running mean square.

    """
    history = shared_floatx_zeros_matching(
        parameter, "mean_square_step_tm1")
    add_role(history, ALGORITHM_BUFFER)

    # Exponential moving average of the squared step.
    blended = (self.decay_rate * history +
               (1 - self.decay_rate) * tensor.sqr(previous_step))
    # NOTE(review): the role is attached to the update expression as
    # well as to the buffer itself -- verify this is wanted.
    add_role(blended, ALGORITHM_BUFFER)

    root = tensor.sqrt(blended)
    # epsilon acts as a lower bound on the divisor.
    bounded_root = tensor.maximum(root, self.epsilon)
    normalised = previous_step / bounded_root

    return normalised, [(history, blended)]
def compute_step(self, parameter, previous_step):
    """Divide the scaled step by the root of summed squared steps.

    Parameters
    ----------
    parameter
        Variable the sum-of-squares buffer is created for; its ``name``
        (if truthy) becomes a suffix of the buffer name.
    previous_step
        Incoming step expression.

    Returns
    -------
    step
        ``self.learning_rate * previous_step /
        (sqrt(sum_of_squares) + self.epsilon)``.
    updates : list of tuple
        One pair storing the new sum of squares in the buffer.

    """
    label = parameter.name
    # Base name alone when the parameter is unnamed.
    name = 'adagrad_sqs_' + label if label else 'adagrad_sqs'

    sum_sq_prev = shared_floatx_zeros_matching(parameter, name=name)
    add_role(sum_sq_prev, ALGORITHM_BUFFER)

    sum_sq = tensor.sqr(previous_step) + sum_sq_prev
    step = (self.learning_rate * previous_step /
            (tensor.sqrt(sum_sq) + self.epsilon))

    return step, [(sum_sq_prev, sum_sq)]
def _create_algorithm_buffer_for(param, *args, **kwargs):
    """Create a buffer for `param` and mark it as an algorithm buffer.

    Parameters
    ----------
    param
        Variable the buffer is created for; it is also stored on the
        buffer as ``tag.for_parameter``.
    \*args, \*\*kwargs
        Forwarded unchanged to ``shared_floatx_zeros_matching``.

    Returns
    -------
    The new buffer, carrying the ``ALGORITHM_BUFFER`` role.

    """
    new_buffer = shared_floatx_zeros_matching(param, *args, **kwargs)
    # Remember which parameter this buffer belongs to.
    new_buffer.tag.for_parameter = param
    add_role(new_buffer, ALGORITHM_BUFFER)
    return new_buffer
def compute_step(self, parameter, previous_step):
    """Add a momentum-weighted velocity term to `previous_step`.

    Parameters
    ----------
    parameter
        Variable the ``velocity`` buffer is created for.
    previous_step
        Incoming step expression.

    Returns
    -------
    step
        ``self.momentum * velocity + previous_step``.
    updates : list of tuple
        One pair that stores the returned step as the new velocity.

    """
    velocity_prev = shared_floatx_zeros_matching(parameter, "velocity")
    add_role(velocity_prev, ALGORITHM_BUFFER)

    carried = self.momentum * velocity_prev
    velocity_new = carried + previous_step

    # The emitted step and the stored velocity are the same expression.
    return velocity_new, [(velocity_prev, velocity_new)]
def _create_algorithm_buffer_for(param, *args, **kwargs):
    """Return a new buffer tied to `param` with the buffer role set.

    All extra positional and keyword arguments are passed straight
    through to ``shared_floatx_zeros_matching``. The originating
    `param` is recorded on the result as ``tag.for_parameter`` before
    the ``ALGORITHM_BUFFER`` role is added.

    """
    state = shared_floatx_zeros_matching(param, *args, **kwargs)

    # Back-reference from the buffer to the parameter it was made for.
    state.tag.for_parameter = param

    add_role(state, ALGORITHM_BUFFER)
    return state
def compute_step(self, parameter, previous_step):
    """Combine the stored velocity with the incoming step.

    Parameters
    ----------
    parameter
        Variable passed to ``shared_floatx_zeros_matching`` to create
        the ``velocity`` buffer.
    previous_step
        Incoming step expression.

    Returns
    -------
    step
        The stored velocity scaled by ``self.momentum`` plus
        `previous_step`.
    updates : list of tuple
        ``[(velocity, step)]`` -- the velocity buffer is overwritten
        with the step that is returned.

    """
    momentum_buffer = shared_floatx_zeros_matching(parameter, "velocity")
    add_role(momentum_buffer, ALGORITHM_BUFFER)

    combined = self.momentum * momentum_buffer + previous_step
    buffer_updates = [(momentum_buffer, combined)]
    return combined, buffer_updates