class GradientDescentBase(AcceleratedUnit):
    """Base class for gradient descent units.

    Attributes:
        input: input layer values.
        output: output layer values.
        err_output: error to backpropagate.
        err_input: backpropagated error.
        weights: weights.
        bias: bias.
        batch_size: effective minibatch size (if None, it is taken from
            err_output).
        learning_rate: gradient descent speed (positive).
        learning_rate_bias: learning rate for the bias (defaults to
            learning_rate).
        weights_decay: regularization coefficient for weights
            (see l1_vs_l2).
        weights_decay_bias: regularization coefficient for the bias.
        gradient_moment: momentum coefficient for weights.
        gradient_moment_bias: momentum coefficient for the bias.
        gradient_weights_with_moment: gradient with accumulated momentum.
        gradient_bias_with_moment: bias gradient with accumulated momentum.
        weights_transposed: treat the weights matrix as transposed.
        apply_gradient: if True, apply the gradient to the weights
            immediately after it is computed.
        gradient_changed: when True, the slave will send gradients to the
            master (set to True just before the run call, so it can be set
            to False inside ocl_run or numpy_run if necessary).
        ocl_set_const_args: True when constant kernel arguments have been
            changed and need to be set again.
    """

    hide_from_registry = True
    MAPPING = set()
    REDUCE_SIZE = 64  # used for updating bias

    def __init__(self, workflow, **kwargs):
        kwargs["view_group"] = kwargs.get("view_group", "TRAINER")
        super(GradientDescentBase, self).__init__(workflow, **kwargs)
        self.err_input = Array(shallow_pickle=True)
        self.ocl_set_const_args = True
        self.weights = None
        self.bias = None
        self.demand("input", "err_output")
        self.learning_rate = kwargs.get("learning_rate", 0.01)
        self.learning_rate_bias = kwargs.get("learning_rate_bias",
                                             self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", 0.00005)
        self.weights_decay_bias = kwargs.get("weights_decay_bias", 0.0)
        self.l1_vs_l2 = kwargs.get("l1_vs_l2", 0)
        self.l1_vs_l2_bias = kwargs.get("l1_vs_l2_bias", self.l1_vs_l2)
        self.gradient_moment = kwargs.get("gradient_moment", 0)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias",
                                               self.gradient_moment)
        self.weights_transposed = kwargs.get("weights_transposed", False)
        self.need_err_input = kwargs.get("need_err_input", True)
        self.include_bias = kwargs.get("include_bias", True)
        self.factor_ortho = kwargs.get("factor_ortho", 0)
        self.col_sums = Array()  # for orthogonalization

        # Current gradient as it is, without learning_rate etc. applied.
        self.gradient_weights = Array()
        self.gradient_bias = Array()

        # Gradient with learning_rate etc. applied,
        # optionally accumulated from the previous run.
        self.accumulate_gradient = kwargs.get("accumulate_gradient", False)
        # When accumulate_gradient is set to True (see the
        # _demo_gradient_update sketch after this class):
        # 1. Calculate gd
        # 2. acc = acc_alpha * gd + acc_beta * acc
        # 3. gd = gd_alpha * acc + gd_beta * gd
        # 4. Apply moments to gd
        # 5. weights += gd if apply_gradient is set to True
        self.acc_alpha = kwargs.get("acc_alpha", 0.0)
        self.acc_beta = kwargs.get("acc_beta", 0.0)
        self.gd_alpha = kwargs.get("gd_alpha", 0.0)
        self.gd_beta = kwargs.get("gd_beta", 1.0)

        self.accumulated_gradient_weights = Array()
        self.accumulated_gradient_bias = Array()

        # Gradient with accumulated momentum.
        self.gradient_weights_with_moment = Array()
        self.gradient_bias_with_moment = Array()

        # Set to True when the gradient changes.
        self.gradient_changed = False

        # If True, the gradient is applied to the weights immediately
        # after being computed.
        self.apply_gradient = kwargs.get("apply_gradient",
                                         not workflow.is_slave)

    @property
    def current_batch_size(self):
        batch_size = getattr(self, "batch_size", None)
        if batch_size is None:
            return self.err_output.mem.shape[0]
        return int(batch_size)

    def initialize(self, device, **kwargs):
        super(GradientDescentBase, self).initialize(device, **kwargs)

        if self.weights:
            assert len(self.weights.shape) == 2
            self.weights_shape = (tuple(reversed(self.weights.shape))
                                  if self.weights_transposed
                                  else self.weights.shape)
        else:
            self.weights_shape = None

        self.learning_rate = kwargs.get("learning_rate", self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", self.weights_decay)
        self.gradient_moment = kwargs.get("gradient_moment",
                                          self.gradient_moment)
        self.learning_rate_bias = kwargs.get("learning_rate_bias",
                                             self.learning_rate_bias)
        self.weights_decay_bias = kwargs.get("weights_decay_bias",
                                             self.weights_decay_bias)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias",
                                               self.gradient_moment_bias)

        if self.weights:
            if not self.gradient_weights:
                self.gradient_weights.reset(
                    numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights.size == self.weights.size

        if self.weights and self.accumulate_gradient:
            if not self.accumulated_gradient_weights:
                self.accumulated_gradient_weights.reset(
                    numpy.zeros_like(self.weights.mem))
            else:
                assert (self.accumulated_gradient_weights.size ==
                        self.weights.size)

        if self.weights and (self.gradient_moment or not self.is_standalone):
            if not self.gradient_weights_with_moment:
                self.gradient_weights_with_moment.reset(
                    numpy.zeros_like(self.weights.mem))
            else:
                assert (self.gradient_weights_with_moment.size ==
                        self.weights.size)

        if (self.include_bias and self.bias and
                (not self.gradient_bias or
                 self.gradient_bias.size != self.bias.size)):
            self.gradient_bias.reset(numpy.zeros_like(self.bias.mem))

        if (self.include_bias and self.bias and self.accumulate_gradient and
                (not self.accumulated_gradient_bias or
                 self.accumulated_gradient_bias.size != self.bias.size)):
            self.accumulated_gradient_bias.reset(
                numpy.zeros_like(self.bias.mem))

        if (self.include_bias and self.bias and
                (self.gradient_moment_bias or not self.is_standalone)):
            if not self.gradient_bias_with_moment:
                self.gradient_bias_with_moment.reset(
                    numpy.zeros_like(self.bias.mem))
            else:
                assert self.gradient_bias_with_moment.size == self.bias.size

        dtype = self.err_output.dtype
        if self.need_err_input:
            if not self.err_input:
                self.err_input.reset(numpy.zeros(self.input.shape, dtype))
            else:
                assert self.err_input.shape == self.input.shape

        if self.weights:
            side = self.weights_shape[0]
            other = self.weights.size // side
            if self.factor_ortho:
                if not self.col_sums:
                    self.col_sums.reset(numpy.zeros(other, dtype=dtype))
                else:
                    assert self.col_sums.size == other
                self.col_sums.initialize(self.device)
            self.reduce_size = roundup(min(self.reduce_size, other), 32)
            self.weights.initialize(self.device)

        for vec in self.bias, self.input, self.err_input:
            if vec:
                vec.initialize(self.device)
        self.init_vectors(
            self.err_output,
            self.gradient_weights, self.gradient_bias,
            self.accumulated_gradient_weights, self.accumulated_gradient_bias,
            self.gradient_weights_with_moment,
            self.gradient_bias_with_moment)

    def gpu_weights_update(self):
        self.unmap_vectors(
            self.input, self.err_output, self.weights,
            self.gradient_weights, self.accumulated_gradient_weights,
            self.gradient_weights_with_moment)

        if self.factor_ortho:
            self.col_sums.unmap()
            self.execute_kernel(self._global_size_ortho,
                                self._local_size_ortho,
                                self.krn_compute_col_sums_)

            self._weights_const[12] = self.factor_ortho
            self.krn_weights_.set_arg(12, self._weights_const[12:13])

        self._weights_const[4:12] = (
            self.learning_rate, self.weights_decay, self.l1_vs_l2,
            self.gradient_moment, self.acc_alpha, self.acc_beta,
            self.gd_alpha, self.gd_beta)
        self.krn_weights_.set_args(
            self.device.skip(4), self._weights_const[4:5],
            self._weights_const[5:6], self._weights_const[6:7],
            self._weights_const[7:8], self._weights_const[8:9],
            self._weights_const[9:10], self._weights_const[10:11],
            self._weights_const[11:12])

        self.execute_kernel(self._global_size_weights,
                            self._local_size_weights,
                            self.krn_weights_)

    def gpu_bias_update(self):
        if not self.include_bias:
            return

        self.unmap_vectors(
            self.err_output, self.bias, self.gradient_bias,
            self.accumulated_gradient_bias, self.gradient_bias_with_moment)

        self._bias_const[5:13] = (
            self.learning_rate_bias, self.weights_decay_bias,
            self.l1_vs_l2_bias, self.gradient_moment_bias,
            self.acc_alpha, self.acc_beta, self.gd_alpha, self.gd_beta)
        self.krn_bias_.set_args(
            self.device.skip(5), self._bias_const[5:6],
            self._bias_const[6:7], self._bias_const[7:8],
            self._bias_const[8:9], self._bias_const[9:10],
            self._bias_const[10:11], self._bias_const[11:12],
            self._bias_const[12:13])

        self.execute_kernel(self._global_size_bias, self._local_size_bias,
                            self.krn_bias_)

    def gpu_err_output_update(self):
        """Multiply err_output by activation derivative by output.
        """
        if self.krn_err_output_ is None:
            return
        self.err_output.unmap()
        self.output.unmap()
        self.execute_kernel(self._global_size_err_output,
                            self._local_size_err_output,
                            self.krn_err_output_)

    def numpy_err_output_update(self):
        """Multiply err_output by activation derivative by output.
        """
        pass

    def print_debug_data(self):
        """Show weights statistics.
        """
        if not self.logger.isEnabledFor(logging.DEBUG):
            return
        self.weights.map_read()
        self.bias.map_read()
        self.gradient_bias.map_read()
        self.gradient_weights.map_read()
        weights = self.weights.mem
        bias = self.bias.mem
        grad_weights = self.gradient_weights.mem
        grad_bias = self.gradient_bias.mem

        weight_table = PrettyTable("TYPE", "Mean", "StdDev", "Min", "Max")
        weight_table.float_format = ".10"
        for w_name, w_array in (("Weight", weights), ("Bias", bias),
                                ("Grad Weight", grad_weights),
                                ("Grad Bias", grad_bias)):
            w_mean = w_stddev = w_min = w_max = None
            if w_array is not None and w_array.size > 0:
                w_mean = numpy.mean(w_array)
                w_stddev = numpy.std(w_array)
                w_min = numpy.min(w_array)
                w_max = numpy.max(w_array)
            weight_table.add_row(w_name, w_mean, w_stddev, w_min, w_max)
        self.debug("\n" + weight_table.get_string())

    def generate_data_for_slave(self, slave):
        return (self.learning_rate, self.weights_decay, self.gradient_moment,
                self.learning_rate_bias, self.weights_decay_bias,
                self.gradient_moment_bias)

    @staticmethod
    def fill_zeros(vector):
        if not vector:
            return
        vector.map_invalidate()
        vector.mem[:] = 0

    def apply_data_from_master(self, data):
        self.learning_rate = data[0]
        self.weights_decay = data[1]
        self.gradient_moment = data[2]
        self.learning_rate_bias = data[3]
        self.weights_decay_bias = data[4]
        self.gradient_moment_bias = data[5]
        self.fill_zeros(self.gradient_weights_with_moment)
        self.fill_zeros(self.gradient_bias_with_moment)
        self.fill_zeros(self.gradient_weights)
        self.fill_zeros(self.gradient_bias)
        self.fill_zeros(self.accumulated_gradient_weights)
        self.fill_zeros(self.accumulated_gradient_bias)

    def generate_data_for_master(self):
        if not self.gradient_changed:
            return None
        self.gradient_changed = False
        self.gradient_weights_with_moment.map_read()
        self.gradient_bias_with_moment.map_read()
        return (self.gradient_weights_with_moment.mem,
                self.gradient_bias_with_moment.mem)

    def apply_data_from_slave(self, data, slave):
        if self.weights:
            self.weights.map_write()
            self.gradient_weights_with_moment.map_write()
            self.gradient_weights_with_moment.mem *= self.gradient_moment
            self.gradient_weights_with_moment.mem += data[0]
            self.weights.mem += self.gradient_weights_with_moment.mem
        if self.bias:
            self.bias.map_write()
            self.gradient_bias_with_moment.map_write()
            self.gradient_bias_with_moment.mem *= self.gradient_moment_bias
            self.gradient_bias_with_moment.mem += data[1]
            self.bias.mem += self.gradient_bias_with_moment.mem

    def drop_slave(self, slave):
        pass

    def accumulate_gradient_f(self, accumulated_gradient, gradient):
        if accumulated_gradient and self.accumulate_gradient:
            accumulated_gradient[:] = (
                gradient * self.acc_alpha +
                (self.acc_beta * accumulated_gradient
                 if self.acc_beta else 0))

            gradient *= self.gd_beta
            gradient += self.gd_alpha * accumulated_gradient

        return gradient

    @staticmethod
    def numpy_gradient_step(weight, gradient, lr, factor_l12, l1_vs_l2,
                            factor_ortho=0, weights_transposed=False):
        gradient = gradient.copy()
        gradient += factor_l12 * ((1.0 - l1_vs_l2) * weight +
                                  0.5 * l1_vs_l2 * numpy.sign(weight))
        if factor_ortho:
            col_sums = (reshape_transposed(weight).sum(axis=1)
                        if weights_transposed else weight.sum(axis=0))
            for i, row in enumerate(gradient):
                row += (col_sums - weight[i]) * factor_ortho / weight.shape[0]
        gradient *= lr
        return gradient

    def run(self):
        self.gradient_changed = True
        super(GradientDescentBase, self).run()
        self.ocl_set_const_args = False
class Forward(ForwardBase):
    """Class for forward propagation units.

    Attributes:
        input: input layer values.
        output: output layer values.
        weights: weights.
        bias: bias.
        weights_stddev: magnitude of the random distribution for weights.
        bias_stddev: magnitude of the random distribution for bias.
        rand: prng.Rand() object for initial weights generation.
    """

    hide_from_registry = True
    MAPPING = set()

    def __init__(self, workflow, **kwargs):
        kwargs["view_group"] = kwargs.get("view_group", "WORKER")
        super(Forward, self).__init__(workflow, **kwargs)
        self.weights_stddev = kwargs.get("weights_stddev")
        self.bias_stddev = kwargs.get("bias_stddev", self.weights_stddev)
        self.weights_filling = kwargs.get("weights_filling", "uniform")
        self.bias_filling = kwargs.get("bias_filling", "uniform")
        self.rand = kwargs.get("rand", prng.get())
        self.weights_transposed = kwargs.get("weights_transposed", False)
        self.include_bias = kwargs.get("include_bias", True)
        self.demand("input")
        self.output = Array(shallow_pickle=True)
        self.weights = Array()
        self.bias = Array()
        self.forward_mode = False
        self.exports = ["weights", "bias", "include_bias",
                        "weights_transposed"]

    def package_export(self):
        data = {}
        for attr in self.exports:
            value = getattr(self, attr)
            if value is not None:
                if isinstance(value, Array):
                    value.map_read()
                    value = value.mem
                data[attr] = value
        return data

    @property
    def forward_mode(self):
        return self._forward_mode

    @forward_mode.setter
    def forward_mode(self, value):
        if not isinstance(value, bool):
            raise TypeError(
                "forward_mode must be boolean (got %s)" % type(value))
        self._forward_mode = value

    def initialize(self, device, **kwargs):
        self.forward_mode = kwargs.get("forward_mode", False)
        super(Forward, self).initialize(device=device, **kwargs)

    def generate_data_for_slave(self, slave):
        if self.forward_mode:
            return None
        data = [None, None]
        if self.weights:
            self.weights.map_read()
            data[0] = self.weights.mem
        if self.bias:
            self.bias.map_read()
            data[1] = self.bias.mem
        return data

    def generate_data_for_master(self):
        return None

    def apply_data_from_master(self, data):
        if self.forward_mode:
            return
        if self.weights:
            self.weights.map_invalidate()
            numpy.copyto(self.weights.mem, data[0])
        else:
            self.weights.reset(data[0])
        if self.bias:
            self.bias.map_invalidate()
            numpy.copyto(self.bias.mem, data[1])
        else:
            self.bias.reset(data[1])

    def apply_data_from_slave(self, data, slave):
        pass

    def drop_slave(self, slave):
        pass
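
# --- Illustrative sketch, not part of the unit API ---
# package_export() unwraps each Array listed in Forward.exports into its
# raw numpy buffer, yielding a plain dict that can be serialized for
# deployment. The helper and file name below are hypothetical; `fwd` is
# assumed to be an initialized Forward subclass instance.
def _demo_package_export_roundtrip(fwd, path="fwd_params.npz"):
    import numpy
    snapshot = fwd.package_export()
    numpy.savez(path, weights=snapshot["weights"], bias=snapshot["bias"])
    restored = numpy.load(path)
    assert numpy.array_equal(restored["weights"], snapshot["weights"])
    return restored
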
class ZeroFiller(ForwardBase, TriviallyDistributable):
    """Fills weights of the given unit with zeros on every step."""

    MAPPING = {"zero_filter"}

    def __init__(self, workflow, **kwargs):
        super(ZeroFiller, self).__init__(workflow, **kwargs)

        self.mask = Array()
        # The grouping setter rejects values below 2, so the default must
        # itself be a valid grouping (the previous default of 1 raised
        # ValueError when no grouping kwarg was passed).
        self.grouping = kwargs.get("grouping", 2)
        self.demand("weights")

    def init_unpickled(self):
        super(ZeroFiller, self).init_unpickled()
        self.sources_["weights_zerofilling"] = {}

    @property
    def effective_shape(self):
        return (self.weights.shape[0],
                self.weights.size // self.weights.shape[0])

    @property
    def grouping(self):
        return self._grouping

    @grouping.setter
    def grouping(self, value):
        if not isinstance(value, int):
            raise TypeError(
                "grouping value must be an integer (got %s)" % type(value))
        if value < 2:
            raise ValueError("grouping value %d is invalid" % value)
        self._grouping = value

    def initialize(self, device=None, **kwargs):
        super(ZeroFiller, self).initialize(device, **kwargs)
        if not self.weights:
            return True

        if not self.mask:
            if self.effective_shape[1] % self.grouping != 0:
                raise ValueError(
                    "Non-multiple of grouping weights shape detected: "
                    "%s, grouping=%d" % (self.weights.shape, self.grouping))
            self.mask.reset(numpy.zeros(self.effective_shape,
                                        dtype=self.weights.dtype))
            self.mask.map_invalidate()
            # TODO(a.kazantsev): add check for transposed weights.
            for kernel in range(self.effective_shape[0]):
                for chan in range(self.effective_shape[1]):
                    self.mask[kernel, chan] = not (
                        kernel % self.grouping == chan % self.grouping)
        else:
            assert self.mask.shape == self.effective_shape

        for vec in self.mask, self.weights:
            vec.initialize(device)

    def _gpu_init(self):
        self.build_program(cache_file_name="zero_filling_%d" % self.grouping,
                           dtype=self.weights.dtype)
        self.assign_kernel("multiply_by_mask")
        self.set_args(self.mask, self.weights)

    def ocl_init(self):
        self._gpu_init()
        self._global_size = [self.weights.size]
        self._local_size = None

    def cuda_init(self):
        self._gpu_init()
        self._global_size = (self.weights.size, 1, 1)
        self._local_size = (1, 1, 1)

    def numpy_run(self):
        self.mask.map_read()
        self.weights.map_write()
        self.weights.mem *= self.mask.mem

    def _gpu_run(self):
        self.weights.unmap()
        self.mask.unmap()
        self.execute_kernel(self._global_size, self._local_size)

    def ocl_run(self):
        self._gpu_run()

    def cuda_run(self):
        self._gpu_run()
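
# --- Illustrative sketch, not part of the unit API ---
# What the ZeroFiller mask looks like for a small weights matrix. The
# helper below is hypothetical; it rebuilds the same pattern as
# ZeroFiller.initialize: entry (kernel, chan) is zero whenever
# kernel % grouping == chan % grouping, so those weights are forced back
# to zero on every run (multiply_by_mask / numpy_run computes
# weights *= mask).
def _demo_zero_filler_mask(kernels=4, chans=6, grouping=2):
    import numpy
    mask = numpy.empty((kernels, chans))
    for kernel in range(kernels):
        for chan in range(chans):
            mask[kernel, chan] = kernel % grouping != chan % grouping
    return mask

# For the defaults above, the rows alternate between [0 1 0 1 0 1] and
# [1 0 1 0 1 0]: each output kernel keeps only the input channels whose
# group differs from its own.
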