class FixAccumulator(Unit):
    """
    Range accumulator.

    Builds a fixed-range histogram of ``input`` into ``output``
    (``bars`` in-range bins plus one underflow and one overflow slot).
    The range is chosen by ``type``: "relu" -> [0, 10000],
    "tanh" -> [-1.7159, 1.7159].
    """
    def __init__(self, workflow, **kwargs):
        super(FixAccumulator, self).__init__(workflow)
        # Number of in-range histogram bins.
        self.bars = kwargs.get("bars", 200)
        # Activation type selecting the accumulation range ("relu"/"tanh").
        self.type = kwargs.get("type", "relu")
        self.input = None
        self.output = Array()
        # When True, the histogram is zeroed at the start of run().
        self.reset_flag = Bool(True)
        # Total slot count, published for consumers (set in run()).
        self.n_bars = [0]
        # Placeholder bounds; run() overwrites them based on self.type.
        self.max = 100
        self.min = 0

    def initialize(self, **kwargs):
        # bars in-range bins + 1 underflow + 1 overflow slot.
        self.output.mem = numpy.zeros([self.bars + 2], dtype=numpy.int64)

    def run(self):
        # Select the accumulation range from the activation type.
        if self.type == "relu":
            self.max = 10000
            self.min = 0
        elif self.type == "tanh":
            self.max = 1.7159
            self.min = -1.7159
        else:
            raise error.BadFormatError("Unsupported type %s" % self.type)
        d = self.max - self.min
        if not d:
            # Degenerate (zero-width) range: nothing to accumulate.
            return
        self.output.map_write()
        self.input.map_read()
        # Scale factor mapping [min, max] onto bin indices 0..bars-1.
        d = (self.bars - 1) / d
        if self.reset_flag:
            self.output.mem[:] = 0
        self.n_bars[0] = self.bars + 2
        for y in self.input.mem.ravel():
            if y < self.min:
                # Underflow counter lives in slot 0.
                self.output[0] += 1
                continue
            if y <= self.max and y > self.min:
                # NOTE(review): an in-range value near self.min maps to
                # index 0, which is the same slot as the underflow counter;
                # and y == self.min falls through to the overflow slot
                # below. Looks suspicious — confirm intended binning.
                i = int(numpy.floor((y - self.min) * d))
                self.output[i] += 1
                continue
            # Overflow counter lives in the last slot.
            self.output[self.bars + 1] += 1
class EvaluatorMSE(EvaluatorBase):
    """Evaluator of MSE (or RMSE, see ``root``) loss between network output
    and a continuous target.

    Must be assigned before initialize():
        output
        target
        batch_size
        labels (may be None)
        class_targets (may be None)

    Updates after run():
        err_output
        confusion_matrix
        max_err_output_sum
        n_err (only if labels and class_targets is not None)

    Creates within initialize():
        err_output
        n_err (only if labels and class_targets is not None)
        max_err_output_sum

    Attributes:
        output: output of the network_common as Batch.
        target: target for the current Batch.
        err_output: backpropagation errors.
        batch_size: number of elements in output to evaluate.
        metrics: [0] - sum of sample's mse, [1] - max of sample's mse,
                 [2] - min of sample's mse.
        mse: array of mse for each sample in minibatch.
        krn_constants_i_: numpy array for constant arguments to kernel.
        labels: labels for a batch (may be None).
        class_targets: target for each class (may be None).
        n_err: number of wrongly recognized samples
            (if labels and class_targets is not None).
    """
    # NOTE: the docstring above was previously placed after these class
    # attributes, so it was a discarded string expression instead of
    # __doc__; it must be the first statement in the class body.

    MAPPING = "evaluator_mse"
    LOSS = "mse"

    def __init__(self, workflow, **kwargs):
        super(EvaluatorMSE, self).__init__(workflow, **kwargs)
        self.metrics = Array()
        self.mse = Array()
        self.labels = None
        self.class_targets = None
        self.n_err = Array()
        # Goes through the validating property setter below.
        self.root = kwargs.get("root", True)
        self.demand("target", "normalizer")

    @property
    def root(self):
        """
        :return: True if error metric is RMSE, otherwise, MSE (mean sum of
        squares). Default is True.
        """
        return self._root

    @root.setter
    def root(self, value):
        if not isinstance(value, bool):
            raise TypeError("root must be boolean (got %s)" % type(value))
        self._root = value

    def initialize(self, device, **kwargs):
        """Validates output/target sizes and allocates metric buffers."""
        super(EvaluatorMSE, self).initialize(device=device, **kwargs)
        if self.testing:
            return
        if self.target.size != self.output.size:
            raise error.BadFormatError(
                "target.size != output.size (%s != %s)" %
                (self.target.size, self.output.size))
        self.sources_["evaluator_mse"] = {}
        self.sources_["denormalization"] = {}
        dtype = self.output.dtype
        self.metrics.reset(numpy.zeros(3, dtype=dtype))
        self.metrics[2] = 1.0e30  # mse_min
        self.mse.reset(numpy.zeros(self.err_output.mem.shape[0], dtype))
        self.n_err.reset(numpy.zeros(2, dtype=numpy.int32))
        self.init_vectors(self.n_err, self.target, self.metrics, self.mse)
        if self.class_targets:
            self.class_targets.initialize(self.device)

    def _gpu_init(self):
        """Builds the OpenCL/CUDA program and binds kernel arguments.

        Returns the block size used for the evaluation kernel.
        """
        dtype = self.output.dtype
        block_size = min(self.err_output.shape[0], 128)
        if self.class_targets:
            self.sources_["mse_find_closest"] = {
                "target_dtype":
                numpy_dtype_to_opencl(self.class_targets.dtype)
            }
        self.build_program(
            cache_file_name="%s_%d_%d" % (self.__class__.__name__,
                                          self.output.shape[0],
                                          self.output.sample_size),
            dtype=dtype, max_batch_size=self.err_output.shape[0],
            block_size=block_size, output_size=self.err_output.sample_size,
            root=self.root, normalization=self.normalizer.MAPPING,
            targets_number=self.class_targets.shape[0]
            if self.class_targets else None,
            coeffs=self.normalizer.coefficients)
        self.assign_kernel("evaluate_mse")
        # Args 2 and 3 (batch size / scale) are set per-run in _gpu_run().
        self.set_args(self.output, self.target, self.skip_args(2),
                      self.metrics, self.mse.devmem, self.err_output)
        if self.labels and self.class_targets:
            assert (self.labels.dtype == self.n_err.dtype == numpy.int32)
            self.krn_find_closest_ = self.get_kernel("mse_find_closest")
            self.krn_find_closest_.set_args(
                self.output.devmem, self.class_targets.devmem,
                self.labels.devmem, self.n_err.devmem)
        return block_size

    def ocl_init(self):
        if self.testing:
            return
        block_size = self._gpu_init()
        self._local_size = [block_size]
        self._global_size = self._local_size
        self._global_size_find_closest_ = lambda: (self.batch_size,)
        self._local_size_find_closest = None

    def cuda_init(self):
        if self.testing:
            return
        block_size = self._gpu_init()
        self._local_size = (block_size, 1, 1)
        self._global_size = (1, 1, 1)
        self._global_size_find_closest_ = lambda: (self.batch_size, 1, 1)
        self._local_size_find_closest = (1, 1, 1)

    def _gpu_run(self):
        """Shared OpenCL/CUDA run: evaluates MSE and (optionally) n_err."""
        self.unmap_vectors(self.err_output, self.output, self.target,
                           self.metrics, self.mse)
        batch_size = self.batch_size
        self.krn_constants_i_[0] = batch_size
        self.set_arg(2, self.krn_constants_i_[0:1])
        self.krn_constants_f_[0] = 1.0 / self.batch_size if self.mean else 1.0
        self.set_arg(3, self.krn_constants_f_[0:1])
        self.execute_kernel(self._global_size, self._local_size)
        if self.labels and self.class_targets:
            self.unmap_vectors(self.class_targets, self.labels, self.n_err)
            self.execute_kernel(self._global_size_find_closest_(),
                                self._local_size_find_closest,
                                self.krn_find_closest_)
            self.n_err.map_write()
            # n_err[1] accumulates the number of evaluated samples.
            self.n_err.mem[1] += batch_size

    def ocl_run(self):
        return self._gpu_run()

    def cuda_run(self):
        return self._gpu_run()

    def numpy_run(self):
        """CPU reference implementation of the MSE evaluation."""
        self.output.map_read()
        self.target.map_read()
        self.metrics.map_write()
        self.err_output.map_invalidate()
        self.mse.map_invalidate()

        assert (self.output.size == self.target.size == self.err_output.size)
        batch_size = self.batch_size
        # All slices below must alias the backing arrays (checked).
        err_output = self.err_output.matrix[:batch_size]
        assert_addr(err_output, self.err_output.mem)
        output = self.output.matrix[:batch_size]
        assert_addr(output, self.output.mem)
        target = self.target.matrix[:batch_size]
        assert_addr(target, self.target.mem)
        mse = self.mse.mem[:batch_size]
        assert_addr(mse, self.mse.mem)

        err_output[:] = output - target
        if not isinstance(self.normalizer, NoneNormalizer):
            # Report MSE in denormalized (original) units, while the
            # backpropagated error stays in normalized units.
            output_copy = output.copy()
            target_copy = target.copy()
            self.normalizer.denormalize(output_copy)
            self.normalizer.denormalize(target_copy)
            denormed_err_output = output_copy - target_copy
        else:
            denormed_err_output = err_output
        self.err_output.mem[batch_size:] = 0
        mse[:] = numpy.square(denormed_err_output).sum(axis=1) / \
            denormed_err_output.shape[1]
        if self.mean:
            err_output /= batch_size
        if self.root:
            numpy.sqrt(mse, mse)
        self.mse.mem[batch_size:] = 0

        self.metrics.mem[0] += mse.sum()
        self.metrics.mem[1] = max(self.metrics.mem[1], mse.max())
        self.metrics.mem[2] = min(self.metrics.mem[2], mse.min())

        if self.labels and self.class_targets:
            # Count samples whose nearest class target is not their label.
            self.class_targets.map_read()
            self.labels.map_read()
            self.n_err.map_write()
            class_targets = self.class_targets.matrix
            labels = self.labels.mem
            for i, sample in enumerate(output):
                lbl = numpy.linalg.norm(class_targets - sample,
                                        axis=1).argmin()
                if lbl != labels[i]:
                    self.n_err.mem[0] += 1
                self.n_err.mem[1] += 1

    def merge_output(self):
        """Writes (denormalized) minibatch output into merged_output."""
        if not isinstance(self.normalizer, NoneNormalizer):
            output = self.output[:self.batch_size].copy()
            self.normalizer.denormalize(output)
        else:
            output = self.output.mem
        self.merged_output[self.offset - self.batch_size:self.offset] = \
            output
class MultiHistogram(Plotter):
    """Plotter for drawing weights as 2D.

    Must be assigned before initialize():
        input
        input_field
    """
    def __init__(self, workflow, **kwargs):
        super(MultiHistogram, self).__init__(workflow, **kwargs)
        # Hard cap on the number of histograms drawn.
        self.limit = kwargs.get("limit", 64)
        self.value = Array()
        # Bins per histogram.
        self.n_bars = kwargs.get("n_bars", 25)
        # Requested number of histograms (clamped to limit in initialize).
        self.hist_number = kwargs.get("hist_number", 16)
        self.demand("input")

    def initialize(self, **kwargs):
        super(MultiHistogram, self).initialize(**kwargs)
        if self.hist_number > self.limit:
            self.hist_number = self.limit
        self.value.mem = numpy.zeros([self.hist_number, self.n_bars],
                                     dtype=numpy.int64)

    def redraw(self):
        """Draws one bar chart per histogram row on a near-square grid."""
        fig = self.pp.figure(self.name)
        fig.clf()
        fig.patch.set_facecolor('#E8D6BB')
        # fig.patch.set_alpha(0.45)
        n_cols = int(numpy.round(numpy.sqrt(self.value.shape[0])))
        n_rows = int(numpy.ceil(self.value.shape[0] / n_cols))
        i = 0
        for _ in range(0, n_rows):
            for _ in range(0, n_cols):
                ax = fig.add_subplot(n_rows, n_cols, i + 1)
                ax.cla()
                # ax.axis('off')
                ax.patch.set_facecolor('#ffe6ca')
                # ax.set_xlabel("Input Data", fontsize=10)
                # ax.set_ylabel("Number", fontsize=10)
                ymin = self.value[i].min()
                ymax = self.value[i].max()
                xmin = self.input[i].min()
                xmax = self.input[i].max()
                ax.axis([xmin, xmax + ((xmax - xmin) / self.n_bars),
                         ymin, ymax])
                ax.grid(True)
                ax.set_title(self.name.replace("Histogram ", ""))
                nbars = self.n_bars
                width = ((xmax - xmin) / nbars) * 0.8
                X = numpy.linspace(xmin, xmax, num=nbars, endpoint=True)
                Y = self.value[i]
                # Thinner-looking edges when the grid is dense.
                if (n_rows > 5) or (n_cols > 5):
                    ax.bar(X, Y, color='#ffa0ef', width=width,
                           edgecolor='red')
                else:
                    ax.bar(X, Y, color='#ffa0ef', width=width,
                           edgecolor='lavender')
                if n_rows > 4:
                    ax.set_yticklabels([])
                if n_cols > 3:
                    ax.set_xticklabels([])
                i += 1
                if i >= self.value.shape[0]:
                    break
            if i >= self.value.shape[0]:
                break
        self.show_figure(fig)
        fig.canvas.draw()
        return fig

    def fill(self):
        """Recomputes the histogram counts in value from input.

        Rows with a degenerate (constant) input are skipped, leaving
        their previous counts untouched.
        """
        # Map once: loop-invariant, no need to re-map per row.
        self.value.map_write()
        self.input.map_read()
        for i in range(self.hist_number):
            mx = self.input.mem[i].max()
            mi = self.input.mem[i].min()
            d = mx - mi
            if not d:
                # BUGFIX: was `return`, which aborted filling of ALL
                # remaining histograms when a single row was constant.
                continue
            d = (self.n_bars - 1) / d
            self.value[i] = 0
            for x in self.input.mem[i]:
                i_bar = int(numpy.floor((x - mi) * d))
                self.value[i, i_bar] += 1
class KohonenForward(KohonenBase, AcceleratedUnit):
    """Kohonen forward layer.

    Must be assigned before initialize():
        input
        weights
        minibatch_offset (if total == True)
        minibatch_size (if total == True)
        batch_size (if total == True)
        argmins speeds up run() if linked from KohonenTrainer

    Updates after run():
        output

    Creates within initialize():
        output

    Attributes:
        input: input as batch of samples.
        weights: the weights of the neurons in Kohonen layer.
        output: the list of winners.
        total: if total=True is passed in __init__(), the overall winners
               table
    """
    def __init__(self, workflow, **kwargs):
        super(KohonenForward, self).__init__(workflow, **kwargs)
        self.demand("input", "weights")
        # Optional precomputed winners (e.g. linked from KohonenTrainer);
        # when present, the distance computation is skipped entirely.
        self.argmins = None
        self._distances = Array()
        self.output = Array()
        self._chunk_size_ = 0
        self.weights_transposed = False
        self.total = Array() if kwargs.get("total", False) else None
        if self.total is not None:
            self.minibatch_offset = None
            self.minibatch_size = None
            self.batch_size = None

    def init_unpickled(self):
        super(KohonenForward, self).init_unpickled()
        self.sources_["kohonen"] = {"FORWARD": 1}

    @property
    def neurons_number(self):
        # Number of neurons = rows of the weights matrix.
        return self.weights.mem.shape[0]

    @property
    def sample_length(self):
        # Input sample length = columns of the weights matrix.
        return self.weights.mem.shape[1]

    @property
    def chunk_size(self):
        return self._chunk_size_

    def initialize(self, device, **kwargs):
        """Allocates output (and distances, when argmins is absent)."""
        super(KohonenForward, self).initialize(device=device, **kwargs)

        assert self.input.mem.shape[1] == self.sample_length
        batch_size = self.input.mem.shape[0]

        self.output.reset(numpy.zeros(batch_size, dtype=numpy.int32))
        if self.argmins is None:
            self._distances.reset(
                numpy.zeros([batch_size, self.neurons_number],
                            dtype=self.weights.mem.dtype))

        if self.total is not None:
            self.total.reset(numpy.zeros(self.batch_size, dtype=numpy.int32))
            self._minibatch_offset_ = numpy.zeros(1, dtype=numpy.int32)

    def ocl_init(self):
        """Builds the OpenCL program and binds the three kernels
        (calculate_distances, calculate_argmin, set_total).
        """
        batch_size = self.input.mem.shape[0]
        self.output.initialize(self.device)
        if self.argmins is None:
            self.input.initialize(self.device)
            self.weights.initialize(self.device)
            self._distances.initialize(self.device)
        elif self.total is None:
            # Winners come precomputed and no overall table is kept:
            # nothing to do on the device.
            return
        if self.total is not None:
            self.total.initialize(self.device)

        copy_chunk_size = int(
            numpy.ceil(batch_size / self.device.max_group_size))
        chunk_size = self.neurons_number // self.device.max_group_size
        if chunk_size < 2:
            chunk_size = self.neurons_number // 2 + 1
        self.argmin_group_size = \
            int(numpy.ceil(self.neurons_number / chunk_size))

        block_size, vector_opt = self.device.device_info.get_kernel_bs_vo(
            kernel="matrix_multiplication", dtype=self.input.dtype)

        defines = {
            'BLOCK_SIZE': block_size,
            'VECTOR_OPT': int(bool(vector_opt)),
            'BATCH': batch_size,
            'SAMPLE_LENGTH': self.sample_length,
            'NEURONS_NUMBER': self.neurons_number,
            'CHUNK_SIZE': chunk_size,
            'COPY_CHUNK_SIZE': copy_chunk_size,
        }
        if self.weights_transposed:
            defines['WEIGHTS_TRANSPOSED'] = 1
        self.build_program(defines, "%s_%d_%d_%d" %
                           (self.__class__.__name__,
                            batch_size, self.sample_length,
                            self.neurons_number),
                           dtype=self.weights.mem.dtype)

        if self.total is not None:
            self._set_total_global_size_ = \
                [int(numpy.ceil(batch_size / copy_chunk_size))]
            self._krn_set_total_ = self.get_kernel("set_total")
            # Arg 1 (minibatch offset) is filled in per-run in ocl_run().
            self._krn_set_total_.set_args(self.output.devmem, cl.skip,
                                          self.total.devmem)
        if self.argmins is not None:
            return

        self._krn_distances_ = self.get_kernel("calculate_distances")
        self._krn_distances_.set_args(self.input.devmem, self.weights.devmem,
                                      self._distances.devmem)

        self._krn_argmin_ = self.get_kernel("calculate_argmin")
        self._krn_argmin_.set_args(self._distances.devmem, self.output.devmem,
                                   None)

        self._gs_distance = [
            roundup(self.neurons_number, block_size),
            roundup(batch_size, block_size)
        ]
        self._ls_distance = [block_size, block_size]

    def ocl_run(self):
        self.output.unmap()
        if self.total is not None:
            self.total.unmap()

        if self.argmins is None:
            # Full path: compute distances, then per-sample argmin.
            self.input.unmap()
            self.weights.unmap()
            self.execute_kernel(self._gs_distance, self._ls_distance,
                                self._krn_distances_)
            self.execute_kernel([self.argmin_group_size],
                                [self.argmin_group_size],
                                self._krn_argmin_)
        else:
            # Shortcut: copy precomputed winners on the host.
            self.argmins.unmap()
            self.argmins.map_read()
            self.output.map_write()
            self.output.mem[:] = self.argmins.mem
            self.output.unmap()
            self.argmins.unmap()

        if self.total is not None:
            # minibatch_offset points past the minibatch end, hence the
            # subtraction to get its start.
            self._minibatch_offset_[0] = \
                self.minibatch_offset - self.minibatch_size
            self._krn_set_total_.set_arg(1, self._minibatch_offset_)
            self.execute_kernel(self._set_total_global_size_, None,
                                self._krn_set_total_)

    def numpy_run(self):
        """CPU path: nearest-neuron search (or copy of argmins)."""
        self.output.map_invalidate()

        if self.argmins is not None:
            self.argmins.map_read()
            self.output.mem[:] = self.argmins.mem
        else:
            self.input.map_read()
            self.weights.map_read()

        if self.total is not None:
            self.total.map_invalidate()

        length = self.minibatch_size if self.total is not None \
            else self.input.mem.shape[0]

        for sindex in range(length):
            if self.argmins is None:
                dist = self.weights.mem - self.input[sindex]
                winner = numpy.argmin(self.numpy_linalg_norm(dist))
                self.output[sindex] = winner
            else:
                winner = self.argmins[sindex]

            if self.total is not None:
                index = sindex + self.minibatch_offset - self.minibatch_size
                self.total[index] = winner
class EvaluatorMSE(EvaluatorBase):
    """Evaluator of MSE (or RMSE, see ``root``) loss between network output
    and a continuous target.

    Must be assigned before initialize():
        output
        target
        batch_size
        labels (may be None)
        class_targets (may be None)

    Updates after run():
        err_output
        confusion_matrix
        max_err_output_sum
        n_err (only if labels and class_targets is not None)

    Creates within initialize():
        err_output
        n_err (only if labels and class_targets is not None)
        max_err_output_sum

    Attributes:
        output: output of the network_common as Batch.
        target: target for the current Batch.
        err_output: backpropagation errors.
        batch_size: number of elements in output to evaluate.
        metrics: [0] - sum of sample's mse, [1] - max of sample's mse,
                 [2] - min of sample's mse.
        mse: array of mse for each sample in minibatch.
        krn_constants_i_: numpy array for constant arguments to kernel.
        labels: labels for a batch (may be None).
        class_targets: target for each class (may be None).
        n_err: number of wrongly recognized samples
            (if labels and class_targets is not None).
    """
    # NOTE: the docstring above was previously placed after these class
    # attributes, so it was a discarded string expression instead of
    # __doc__; it must be the first statement in the class body.

    MAPPING = "evaluator_mse"
    LOSS = "mse"

    def __init__(self, workflow, **kwargs):
        super(EvaluatorMSE, self).__init__(workflow, **kwargs)
        self.metrics = Array()
        self.mse = Array()
        self.labels = None
        self.class_targets = None
        self.n_err = Array()
        # Goes through the validating property setter below.
        self.root = kwargs.get("root", True)
        self.demand("target", "normalizer")

    @property
    def root(self):
        """
        :return: True if error metric is RMSE, otherwise, MSE (mean sum of
        squares). Default is True.
        """
        return self._root

    @root.setter
    def root(self, value):
        if not isinstance(value, bool):
            raise TypeError("root must be boolean (got %s)" % type(value))
        self._root = value

    def initialize(self, device, **kwargs):
        """Validates output/target sizes and allocates metric buffers."""
        super(EvaluatorMSE, self).initialize(device=device, **kwargs)
        if self.testing:
            return
        if self.target.size != self.output.size:
            raise error.BadFormatError(
                "target.size != output.size (%s != %s)" %
                (self.target.size, self.output.size))
        self.sources_["evaluator_mse"] = {}
        self.sources_["denormalization"] = {}
        dtype = self.output.dtype
        self.metrics.reset(numpy.zeros(3, dtype=dtype))
        self.metrics[2] = 1.0e30  # mse_min
        self.mse.reset(numpy.zeros(self.err_output.mem.shape[0], dtype))
        self.n_err.reset(numpy.zeros(2, dtype=numpy.int32))
        self.init_vectors(self.n_err, self.target, self.metrics, self.mse)
        if self.class_targets:
            self.class_targets.initialize(self.device)

    def _gpu_init(self):
        """Builds the OpenCL/CUDA program and binds kernel arguments.

        Returns the block size used for the evaluation kernel.
        """
        dtype = self.output.dtype
        block_size = min(self.err_output.shape[0], 128)
        if self.class_targets:
            self.sources_["mse_find_closest"] = {
                "target_dtype":
                numpy_dtype_to_opencl(self.class_targets.dtype)
            }
        self.build_program(
            cache_file_name="%s_%d_%d" % (self.__class__.__name__,
                                          self.output.shape[0],
                                          self.output.sample_size),
            dtype=dtype, max_batch_size=self.err_output.shape[0],
            block_size=block_size, output_size=self.err_output.sample_size,
            root=self.root, normalization=self.normalizer.MAPPING,
            targets_number=self.class_targets.shape[0]
            if self.class_targets else None,
            coeffs=self.normalizer.coefficients)
        self.assign_kernel("evaluate_mse")
        # Args 2 and 3 (batch size / scale) are set per-run in _gpu_run().
        self.set_args(self.output, self.target, self.skip_args(2),
                      self.metrics, self.mse.devmem, self.err_output)
        if self.labels and self.class_targets:
            assert (self.labels.dtype == self.n_err.dtype == numpy.int32)
            self.krn_find_closest_ = self.get_kernel("mse_find_closest")
            self.krn_find_closest_.set_args(
                self.output.devmem, self.class_targets.devmem,
                self.labels.devmem, self.n_err.devmem)
        return block_size

    def ocl_init(self):
        if self.testing:
            return
        block_size = self._gpu_init()
        self._local_size = [block_size]
        self._global_size = self._local_size
        self._global_size_find_closest_ = lambda: (self.batch_size,)
        self._local_size_find_closest = None

    def cuda_init(self):
        if self.testing:
            return
        block_size = self._gpu_init()
        self._local_size = (block_size, 1, 1)
        self._global_size = (1, 1, 1)
        self._global_size_find_closest_ = lambda: (self.batch_size, 1, 1)
        self._local_size_find_closest = (1, 1, 1)

    def _gpu_run(self):
        """Shared OpenCL/CUDA run: evaluates MSE and (optionally) n_err."""
        self.unmap_vectors(self.err_output, self.output, self.target,
                           self.metrics, self.mse)
        batch_size = self.batch_size
        self.krn_constants_i_[0] = batch_size
        self.set_arg(2, self.krn_constants_i_[0:1])
        self.krn_constants_f_[0] = 1.0 / self.batch_size if self.mean else 1.0
        self.set_arg(3, self.krn_constants_f_[0:1])
        self.execute_kernel(self._global_size, self._local_size)
        if self.labels and self.class_targets:
            self.unmap_vectors(self.class_targets, self.labels, self.n_err)
            self.execute_kernel(self._global_size_find_closest_(),
                                self._local_size_find_closest,
                                self.krn_find_closest_)
            self.n_err.map_write()
            # n_err[1] accumulates the number of evaluated samples.
            self.n_err.mem[1] += batch_size

    def ocl_run(self):
        return self._gpu_run()

    def cuda_run(self):
        return self._gpu_run()

    def numpy_run(self):
        """CPU reference implementation of the MSE evaluation."""
        self.output.map_read()
        self.target.map_read()
        self.metrics.map_write()
        self.err_output.map_invalidate()
        self.mse.map_invalidate()

        assert (self.output.size == self.target.size == self.err_output.size)
        batch_size = self.batch_size
        # All slices below must alias the backing arrays (checked).
        err_output = self.err_output.matrix[:batch_size]
        assert_addr(err_output, self.err_output.mem)
        output = self.output.matrix[:batch_size]
        assert_addr(output, self.output.mem)
        target = self.target.matrix[:batch_size]
        assert_addr(target, self.target.mem)
        mse = self.mse.mem[:batch_size]
        assert_addr(mse, self.mse.mem)

        err_output[:] = output - target
        if not isinstance(self.normalizer, NoneNormalizer):
            # Report MSE in denormalized (original) units, while the
            # backpropagated error stays in normalized units.
            output_copy = output.copy()
            target_copy = target.copy()
            self.normalizer.denormalize(output_copy)
            self.normalizer.denormalize(target_copy)
            denormed_err_output = output_copy - target_copy
        else:
            denormed_err_output = err_output
        self.err_output.mem[batch_size:] = 0
        mse[:] = numpy.square(denormed_err_output).sum(axis=1) / \
            denormed_err_output.shape[1]
        if self.mean:
            err_output /= batch_size
        if self.root:
            numpy.sqrt(mse, mse)
        self.mse.mem[batch_size:] = 0

        self.metrics.mem[0] += mse.sum()
        self.metrics.mem[1] = max(self.metrics.mem[1], mse.max())
        self.metrics.mem[2] = min(self.metrics.mem[2], mse.min())

        if self.labels and self.class_targets:
            # Count samples whose nearest class target is not their label.
            self.class_targets.map_read()
            self.labels.map_read()
            self.n_err.map_write()
            class_targets = self.class_targets.matrix
            labels = self.labels.mem
            for i, sample in enumerate(output):
                lbl = numpy.linalg.norm(class_targets - sample,
                                        axis=1).argmin()
                if lbl != labels[i]:
                    self.n_err.mem[0] += 1
                self.n_err.mem[1] += 1

    def merge_output(self):
        """Writes (denormalized) minibatch output into merged_output."""
        if not isinstance(self.normalizer, NoneNormalizer):
            output = self.output[:self.batch_size].copy()
            self.normalizer.denormalize(output)
        else:
            output = self.output.mem
        self.merged_output[self.offset - self.batch_size:self.offset] = \
            output
class GradientDescentBase(AcceleratedUnit):
    """Base class for gradient descent units.

    Attributes:
        input: input layer values.
        output: output layer values.
        err_output: error to backpropagate.
        err_input: backpropagated error.
        weights: weights.
        bias: bias.
        batch_size: current minibatch size.
        learning_rate: gradient descent speed (positive).
        learning_rate_bias
        weights_decay: regularization for weights (see l1_vs_l2).
        weights_decay_bias
        gradient_moment: moment coefficient for weights.
        gradient_moment_bias
        gradient_weights_with_moment: accumulated moment.
        gradient_bias_with_moment
        batch_size: effective batch size (if None, get it from y).
        weights_transposed: assume weights matrix as a transposed one.
        apply_gradient: will apply gradient.
        gradient_changed: when True, slave will send gradients to master
            (assigned to True just before the run call, so it can be set to
            False inside ocl_run, numpy_run if necessary).
        ocl_set_const_args: True when constant arguments for the kernel
            had been changed and need to be set again.
    """
    hide_from_registry = True
    MAPPING = set()

    REDUCE_SIZE = 64  # used for updating bias

    def __init__(self, workflow, **kwargs):
        kwargs["view_group"] = kwargs.get("view_group", "TRAINER")
        super(GradientDescentBase, self).__init__(workflow, **kwargs)
        self.err_input = Array(shallow_pickle=True)
        self.ocl_set_const_args = True
        self.weights = None
        self.bias = None
        self.demand("input", "err_output")
        self.learning_rate = kwargs.get("learning_rate", 0.01)
        self.learning_rate_bias = kwargs.get("learning_rate_bias",
                                             self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", 0.00005)
        self.weights_decay_bias = kwargs.get("weights_decay_bias", 0.0)
        self.l1_vs_l2 = kwargs.get("l1_vs_l2", 0)
        self.l1_vs_l2_bias = kwargs.get("l1_vs_l2_bias", self.l1_vs_l2)
        self.gradient_moment = kwargs.get("gradient_moment", 0)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias",
                                               self.gradient_moment)
        self.weights_transposed = kwargs.get("weights_transposed", False)
        self.need_err_input = kwargs.get("need_err_input", True)
        self.include_bias = kwargs.get("include_bias", True)
        self.factor_ortho = kwargs.get("factor_ortho", 0)
        self.col_sums = Array()  # for orthogonalization

        # Current gradient as it is without applying learning_rate etc.
        self.gradient_weights = Array()
        self.gradient_bias = Array()

        # Gradient with applied learning_rate etc.
        # optionally accumulated from the previous run
        self.accumulate_gradient = kwargs.get("accumulate_gradient", False)

        # When accumulate_gradient set to True:
        # 1. Calculate gd
        # 2. acc = acc_alpha * gd + acc_beta * acc
        # 3. gd = gd_alpha * acc + gd_beta * gd
        # 4. Apply moments to gd
        # 5. weights += gd if apply_gradient set to True
        self.acc_alpha = kwargs.get("acc_alpha", 0.0)
        self.acc_beta = kwargs.get("acc_beta", 0.0)
        self.gd_alpha = kwargs.get("gd_alpha", 0.0)
        self.gd_beta = kwargs.get("gd_beta", 1.0)

        self.accumulated_gradient_weights = Array()
        self.accumulated_gradient_bias = Array()

        # Gradient with accumulated moments
        self.gradient_weights_with_moment = Array()
        self.gradient_bias_with_moment = Array()

        # Sets to True when gradient changes
        self.gradient_changed = False

        # Gradient will be applied to weights immediately just after
        # computing
        self.apply_gradient = kwargs.get("apply_gradient",
                                         not workflow.is_slave)

    @property
    def current_batch_size(self):
        # Falls back to err_output's leading dimension when batch_size
        # was never linked/assigned.
        batch_size = getattr(self, "batch_size", None)
        if batch_size is None:
            return self.err_output.mem.shape[0]
        return int(batch_size)

    def initialize(self, device, **kwargs):
        """Allocates all gradient/accumulator buffers on host and device.

        kwargs may override the hyper-parameters given to __init__().
        """
        super(GradientDescentBase, self).initialize(device, **kwargs)

        if self.weights:
            assert len(self.weights.shape) == 2
            self.weights_shape = (tuple(reversed(self.weights.shape))
                                  if self.weights_transposed
                                  else self.weights.shape)
        else:
            self.weights_shape = None

        self.learning_rate = kwargs.get("learning_rate", self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", self.weights_decay)
        self.gradient_moment = kwargs.get("gradient_moment",
                                          self.gradient_moment)
        self.learning_rate_bias = kwargs.get("learning_rate_bias",
                                             self.learning_rate_bias)
        self.weights_decay_bias = kwargs.get("weights_decay_bias",
                                             self.weights_decay_bias)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias",
                                               self.gradient_moment_bias)

        if self.weights:
            if not self.gradient_weights:
                self.gradient_weights.reset(numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights.size == self.weights.size

        if self.weights and self.accumulate_gradient:
            if not self.accumulated_gradient_weights:
                self.accumulated_gradient_weights.reset(
                    numpy.zeros_like(self.weights.mem))
            else:
                assert (self.accumulated_gradient_weights.size ==
                        self.weights.size)

        # Moment buffer is also needed in distributed (non-standalone) mode
        # even with zero momentum, since it carries gradients to the master.
        if self.weights and (self.gradient_moment or not self.is_standalone):
            if not self.gradient_weights_with_moment:
                self.gradient_weights_with_moment.reset(
                    numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights_with_moment.size == \
                    self.weights.size

        if (self.include_bias and self.bias and
                (not self.gradient_bias or
                 self.gradient_bias.size != self.bias.size)):
            self.gradient_bias.reset(numpy.zeros_like(self.bias.mem))

        if (self.include_bias and self.bias and self.accumulate_gradient and
                (not self.accumulated_gradient_bias or
                 self.accumulated_gradient_bias.size != self.bias.size)):
            self.accumulated_gradient_bias.reset(
                numpy.zeros_like(self.bias.mem))

        if (self.include_bias and self.bias and
                (self.gradient_moment_bias or not self.is_standalone)):
            if not self.gradient_bias_with_moment:
                self.gradient_bias_with_moment.reset(
                    numpy.zeros_like(self.bias.mem))
            else:
                assert self.gradient_bias_with_moment.size == self.bias.size

        dtype = self.err_output.dtype
        if self.need_err_input:
            if not self.err_input:
                self.err_input.reset(numpy.zeros(self.input.shape, dtype))
            else:
                assert self.err_input.shape == self.input.shape

        if self.weights:
            side = self.weights_shape[0]
            other = self.weights.size // side
            if self.factor_ortho:
                if not self.col_sums:
                    self.col_sums.reset(numpy.zeros(other, dtype=dtype))
                else:
                    assert self.col_sums.size == other
                self.col_sums.initialize(self.device)
            # NOTE(review): self.reduce_size is read before any visible
            # assignment here — presumably set by a base class
            # (cf. REDUCE_SIZE above); confirm.
            self.reduce_size = roundup(min(self.reduce_size, other), 32)
            self.weights.initialize(self.device)

        for vec in self.bias, self.input, self.err_input:
            if vec:
                vec.initialize(self.device)
        self.init_vectors(self.err_output, self.gradient_weights,
                          self.gradient_bias,
                          self.accumulated_gradient_weights,
                          self.accumulated_gradient_bias,
                          self.gradient_weights_with_moment,
                          self.gradient_bias_with_moment)

    def gpu_weights_update(self):
        """Runs the weights-update kernel (plus optional column sums for
        the orthogonalization penalty).
        """
        self.unmap_vectors(self.input, self.err_output, self.weights,
                           self.gradient_weights,
                           self.accumulated_gradient_weights,
                           self.gradient_weights_with_moment)

        if self.factor_ortho:
            self.col_sums.unmap()
            self.execute_kernel(self._global_size_ortho,
                                self._local_size_ortho,
                                self.krn_compute_col_sums_)
            # Kernel constant slot 12 holds factor_ortho.
            self._weights_const[12] = self.factor_ortho
            self.krn_weights_.set_arg(12, self._weights_const[12:13])

        # Slots 4..11: lr, decay, l1_vs_l2, moment, acc_alpha, acc_beta,
        # gd_alpha, gd_beta — passed to the kernel one slice at a time.
        self._weights_const[4:12] = (self.learning_rate, self.weights_decay,
                                     self.l1_vs_l2, self.gradient_moment,
                                     self.acc_alpha, self.acc_beta,
                                     self.gd_alpha, self.gd_beta)
        self.krn_weights_.set_args(
            self.device.skip(4), self._weights_const[4:5],
            self._weights_const[5:6], self._weights_const[6:7],
            self._weights_const[7:8], self._weights_const[8:9],
            self._weights_const[9:10], self._weights_const[10:11],
            self._weights_const[11:12])

        self.execute_kernel(self._global_size_weights,
                            self._local_size_weights, self.krn_weights_)

    def gpu_bias_update(self):
        """Runs the bias-update kernel (no-op when include_bias is False)."""
        if not self.include_bias:
            return

        self.unmap_vectors(self.err_output, self.bias, self.gradient_bias,
                           self.accumulated_gradient_bias,
                           self.gradient_bias_with_moment)

        # Slots 5..12: bias lr, decay, l1_vs_l2, moment, acc_alpha,
        # acc_beta, gd_alpha, gd_beta.
        self._bias_const[5:13] = (self.learning_rate_bias,
                                  self.weights_decay_bias, self.l1_vs_l2_bias,
                                  self.gradient_moment_bias, self.acc_alpha,
                                  self.acc_beta, self.gd_alpha, self.gd_beta)
        self.krn_bias_.set_args(
            self.device.skip(5), self._bias_const[5:6], self._bias_const[6:7],
            self._bias_const[7:8], self._bias_const[8:9],
            self._bias_const[9:10], self._bias_const[10:11],
            self._bias_const[11:12], self._bias_const[12:13])

        self.execute_kernel(self._global_size_bias, self._local_size_bias,
                            self.krn_bias_)

    def gpu_err_output_update(self):
        """Multiply err_output by activation derivative by output.
        """
        if self.krn_err_output_ is None:
            # No activation derivative kernel assigned (linear activation).
            return
        self.err_output.unmap()
        self.output.unmap()
        self.execute_kernel(self._global_size_err_output,
                            self._local_size_err_output,
                            self.krn_err_output_)

    def numpy_err_output_update(self):
        """Multiply err_output by activation derivative by output.
        """
        pass

    def print_debug_data(self):
        """
        Show weights statistics
        """
        if not self.logger.isEnabledFor(logging.DEBUG):
            return
        # NOTE(review): __init__ sets self.bias to None; this would raise
        # for bias-less layers at DEBUG level — confirm callers always
        # have weights and bias assigned.
        self.weights.map_read()
        self.bias.map_read()
        self.gradient_bias.map_read()
        self.gradient_weights.map_read()
        weights = self.weights.mem
        bias = self.bias.mem
        grad_weights = self.gradient_weights.mem
        grad_bias = self.gradient_bias.mem

        weight_table = PrettyTable("TYPE", "Mean", "StdDev", "Min", "Max")
        weight_table.float_format = ".10"
        for (w_name, w_array) in [("Weight", weights), ("Bias", bias),
                                  ("Grad Weight", grad_weights),
                                  ("Grad Bias", grad_bias)]:
            w_mean = w_stddev = w_min = w_max = None
            if w_array is not None and w_array.size > 0:
                w_mean = numpy.mean(w_array)
                w_stddev = numpy.std(w_array)
                w_min = numpy.min(w_array)
                w_max = numpy.max(w_array)
            weight_table.add_row(w_name, w_mean, w_stddev, w_min, w_max)
        self.debug("\n" + weight_table.get_string())

    def generate_data_for_slave(self, slave):
        # Master -> slave: current hyper-parameters.
        return (self.learning_rate, self.weights_decay, self.gradient_moment,
                self.learning_rate_bias, self.weights_decay_bias,
                self.gradient_moment_bias)

    @staticmethod
    def fill_zeros(vector):
        # Zeroes a device-backed vector in place (no-op for empty ones).
        if not vector:
            return
        vector.map_invalidate()
        vector.mem[:] = 0

    def apply_data_from_master(self, data):
        # Slave side: adopt master's hyper-parameters and clear all local
        # gradient state before the next pass.
        self.learning_rate = data[0]
        self.weights_decay = data[1]
        self.gradient_moment = data[2]
        self.learning_rate_bias = data[3]
        self.weights_decay_bias = data[4]
        self.gradient_moment_bias = data[5]
        self.fill_zeros(self.gradient_weights_with_moment)
        self.fill_zeros(self.gradient_bias_with_moment)
        self.fill_zeros(self.gradient_weights)
        self.fill_zeros(self.gradient_bias)
        self.fill_zeros(self.accumulated_gradient_weights)
        self.fill_zeros(self.accumulated_gradient_bias)

    def generate_data_for_master(self):
        # Slave -> master: send momentum gradients, but only when run()
        # actually produced new ones.
        if not self.gradient_changed:
            return None
        self.gradient_changed = False
        self.gradient_weights_with_moment.map_read()
        self.gradient_bias_with_moment.map_read()
        return (self.gradient_weights_with_moment.mem,
                self.gradient_bias_with_moment.mem)

    def apply_data_from_slave(self, data, slave):
        # Master side: fold slave gradients into local momentum buffers
        # and apply them to weights/bias.
        if self.weights:
            self.weights.map_write()
            self.gradient_weights_with_moment.map_write()
            self.gradient_weights_with_moment.mem *= self.gradient_moment
            self.gradient_weights_with_moment.mem += data[0]
            self.weights.mem += self.gradient_weights_with_moment.mem
        if self.bias:
            self.bias.map_write()
            self.gradient_bias_with_moment.map_write()
            self.gradient_bias_with_moment.mem *= self.gradient_moment_bias
            self.gradient_bias_with_moment.mem += data[1]
            self.bias.mem += self.gradient_bias_with_moment.mem

    def drop_slave(self, slave):
        pass

    def accumulate_gradient_f(self, accumulated_gradient, gradient):
        """Applies the acc/gd accumulation scheme (steps 2-3 in the
        __init__ comment) in place; returns the (possibly mixed) gradient.
        """
        if accumulated_gradient and self.accumulate_gradient:
            accumulated_gradient[:] = (
                gradient * self.acc_alpha +
                (self.acc_beta * accumulated_gradient if self.acc_beta
                 else 0))
            gradient *= self.gd_beta
            gradient += self.gd_alpha * accumulated_gradient
        return gradient

    @staticmethod
    def numpy_gradient_step(weight, gradient, lr, factor_l12, l1_vs_l2,
                            factor_ortho=0, weights_transposed=False):
        """Returns lr-scaled gradient with L1/L2 decay and optional
        orthogonalization penalty added (CPU reference path).
        """
        gradient = gradient.copy()
        # Mix of L2 (1 - l1_vs_l2) and L1 (l1_vs_l2) regularization.
        gradient += factor_l12 * ((1.0 - l1_vs_l2) * weight +
                                  0.5 * l1_vs_l2 * numpy.sign(weight))
        if factor_ortho:
            col_sums = (reshape_transposed(weight).sum(axis=1)
                        if weights_transposed else weight.sum(axis=0))
            for i, row in enumerate(gradient):
                row += (col_sums - weight[i]) * factor_ortho / weight.shape[0]
        gradient *= lr
        return gradient

    def run(self):
        # Mark gradients as fresh for the master/slave protocol, then
        # clear the const-args flag after the kernels have consumed them.
        self.gradient_changed = True
        super(GradientDescentBase, self).run()
        self.ocl_set_const_args = False
class MultiHistogram(Plotter):
    """Plotter for drawing several histograms as a grid of 2D subplots.

    Must be assigned before initialize():
        input
        input_field
    """
    def __init__(self, workflow, **kwargs):
        super(MultiHistogram, self).__init__(workflow, **kwargs)
        self.limit = kwargs.get("limit", 64)       # max histograms drawn
        self.value = Array()                       # per-row bin counts
        self.n_bars = kwargs.get("n_bars", 25)
        self.hist_number = kwargs.get("hist_number", 16)
        self.demand("input")

    def initialize(self, **kwargs):
        super(MultiHistogram, self).initialize(**kwargs)
        if self.hist_number > self.limit:
            self.hist_number = self.limit
        self.value.mem = numpy.zeros(
            [self.hist_number, self.n_bars], dtype=numpy.int64)

    def redraw(self):
        """Draw one bar chart per histogram row on a near-square grid."""
        fig = self.pp.figure(self.name)
        fig.clf()
        fig.patch.set_facecolor('#E8D6BB')
        # fig.patch.set_alpha(0.45)

        n_cols = int(numpy.round(numpy.sqrt(self.value.shape[0])))
        n_rows = int(numpy.ceil(self.value.shape[0] / n_cols))
        i = 0
        for _ in range(0, n_rows):
            for _ in range(0, n_cols):
                ax = fig.add_subplot(n_rows, n_cols, i + 1)
                ax.cla()
                # ax.axis('off')
                ax.patch.set_facecolor('#ffe6ca')
                # ax.set_xlabel("Input Data", fontsize=10)
                # ax.set_ylabel("Number", fontsize=10)
                ymin = self.value[i].min()
                ymax = self.value[i].max()
                xmin = self.input[i].min()
                xmax = self.input[i].max()
                ax.axis([xmin, xmax + ((xmax - xmin) / self.n_bars),
                         ymin, ymax])
                ax.grid(True)
                ax.set_title(self.name.replace("Histogram ", ""))
                nbars = self.n_bars
                width = ((xmax - xmin) / nbars) * 0.8
                X = numpy.linspace(xmin, xmax, num=nbars, endpoint=True)
                Y = self.value[i]
                # Thinner edge color distinction for dense grids.
                if (n_rows > 5) or (n_cols > 5):
                    ax.bar(X, Y, color='#ffa0ef', width=width,
                           edgecolor='red')
                else:
                    ax.bar(X, Y, color='#ffa0ef', width=width,
                           edgecolor='lavender')
                if n_rows > 4:
                    ax.set_yticklabels([])
                if n_cols > 3:
                    ax.set_xticklabels([])
                i += 1
                if i >= self.value.shape[0]:
                    break
            if i >= self.value.shape[0]:
                break
        self.show_figure(fig)
        fig.canvas.draw()
        return fig

    def fill(self):
        """Recompute the histogram counts from the current input.

        Row i of ``value`` receives an ``n_bars``-bin histogram of
        ``input[i]``.  Rows whose input has zero range (min == max) are
        cleared and skipped instead of aborting the whole refill.
        """
        # Map once outside the loop: both vectors are used for every row.
        self.value.map_write()
        self.input.map_read()
        for i in range(self.hist_number):
            mx = self.input.mem[i].max()
            mi = self.input.mem[i].min()
            d = mx - mi
            self.value[i] = 0
            if not d:
                # BUGFIX: previously this was ``return``, so a single
                # constant-valued input row left every remaining
                # histogram stale; now that row stays zeroed and the
                # rest are still recomputed.
                continue
            d = (self.n_bars - 1) / d
            for x in self.input.mem[i]:
                i_bar = int(numpy.floor((x - mi) * d))
                self.value[i, i_bar] += 1
class GradientDescentBase(AcceleratedUnit):
    """Base class for gradient descent units.

    Attributes:
        input: input layer values.
        output: output layer values.
        err_output: error to backpropagate.
        err_input: backpropagated error.
        weights: weights.
        bias: bias.
        batch_size: current minibatch size.
        learning_rate: gradient descent speed (positive).
        learning_rate_bias
        weights_decay: regularization for weights (see l1_vs_l2).
        weights_decay_bias
        gradient_moment: moment coefficient for weights.
        gradient_moment_bias
        gradient_weights_with_moment: accumulated moment.
        gradient_bias_with_moment
        batch_size: effective batch size (if None, get it from y).
        weights_transposed: assume weights matrix as a transposed one.
        apply_gradient: will apply gradient.
        gradient_changed: when True, slave will send gradients to master
            (assigned to True just before the run call, so it can be set to
            False inside ocl_run, numpy_run if necessary).
        ocl_set_const_args: True when constant arguments for the kernel had
            been changed and need to be set again.
    """

    hide_from_registry = True
    MAPPING = set()
    REDUCE_SIZE = 64  # used for updating bias

    def __init__(self, workflow, **kwargs):
        kwargs["view_group"] = kwargs.get("view_group", "TRAINER")
        super(GradientDescentBase, self).__init__(workflow, **kwargs)
        self.err_input = Array(shallow_pickle=True)
        self.ocl_set_const_args = True
        self.weights = None
        self.bias = None
        self.demand("input", "err_output")
        self.learning_rate = kwargs.get("learning_rate", 0.01)
        self.learning_rate_bias = kwargs.get("learning_rate_bias", self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", 0.00005)
        self.weights_decay_bias = kwargs.get("weights_decay_bias", 0.0)
        self.l1_vs_l2 = kwargs.get("l1_vs_l2", 0)
        self.l1_vs_l2_bias = kwargs.get("l1_vs_l2_bias", self.l1_vs_l2)
        self.gradient_moment = kwargs.get("gradient_moment", 0)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias", self.gradient_moment)
        self.weights_transposed = kwargs.get("weights_transposed", False)
        self.need_err_input = kwargs.get("need_err_input", True)
        self.include_bias = kwargs.get("include_bias", True)
        self.factor_ortho = kwargs.get("factor_ortho", 0)
        self.col_sums = Array()  # for orthogonalization
        # Current gradient as it is without applying learning_rate etc.
        self.gradient_weights = Array()
        self.gradient_bias = Array()
        # Gradient with applied learning_rate etc.
        # optionally accumulated from the previous run
        self.accumulate_gradient = kwargs.get("accumulate_gradient", False)
        # When accumulate_gradient set to True:
        # 1. Calculate gd
        # 2. acc = acc_alpha * gd + acc_beta * acc
        # 3. gd = gd_alpha * acc + gd_beta * gd
        # 4. Apply moments to gd
        # 5. weights += gd if apply_gradient set to True
        self.acc_alpha = kwargs.get("acc_alpha", 0.0)
        self.acc_beta = kwargs.get("acc_beta", 0.0)
        self.gd_alpha = kwargs.get("gd_alpha", 0.0)
        self.gd_beta = kwargs.get("gd_beta", 1.0)
        self.accumulated_gradient_weights = Array()
        self.accumulated_gradient_bias = Array()
        # Gradient with accumulated moments
        self.gradient_weights_with_moment = Array()
        self.gradient_bias_with_moment = Array()
        # Sets to True when gradient changes
        self.gradient_changed = False
        # Gradient will be applied to weights immediately just after computing
        self.apply_gradient = kwargs.get("apply_gradient", not workflow.is_slave)

    @property
    def current_batch_size(self):
        # Fall back to the minibatch dimension of err_output when the
        # batch_size attribute was never linked externally.
        batch_size = getattr(self, "batch_size", None)
        if batch_size is None:
            return self.err_output.mem.shape[0]
        return int(batch_size)

    def initialize(self, device, **kwargs):
        super(GradientDescentBase, self).initialize(device, **kwargs)

        if self.weights:
            assert len(self.weights.shape) == 2
            # Logical (rows, cols) after honoring weights_transposed.
            self.weights_shape = tuple(reversed(self.weights.shape)) if self.weights_transposed else self.weights.shape
        else:
            self.weights_shape = None

        # kwargs passed to initialize() may override the ctor values.
        self.learning_rate = kwargs.get("learning_rate", self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", self.weights_decay)
        self.gradient_moment = kwargs.get("gradient_moment", self.gradient_moment)
        self.learning_rate_bias = kwargs.get("learning_rate_bias", self.learning_rate_bias)
        self.weights_decay_bias = kwargs.get("weights_decay_bias", self.weights_decay_bias)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias", self.gradient_moment_bias)

        # Allocate (or validate) each gradient buffer lazily, matching the
        # corresponding parameter's size.
        if self.weights:
            if not self.gradient_weights:
                self.gradient_weights.reset(numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights.size == self.weights.size

        if self.weights and self.accumulate_gradient:
            if not self.accumulated_gradient_weights:
                self.accumulated_gradient_weights.reset(numpy.zeros_like(self.weights.mem))
            else:
                assert self.accumulated_gradient_weights.size == self.weights.size

        if self.weights and (self.gradient_moment or not self.is_standalone):
            if not self.gradient_weights_with_moment:
                self.gradient_weights_with_moment.reset(numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights_with_moment.size == self.weights.size

        if self.include_bias and self.bias and (not self.gradient_bias or self.gradient_bias.size != self.bias.size):
            self.gradient_bias.reset(numpy.zeros_like(self.bias.mem))

        if (
            self.include_bias and self.bias and self.accumulate_gradient and
            (not self.accumulated_gradient_bias or
             self.accumulated_gradient_bias.size != self.bias.size)
        ):
            self.accumulated_gradient_bias.reset(numpy.zeros_like(self.bias.mem))

        if self.include_bias and self.bias and (self.gradient_moment_bias or not self.is_standalone):
            if not self.gradient_bias_with_moment:
                self.gradient_bias_with_moment.reset(numpy.zeros_like(self.bias.mem))
            else:
                assert self.gradient_bias_with_moment.size == self.bias.size

        dtype = self.err_output.dtype
        if self.need_err_input:
            if not self.err_input:
                self.err_input.reset(numpy.zeros(self.input.shape, dtype))
            else:
                assert self.err_input.shape == self.input.shape

        if self.weights:
            side = self.weights_shape[0]
            other = self.weights.size // side
            if self.factor_ortho:
                if not self.col_sums:
                    self.col_sums.reset(numpy.zeros(other, dtype=dtype))
                else:
                    assert self.col_sums.size == other
                self.col_sums.initialize(self.device)
            # NOTE(review): self.reduce_size is read here but no earlier
            # assignment is visible in this chunk (only the class attribute
            # REDUCE_SIZE); presumably it is set elsewhere -- verify.
            self.reduce_size = roundup(min(self.reduce_size, other), 32)
            self.weights.initialize(self.device)

        for vec in self.bias, self.input, self.err_input:
            if vec:
                vec.initialize(self.device)
        self.init_vectors(
            self.err_output,
            self.gradient_weights,
            self.gradient_bias,
            self.accumulated_gradient_weights,
            self.accumulated_gradient_bias,
            self.gradient_weights_with_moment,
            self.gradient_bias_with_moment,
        )

    def gpu_weights_update(self):
        self.unmap_vectors(
            self.input,
            self.err_output,
            self.weights,
            self.gradient_weights,
            self.accumulated_gradient_weights,
            self.gradient_weights_with_moment,
        )

        if self.factor_ortho:
            # Column sums feed the orthogonalization term; arg slot 12
            # of the weights kernel carries factor_ortho.
            self.col_sums.unmap()
            self.execute_kernel(self._global_size_ortho,
                                self._local_size_ortho,
                                self.krn_compute_col_sums_)
            self._weights_const[12] = self.factor_ortho
            self.krn_weights_.set_arg(12, self._weights_const[12:13])

        # Arg slots 4..11: lr, decay, l1_vs_l2, moment, acc/gd mix factors.
        self._weights_const[4:12] = (
            self.learning_rate,
            self.weights_decay,
            self.l1_vs_l2,
            self.gradient_moment,
            self.acc_alpha,
            self.acc_beta,
            self.gd_alpha,
            self.gd_beta,
        )
        self.krn_weights_.set_args(
            self.device.skip(4),
            self._weights_const[4:5],
            self._weights_const[5:6],
            self._weights_const[6:7],
            self._weights_const[7:8],
            self._weights_const[8:9],
            self._weights_const[9:10],
            self._weights_const[10:11],
            self._weights_const[11:12],
        )
        self.execute_kernel(self._global_size_weights,
                            self._local_size_weights, self.krn_weights_)

    def gpu_bias_update(self):
        if not self.include_bias:
            return

        self.unmap_vectors(
            self.err_output,
            self.bias,
            self.gradient_bias,
            self.accumulated_gradient_bias,
            self.gradient_bias_with_moment,
        )

        # Arg slots 5..12 mirror the weights kernel but with bias params.
        self._bias_const[5:13] = (
            self.learning_rate_bias,
            self.weights_decay_bias,
            self.l1_vs_l2_bias,
            self.gradient_moment_bias,
            self.acc_alpha,
            self.acc_beta,
            self.gd_alpha,
            self.gd_beta,
        )
        self.krn_bias_.set_args(
            self.device.skip(5),
            self._bias_const[5:6],
            self._bias_const[6:7],
            self._bias_const[7:8],
            self._bias_const[8:9],
            self._bias_const[9:10],
            self._bias_const[10:11],
            self._bias_const[11:12],
            self._bias_const[12:13],
        )
        self.execute_kernel(self._global_size_bias, self._local_size_bias,
                            self.krn_bias_)

    def gpu_err_output_update(self):
        """Multiply err_output by activation derivative by output.
        """
        # No kernel means a linear activation: derivative is 1, nothing to do.
        if self.krn_err_output_ is None:
            return
        self.err_output.unmap()
        self.output.unmap()
        self.execute_kernel(self._global_size_err_output,
                            self._local_size_err_output,
                            self.krn_err_output_)

    def numpy_err_output_update(self):
        """Multiply err_output by activation derivative by output.
        """
        pass

    def print_debug_data(self):
        """
        Show weights statistics
        """
        # Gathering stats maps four vectors; skip unless DEBUG logging is on.
        if not self.logger.isEnabledFor(logging.DEBUG):
            return
        self.weights.map_read()
        self.bias.map_read()
        self.gradient_bias.map_read()
        self.gradient_weights.map_read()
        weights = self.weights.mem
        bias = self.bias.mem
        grad_weights = self.gradient_weights.mem
        grad_bias = self.gradient_bias.mem

        weight_table = PrettyTable("TYPE", "Mean", "StdDev", "Min", "Max")
        weight_table.float_format = ".10"
        for (w_name, w_array) in [
                ("Weight", weights),
                ("Bias", bias),
                ("Grad Weight", grad_weights),
                ("Grad Bias", grad_bias),
        ]:
            w_mean = w_stddev = w_min = w_max = None
            if w_array is not None and w_array.size > 0:
                w_mean = numpy.mean(w_array)
                w_stddev = numpy.std(w_array)
                w_min = numpy.min(w_array)
                w_max = numpy.max(w_array)
            weight_table.add_row(w_name, w_mean, w_stddev, w_min, w_max)
        self.debug("\n" + weight_table.get_string())

    def generate_data_for_slave(self, slave):
        # Master -> slave: broadcast the current hyperparameters
        # (unpacked positionally in apply_data_from_master).
        return (
            self.learning_rate,
            self.weights_decay,
            self.gradient_moment,
            self.learning_rate_bias,
            self.weights_decay_bias,
            self.gradient_moment_bias,
        )

    @staticmethod
    def fill_zeros(vector):
        # No-op for unallocated vectors (an empty Array is falsy).
        if not vector:
            return
        vector.map_invalidate()
        vector.mem[:] = 0

    def apply_data_from_master(self, data):
        # Tuple layout must match generate_data_for_slave above.
        self.learning_rate = data[0]
        self.weights_decay = data[1]
        self.gradient_moment = data[2]
        self.learning_rate_bias = data[3]
        self.weights_decay_bias = data[4]
        self.gradient_moment_bias = data[5]
        # Start every job from a clean gradient state.
        self.fill_zeros(self.gradient_weights_with_moment)
        self.fill_zeros(self.gradient_bias_with_moment)
        self.fill_zeros(self.gradient_weights)
        self.fill_zeros(self.gradient_bias)
        self.fill_zeros(self.accumulated_gradient_weights)
        self.fill_zeros(self.accumulated_gradient_bias)

    def generate_data_for_master(self):
        # Only ship gradients when run() actually produced new ones.
        if not self.gradient_changed:
            return None
        self.gradient_changed = False
        self.gradient_weights_with_moment.map_read()
        self.gradient_bias_with_moment.map_read()
        return (self.gradient_weights_with_moment.mem,
                self.gradient_bias_with_moment.mem)

    def apply_data_from_slave(self, data, slave):
        # Fold a slave's gradients into the master's parameters with the
        # momentum rule: m = moment * m + grad; w += m.
        if self.weights:
            self.weights.map_write()
            self.gradient_weights_with_moment.map_write()
            self.gradient_weights_with_moment.mem *= self.gradient_moment
            self.gradient_weights_with_moment.mem += data[0]
            self.weights.mem += self.gradient_weights_with_moment.mem
        if self.bias:
            self.bias.map_write()
            self.gradient_bias_with_moment.map_write()
            self.gradient_bias_with_moment.mem *= self.gradient_moment_bias
            self.gradient_bias_with_moment.mem += data[1]
            self.bias.mem += self.gradient_bias_with_moment.mem

    def drop_slave(self, slave):
        pass

    def accumulate_gradient_f(self, accumulated_gradient, gradient):
        # acc = acc_alpha * gd + acc_beta * acc;
        # gd = gd_alpha * acc + gd_beta * gd  (gradient is updated in place).
        if accumulated_gradient and self.accumulate_gradient:
            accumulated_gradient[:] = gradient * self.acc_alpha + (
                self.acc_beta * accumulated_gradient if self.acc_beta else 0
            )
            gradient *= self.gd_beta
            gradient += self.gd_alpha * accumulated_gradient
        return gradient

    @staticmethod
    def numpy_gradient_step(weight, gradient, lr, factor_l12, l1_vs_l2,
                            factor_ortho=0, weights_transposed=False):
        # Blend of L2 and L1 regularization controlled by l1_vs_l2;
        # optional orthogonalization pressure via per-column sums.
        gradient = gradient.copy()
        gradient += factor_l12 * ((1.0 - l1_vs_l2) * weight + 0.5 * l1_vs_l2 * numpy.sign(weight))
        if factor_ortho:
            col_sums = reshape_transposed(weight).sum(axis=1) if weights_transposed else weight.sum(axis=0)
            for i, row in enumerate(gradient):
                row += (col_sums - weight[i]) * factor_ortho / weight.shape[0]
        gradient *= lr
        return gradient

    def run(self):
        # Flag fresh gradients for the master; after the backend run the
        # kernel constants are known to be set, so drop the flag.
        self.gradient_changed = True
        super(GradientDescentBase, self).run()
        self.ocl_set_const_args = False
class Cutter1D(AcceleratedUnit):
    """Cuts the specified interval from each 1D sample of input batch
    into output.

    y = alpha * x + beta * y
    """
    def __init__(self, workflow, **kwargs):
        super(Cutter1D, self).__init__(workflow, **kwargs)
        self.alpha = kwargs.get("alpha")
        self.beta = kwargs.get("beta")
        self.output_offset = kwargs.get("output_offset", 0)
        self.output = Array()
        self.demand("alpha", "beta", "input")
        # TODO: add input_offset and length to demand and not to crash lstm
        # TODO: unit test

    def init_unpickled(self):
        super(Cutter1D, self).init_unpickled()
        self.sources_["cutter"] = {}

    def initialize(self, device, **kwargs):
        super(Cutter1D, self).initialize(device, **kwargs)
        # NOTE(review): self.length and self.input_offset are read by this
        # class but never assigned here; presumably they are linked from
        # outside (see the TODO above) -- verify.
        if not self.output or self.output.shape[0] != self.input.shape[0]:
            self.output.reset(
                numpy.zeros(
                    (self.input.shape[0], self.output_offset + self.length),
                    dtype=self.input.dtype))
        else:
            assert self.output.sample_size >= self.output_offset + self.length
        self.init_vectors(self.input, self.output)

    def cuda_init(self):
        dtype = self.input.dtype
        itemsize = self.input.itemsize
        limit = self.input.shape[0] * self.length
        self.build_program({}, "%s" % self.__class__.__name__, dtype=dtype)
        self.assign_kernel("cutter_1d_forward")
        # CUDA variant pre-offsets the device pointers by the cut offsets,
        # so the kernel only receives lengths and strides.
        self.set_args(
            int(self.input.devmem) + self.input_offset * itemsize,
            numpy.array([self.alpha], dtype=dtype),
            numpy.array([self.input.sample_size], dtype=numpy.int32),
            int(self.output.devmem) + self.output_offset * itemsize,
            numpy.array([self.beta], dtype=dtype),
            numpy.array([self.output.sample_size], dtype=numpy.int32),
            numpy.array([self.length], dtype=numpy.int32),
            numpy.array([limit], dtype=numpy.int32))
        block_size = self.device.suggest_block_size(self._kernel_)
        self._global_size = (int(numpy.ceil(limit / block_size)), 1, 1)
        self._local_size = (block_size, 1, 1)

    def ocl_init(self):
        dtype = self.input.dtype
        self.build_program({}, "%s" % self.__class__.__name__, dtype=dtype)
        self.assign_kernel("cutter_1d_forward")
        # OpenCL variant passes the offsets as explicit kernel arguments
        # instead of pre-offsetting device pointers.
        self.set_args(
            self.input.devmem,
            numpy.array([self.input_offset], dtype=numpy.int32),
            numpy.array([self.alpha], dtype=dtype),
            numpy.array([self.input.sample_size], dtype=numpy.int32),
            self.output.devmem,
            numpy.array([self.output_offset], dtype=numpy.int32),
            numpy.array([self.beta], dtype=dtype),
            numpy.array([self.output.sample_size], dtype=numpy.int32))
        self._global_size = (self.input.shape[0], self.length)
        self._local_size = None

    def _gpu_run(self):
        self.unmap_vectors(self.input, self.output)
        self.execute_kernel(self._global_size, self._local_size)

    def cuda_run(self):
        return self._gpu_run()

    def ocl_run(self):
        return self._gpu_run()

    def numpy_run(self):
        # CPU reference path: y = alpha * x + beta * y over the interval.
        self.input.map_read()
        self.output.map_write()
        out = self.output.matrix[:, self.output_offset:self.output_offset +
                                 self.length]
        if self.beta:
            out *= self.beta
        else:
            out[:] = 0
        out += (self.input.matrix[:, self.input_offset:self.input_offset +
                                  self.length] * self.alpha)
class Cutter1D(AcceleratedUnit):
    """Copies a 1D slice out of every sample of the input batch.

    Computes y = alpha * x + beta * y over the selected interval.
    """
    def __init__(self, workflow, **kwargs):
        super(Cutter1D, self).__init__(workflow, **kwargs)
        self.alpha = kwargs.get("alpha")
        self.beta = kwargs.get("beta")
        self.output_offset = kwargs.get("output_offset", 0)
        self.output = Array()
        self.demand("alpha", "beta", "input")
        # TODO: add input_offset and length to demand and not to crash lstm
        # TODO: unit test

    def init_unpickled(self):
        super(Cutter1D, self).init_unpickled()
        self.sources_["cutter"] = {}

    def initialize(self, device, **kwargs):
        super(Cutter1D, self).initialize(device, **kwargs)
        rows = self.input.shape[0]
        needed = self.output_offset + self.length
        if not self.output or self.output.shape[0] != rows:
            self.output.reset(
                numpy.zeros((rows, needed), dtype=self.input.dtype))
        else:
            assert self.output.sample_size >= needed
        self.init_vectors(self.input, self.output)

    def cuda_init(self):
        dt = self.input.dtype
        isize = self.input.itemsize
        total = self.input.shape[0] * self.length

        def i32(value):
            # Kernel scalar arguments travel as 1-element int32 arrays.
            return numpy.array([value], dtype=numpy.int32)

        self.build_program({}, "%s" % self.__class__.__name__, dtype=dt)
        self.assign_kernel("cutter_1d_forward")
        # Device pointers are pre-offset so the kernel starts at the cut.
        self.set_args(
            int(self.input.devmem) + self.input_offset * isize,
            numpy.array([self.alpha], dtype=dt),
            i32(self.input.sample_size),
            int(self.output.devmem) + self.output_offset * isize,
            numpy.array([self.beta], dtype=dt),
            i32(self.output.sample_size),
            i32(self.length),
            i32(total))
        bs = self.device.suggest_block_size(self._kernel_)
        self._global_size = (int(numpy.ceil(total / bs)), 1, 1)
        self._local_size = (bs, 1, 1)

    def ocl_init(self):
        dt = self.input.dtype

        def i32(value):
            # Kernel scalar arguments travel as 1-element int32 arrays.
            return numpy.array([value], dtype=numpy.int32)

        self.build_program({}, "%s" % self.__class__.__name__, dtype=dt)
        self.assign_kernel("cutter_1d_forward")
        # Offsets are passed to the kernel explicitly on this backend.
        self.set_args(
            self.input.devmem,
            i32(self.input_offset),
            numpy.array([self.alpha], dtype=dt),
            i32(self.input.sample_size),
            self.output.devmem,
            i32(self.output_offset),
            numpy.array([self.beta], dtype=dt),
            i32(self.output.sample_size))
        self._global_size = (self.input.shape[0], self.length)
        self._local_size = None

    def _gpu_run(self):
        self.unmap_vectors(self.input, self.output)
        self.execute_kernel(self._global_size, self._local_size)

    def cuda_run(self):
        return self._gpu_run()

    def ocl_run(self):
        return self._gpu_run()

    def numpy_run(self):
        # CPU reference path: scale the destination slab by beta (or clear
        # it), then add alpha times the source slab.
        self.input.map_read()
        self.output.map_write()
        dst_span = slice(self.output_offset, self.output_offset + self.length)
        src_span = slice(self.input_offset, self.input_offset + self.length)
        dst = self.output.matrix[:, dst_span]
        if self.beta:
            dst *= self.beta
        else:
            dst[:] = 0
        dst += self.input.matrix[:, src_span] * self.alpha
class KohonenForward(KohonenBase, AcceleratedUnit):
    """Kohonen forward layer.

    Must be assigned before initialize():
        input
        weights
        minibatch_offset (if total == True)
        minibatch_size (if total == True)
        batch_size (if total == True)
        argmins  speeds up run() if linked from KohonenTrainer

    Updates after run():
        output

    Creates within initialize():
        output

    Attributes:
        input: input as batch of samples.
        weights: the weights of the neurons in Kohonen layer.
        output: the list of winners.
        total: if total=True is passed in __init__(), the overall winners
            table
    """
    def __init__(self, workflow, **kwargs):
        super(KohonenForward, self).__init__(workflow, **kwargs)
        self.demand("input", "weights")
        self.argmins = None
        self._distances = Array()
        self.output = Array()
        self._chunk_size_ = 0
        self.weights_transposed = False
        self.total = Array() if kwargs.get("total", False) else None
        if self.total is not None:
            # These must be linked externally before initialize().
            self.minibatch_offset = None
            self.minibatch_size = None
            self.batch_size = None

    def init_unpickled(self):
        super(KohonenForward, self).init_unpickled()
        self.sources_["kohonen"] = {"FORWARD": 1}

    @property
    def neurons_number(self):
        return self.weights.mem.shape[0]

    @property
    def sample_length(self):
        return self.weights.mem.shape[1]

    @property
    def chunk_size(self):
        return self._chunk_size_

    def initialize(self, device, **kwargs):
        super(KohonenForward, self).initialize(device=device, **kwargs)

        assert self.input.mem.shape[1] == self.sample_length
        batch_size = self.input.mem.shape[0]

        self.output.reset(numpy.zeros(batch_size, dtype=numpy.int32))
        # Distances are only needed when winners are computed here rather
        # than taken from a linked KohonenTrainer.
        if self.argmins is None:
            self._distances.reset(numpy.zeros(
                [batch_size, self.neurons_number],
                dtype=self.weights.mem.dtype))

        if self.total is not None:
            self.total.reset(numpy.zeros(self.batch_size, dtype=numpy.int32))
            self._minibatch_offset_ = numpy.zeros(1, dtype=numpy.int32)

    def ocl_init(self):
        batch_size = self.input.mem.shape[0]
        self.output.initialize(self.device)
        if self.argmins is None:
            self.input.initialize(self.device)
            self.weights.initialize(self.device)
            self._distances.initialize(self.device)
        elif self.total is None:
            # Winners come from argmins and no overall table is kept:
            # nothing to set up on the device.
            return
        if self.total is not None:
            self.total.initialize(self.device)

        copy_chunk_size = int(numpy.ceil(batch_size /
                                         self.device.max_group_size))
        # Split the neurons across work items; keep at least 2 per item.
        chunk_size = self.neurons_number // self.device.max_group_size
        if chunk_size < 2:
            chunk_size = self.neurons_number // 2 + 1
        self.argmin_group_size = \
            int(numpy.ceil(self.neurons_number / chunk_size))

        block_size, vector_opt = self.device.device_info.get_kernel_bs_vo(
            kernel="matrix_multiplication", dtype=self.input.dtype)

        defines = {
            'BLOCK_SIZE': block_size,
            'VECTOR_OPT': int(bool(vector_opt)),
            'BATCH': batch_size,
            'SAMPLE_LENGTH': self.sample_length,
            'NEURONS_NUMBER': self.neurons_number,
            'CHUNK_SIZE': chunk_size,
            'COPY_CHUNK_SIZE': copy_chunk_size,
        }
        if self.weights_transposed:
            defines['WEIGHTS_TRANSPOSED'] = 1
        self.build_program(defines, "%s_%d_%d_%d" %
                           (self.__class__.__name__,
                            batch_size, self.sample_length,
                            self.neurons_number),
                           dtype=self.weights.mem.dtype)

        if self.total is not None:
            self._set_total_global_size_ = \
                [int(numpy.ceil(batch_size / copy_chunk_size))]
            self._krn_set_total_ = self.get_kernel("set_total")
            # Arg 1 (the minibatch offset) is set per-run in ocl_run.
            self._krn_set_total_.set_args(self.output.devmem, cl.skip,
                                          self.total.devmem)
        if self.argmins is not None:
            return

        self._krn_distances_ = self.get_kernel("calculate_distances")
        self._krn_distances_.set_args(self.input.devmem, self.weights.devmem,
                                      self._distances.devmem)

        self._krn_argmin_ = self.get_kernel("calculate_argmin")
        self._krn_argmin_.set_args(self._distances.devmem, self.output.devmem,
                                   None)

        self._gs_distance = [
            roundup(self.neurons_number, block_size),
            roundup(batch_size, block_size)]
        self._ls_distance = [block_size, block_size]

    def ocl_run(self):
        self.output.unmap()
        if self.total is not None:
            self.total.unmap()

        if self.argmins is None:
            # Compute distances to every neuron, then pick the winners.
            self.input.unmap()
            self.weights.unmap()
            self.execute_kernel(self._gs_distance, self._ls_distance,
                                self._krn_distances_)
            self.execute_kernel([self.argmin_group_size],
                                [self.argmin_group_size],
                                self._krn_argmin_)
        else:
            # Winners were already computed by the trainer: copy them over.
            self.argmins.unmap()
            self.argmins.map_read()
            self.output.map_write()
            self.output.mem[:] = self.argmins.mem
            self.output.unmap()
            self.argmins.unmap()

        if self.total is not None:
            # The minibatch ends at minibatch_offset, so it starts at
            # minibatch_offset - minibatch_size.
            self._minibatch_offset_[0] = \
                self.minibatch_offset - self.minibatch_size
            self._krn_set_total_.set_arg(1, self._minibatch_offset_)
            self.execute_kernel(self._set_total_global_size_, None,
                                self._krn_set_total_)

    def numpy_run(self):
        self.output.map_invalidate()
        if self.argmins is not None:
            self.argmins.map_read()
            self.output.mem[:] = self.argmins.mem
        else:
            self.input.map_read()
            self.weights.map_read()
        if self.total is not None:
            self.total.map_invalidate()

        length = self.minibatch_size if self.total is not None \
            else self.input.mem.shape[0]
        for sindex in range(length):
            if self.argmins is None:
                # Winner = neuron with minimal distance to the sample.
                dist = self.weights.mem - self.input[sindex]
                winner = numpy.argmin(self.numpy_linalg_norm(dist))
                self.output[sindex] = winner
            else:
                winner = self.argmins[sindex]
            if self.total is not None:
                index = sindex + self.minibatch_offset - self.minibatch_size
                self.total[index] = winner
class Uniform(AcceleratedUnit):
    """Generates random numbers from uniform distribution.

    Attributes:
        num_states: number of random states for parallel generation.
        states: Array of random states.
        prng: veles.prng.RandomGenerator for initial states generation.
        output_bytes: number of output bytes to generate.
    """
    backend_methods = AcceleratedUnit.backend_methods + ("fill",)

    def __init__(self, workflow, **kwargs):
        super(Uniform, self).__init__(workflow, **kwargs)
        self.num_states = kwargs.get("num_states", 256)
        self.states = Array()
        self.prng = kwargs.get("prng", get())
        self.output_bytes = kwargs.get("output_bytes", 0)
        self.output = Array()
        self.cl_const = numpy.zeros(1, dtype=numpy.int32)

    def init_unpickled(self):
        super(Uniform, self).init_unpickled()
        self.sources_["random"] = {}

    def initialize(self, device, **kwargs):
        super(Uniform, self).initialize(device, **kwargs)

        # Each state is 16 x 64-bit words, stored as 32 x uint32.
        # NOTE(review): the size check compares against num_states * 16
        # while the buffer is allocated with num_states * 16 * 2 elements,
        # so the condition looks always true and the states get re-seeded
        # on every initialize -- verify intent.
        if not self.states or self.states.size != self.num_states * 16:
            self.states.reset(numpy.empty(self.num_states * 16 * 2,
                                          dtype=numpy.uint32))
            self.states.mem[:] = self.prng.randint(0, (1 << 32) + 1,
                                                   self.states.size)

        if not self.output or self.output.nbytes < self.output_bytes:
            # One generation round yields num_states * 16 uint64 values,
            # i.e. num_states * 16 * 8 bytes; round the request up to that.
            self.output_bytes = roundup(self.output_bytes,
                                        self.num_states * 16 * 8)
            self.output.reset(numpy.zeros(self.output_bytes, numpy.uint8))
        else:
            self.output_bytes = self.output.nbytes

        self.init_vectors(self.states, self.output)

    def _gpu_init(self):
        self.build_program({}, "uniform_%d" % self.num_states)
        self.assign_kernel("random_xorshift1024star")
        self.set_args(self.states, self.cl_const, self.output)

    def ocl_init(self):
        self._gpu_init()
        self._global_size = [self.num_states]
        self._local_size = None

    def cuda_init(self):
        self._gpu_init()
        # Factor num_states into grid x block with a power-of-two block
        # size capped at 32.
        n = self.num_states
        l = 1
        while not (n & 1) and l < 32:
            n >>= 1
            l <<= 1
        self._global_size = (n, 1, 1)
        self._local_size = (l, 1, 1)

    def _gpu_fill(self, nbytes):
        bytes_per_round = self.num_states * 16 * 8
        nbytes = roundup(nbytes, bytes_per_round)
        if nbytes > self.output.nbytes:
            raise error.Bug("nbytes > self.output.nbytes")
        self.unmap_vectors(self.states, self.output)
        # The kernel's second argument is the number of rounds to run.
        self.cl_const[0] = nbytes // bytes_per_round
        self.set_arg(1, self.cl_const)
        self.execute_kernel(self._global_size, self._local_size)

    def ocl_fill(self, nbytes):
        self._gpu_fill(nbytes)

    def cuda_fill(self, nbytes):
        self._gpu_fill(nbytes)

    def numpy_fill(self, nbytes):
        # CPU reference implementation of the generator; mirrors the
        # per-state round structure of the GPU kernel.
        bytes_per_round = self.num_states * 16 * 8
        nbytes = roundup(nbytes, bytes_per_round)
        if nbytes > self.output.nbytes:
            raise error.Bug("nbytes > self.output.nbytes")
        self.states.map_write()
        self.output.map_invalidate()
        n_rounds = nbytes // bytes_per_round

        # Scratch uint64 singletons reused by _next_rand to keep all
        # arithmetic in wrapping 64-bit space.
        u64 = numpy.array([1181783497276652981], dtype=numpy.uint64)
        s0 = numpy.zeros(1, dtype=numpy.uint64)
        s1 = numpy.zeros(1, dtype=numpy.uint64)

        states = self.states.mem.view(dtype=numpy.uint64)
        states = states.reshape(states.size // 16, 16)
        output = self.output.mem.view(dtype=numpy.uint64)
        for i in range(self.num_states):
            # State i writes the strided slots i, i + num_states, ...
            offs = i
            s = states[i]
            self.p = 0
            for _round in range(n_rounds):
                for _iter in range(16):
                    output[offs] = self._next_rand(s, s0, s1, u64)
                    offs += self.num_states

    def _next_rand(self, s, s0, s1, u64):
        # One generator step operating in place on state s; self.p is the
        # rotating index into the 16-word state.
        s0[0] = s[self.p]
        self.p = (self.p + 1) & 15
        s1[0] = s[self.p]
        s1 ^= s1 << 31
        s1 ^= s1 >> 11
        s0 ^= s0 >> 30
        s0 ^= s1
        s[self.p] = s0[0]
        # Final multiplication by the constant decorrelates the output.
        return (s0 * u64)[0]

    def fill(self, nbytes):
        # Dispatch to the backend-specific *_fill implementation.
        self._backend_fill_(nbytes)

    def ocl_run(self):
        """Generates random numbers to fill the whole output buffer."""
        self.ocl_fill(self.output.nbytes)

    def cuda_run(self):
        """Generates random numbers to fill the whole output buffer."""
        self.cuda_fill(self.output.nbytes)

    def numpy_run(self):
        """Generates random numbers to fill the whole output buffer."""
        self.numpy_fill(self.output.nbytes)