def softplus(self, activations, bias, dest=None):
    """
    Apply a biased softplus, ``log(1 + exp(activations + bias))``, element-wise.

    ``bias`` is looked up with the second index of ``activations``. The sum
    is clamped to [-45, 45] before the exponential is taken. The result is
    written to ``dest``; when ``dest`` is omitted the operation is done in
    place on ``activations``. The compiled kernel is cached per
    (shape, thread).

    Returns the array that received the result.
    """
    cache = self.kernel_cache
    compile_thread = self.thread
    if dest is None:
        dest = activations

    key = (self.softplus, activations.shape, compile_thread)
    if key not in cache:
        log.info("compiling " + str(key))
        # One bias entry is expected per column of ``activations``.
        assert activations.shape[1] == bias.shape[0]
        arguments = [
            Parameter('activations', Annotation(activations, 'i')),
            Parameter('bias', Annotation(bias, 'i')),
            Parameter('dest', Annotation(dest, 'o')),
        ]
        code = """
        ${activations.ctype} a = ${activations.load_same};
        ${bias.ctype} b = ${bias.load_idx}(${idxs[1]});
        a += b;
        a = min(max(-45.0f, a), 45.0f);
        a = log(1.0f + exp(a));
        ${dest.store_same}(a);
        """
        computation = PureParallel(arguments, code, guiding_array='activations')
        cache[key] = computation.compile(compile_thread, fast_math=True)

    # Run the (possibly just compiled) kernel.
    cache[key](activations, bias, dest)
    return dest
def nan_to_zeros(self, array, dest=None):
    """
    Copy ``array`` to ``dest`` with every NaN element replaced by zero.

    When ``dest`` is omitted the replacement is done in place on ``array``.
    Every element is stored, so a separate ``dest`` receives the non-NaN
    values as well (previously only the zeros were written, leaving the
    rest of ``dest`` with its old contents).

    Returns the array that received the result.
    """
    kernel_cache, thread = self.kernel_cache, self.thread
    if dest is None:
        dest = array

    key = (self.nan_to_zeros, array.shape, thread)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        kernel = PureParallel([
            Parameter('array', Annotation(array, 'i')),
            Parameter('dest', Annotation(dest, 'o')),
        ], """
        ${array.ctype} a = ${array.load_same};
        if (isnan(a))
        {
            a = 0.0f;
        }
        // Store unconditionally so that a separate destination
        // also receives the non-NaN elements.
        ${dest.store_same}(a);
        """, guiding_array='array')
        # Deliberately compiled without fast_math: relaxed/finite-math
        # compiler modes are allowed to assume NaNs never occur, which can
        # turn the isnan() test above into a constant false.
        kernel_cache[key] = kernel.compile(thread)

    # Run kernel
    kernel_cache[key](array, dest)
    return dest
def sub(self, mat1, mat2, dest):
    """
    Store the element-wise difference ``mat1 - mat2`` in ``dest``.

    ATTENTION: wherever either operand is NaN, zero is stored instead of
    the difference. The compiled kernel is cached per (dtype, shape) of
    ``mat1``.
    """
    cache = self.kernel_cache
    compile_thread = self.thread

    key = (self.sub, mat1.dtype, mat1.shape)
    if key not in cache:
        log.info("compiling " + str(key))
        assert mat1.shape == mat2.shape == dest.shape
        arguments = [
            Parameter('mat1', Annotation(mat1, 'i')),
            Parameter('mat2', Annotation(mat2, 'i')),
            Parameter('dest', Annotation(dest, 'o')),
        ]
        code = """
        // Delta ( for the output layer )
        ${mat1.ctype} m1 = ${mat1.load_same};
        ${mat2.ctype} m2 = ${mat2.load_same};
        if (isnan(m1) || isnan(m2))
        {
            ${dest.store_same}(0.0f);
        }
        else
        {
            ${dest.ctype} d = m1 - m2;
            ${dest.store_same}(d);
        }
        """
        computation = PureParallel(arguments, code, guiding_array='dest')
        cache[key] = computation.compile(compile_thread)

    cache[key](mat1, mat2, dest)
def add(self, mat1, mat2, dest):
    """
    Store the element-wise sum ``mat1 + mat2`` in ``dest``.

    All three arrays must share one shape (asserted when the kernel is
    first compiled). The compiled kernel is cached per (dtype, shape) of
    ``mat1``.
    """
    cache = self.kernel_cache
    compile_thread = self.thread

    key = (self.add, mat1.dtype, mat1.shape)
    if key not in cache:
        log.info("compiling " + str(key))
        assert mat1.shape == mat2.shape == dest.shape
        arguments = [
            Parameter('mat1', Annotation(mat1, 'i')),
            Parameter('mat2', Annotation(mat2, 'i')),
            Parameter('dest', Annotation(dest, 'o')),
        ]
        code = """
        // Delta ( for the output layer )
        ${mat1.ctype} m1 = ${mat1.load_same};
        ${mat2.ctype} m2 = ${mat2.load_same};
        ${dest.ctype} d = m1 + m2;
        ${dest.store_same}(d);
        """
        computation = PureParallel(arguments, code, guiding_array='dest')
        cache[key] = computation.compile(compile_thread)

    cache[key](mat1, mat2, dest)
def softplus_derivative(self, activations, delta, dest=None):
    """
    Multiply ``delta`` element-wise by the derivative of softplus.

    ``activations`` are expected to already be softplus outputs; the kernel
    recovers the logistic of the pre-activation from them as
    ``1 / (1 / (exp(a) - 1) + 1)`` (``a`` clamped to [-45, 45]) and scales
    ``delta`` by it. The result is written to ``dest``, or in place into
    ``delta`` when ``dest`` is omitted.

    Returns the array that received the result (consistent with
    ``softplus``).
    """
    kernel_cache, thread = self.kernel_cache, self.thread
    if dest is None:
        dest = delta

    key = (self.softplus_derivative, activations.shape, thread)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        kernel = PureParallel([
            Parameter('activations', Annotation(activations, 'i')),
            # Annotate from ``delta`` itself; this used to be built from
            # ``activations``, so the kernel was typed with the wrong array.
            Parameter('delta', Annotation(delta, 'i')),
            Parameter('dest', Annotation(dest, 'o')),
        ], """
        ${activations.ctype} a = ${activations.load_same};
        ${delta.ctype} d = ${delta.load_same};
        // the softplus function already has been applied
        // to the activations, so wee need to apply the
        // inverse of softplus chained with logistic
        // note: logistic is the derivative of softplus
        a = min(max(-45.0f, a), 45.0f);
        a = 1.0f / (1.0f / (exp(a) - 1.0f) + 1.0f);
        d = d*a;
        ${dest.store_same}(d);
        """, guiding_array='activations')
        kernel_cache[key] = kernel.compile(thread)

    # Run kernel
    kernel_cache[key](activations, delta, dest)
    return dest
def renormalize_kernel(ctx, array, norm, constraint):
    """
    Scale down, in place, the elements of ``array`` whose norm is too large.

    ``norm`` is looked up with the second index of ``array``. Where that
    norm exceeds ``constraint`` the element ``a`` is replaced by
    ``a * constraint / norm``; other elements are left untouched.
    ``constraint`` is converted to ``numpy.float32`` and passed to the
    kernel as a scalar argument.
    """
    cache = ctx.kernel_cache
    compile_thread = ctx.thread
    constraint = numpy.float32(constraint)

    key = (renormalize_kernel, array.shape, norm.shape, compile_thread._context)
    if key not in cache:
        arguments = [
            Parameter('array', Annotation(array, 'io')),
            Parameter('norm', Annotation(norm, 'i')),
            Parameter('constraint', Annotation(constraint)),
        ]
        code = """
        // Renormalize if necessary
        float n = ${norm.load_idx}(${idxs[1]});
        float c = ${constraint};
        if ( n > c ) {
            float a = ${array.load_same};
            a = a * c / n;
            ${array.store_same}(a);
        }
        """
        computation = PureParallel(arguments, code, guiding_array='array')
        cache[key] = computation.compile(compile_thread)

    cache[key](array, norm, constraint)
def _get_connection_modules(self, output, name, annotation):
    """
    Build the access modules for the node ``name``.

    Picks the node's output or input transformation record depending on
    ``output``. Without such a record the index-based module is a plain
    leaf macro; otherwise it is the module of the attached transformation.
    The "same indices" and "combined" modules are then derived from it.

    Returns a tuple ``(m_idx, m_same, m_combined)``.
    """
    node = self.nodes[name]
    param = Parameter(name, annotation)

    if output:
        ntr = node.output_ntr
    else:
        ntr = node.input_ntr

    if ntr is None:
        m_idx = module_leaf_macro(output, param)
    else:
        m_idx = self._get_transformation_module(annotation, ntr)

    leaf_params = self.get_leaf_parameters([name])

    # FIXME: this module won't work at the base level (that is, not in a transformation)
    # unless 'idx' variables were defined.
    # This behavior was enabled for PureParallel.from_trf(), which defines these variables.
    m_same = module_same_indices(output, param, leaf_params, m_idx)
    m_combined = module_combined(output, param, leaf_params, m_idx)

    return m_idx, m_same, m_combined
def _get_transformation_module(self, annotation, ntr):
    """
    Build the module wrapping the transformation described by ``ntr``.

    The argument list passed to the transformation snippet starts with an
    ``Indices`` object for the connector's shape, followed by one entry per
    transformation parameter, in signature order:

    * the parameter bound to the connector node becomes a
      ``KernelParameter`` carrying the node connector as ``load_same``
      (when ``ntr.output`` is true) or ``store_same`` (otherwise);
    * every other parameter is resolved through
      ``self._get_kernel_argobject``.

    Returns the result of ``module_transformation`` for these arguments.
    """
    param = Parameter(ntr.connector_node_name, annotation)
    tr_args = [Indices(param.annotation.type.shape)]

    for tr_param in ntr.trf.signature.parameters.values():
        connection_name = ntr.node_from_tr[tr_param.name]

        if connection_name != ntr.connector_node_name:
            tr_args.append(
                self._get_kernel_argobject(connection_name, tr_param.annotation))
            continue

        # The connector is built the same way in both directions; only the
        # keyword it is attached under differs.
        connector = node_connector(ntr.output)
        if ntr.output:
            tr_args.append(KernelParameter(
                param.name, param.annotation.type, load_same=connector))
        else:
            tr_args.append(KernelParameter(
                param.name, param.annotation.type, store_same=connector))

    subtree_params = self.get_leaf_parameters([ntr.connector_node_name])
    return module_transformation(
        ntr.output, param, subtree_params, ntr.trf.snippet, tr_args)
def _process_kernel_arguments(self, args):
    """
    Validate user-supplied kernel arguments and wrap ad hoc scalar values.

    ``KernelArgument`` instances are looked up through
    ``self._get_annotation``. Any other object exposing ``shape`` and
    ``dtype`` is treated as an ad hoc value: it must be zero-dimensional
    and gets a freshly generated name.
    Does not change the plan state.

    Returns ``(processed_args, adhoc_values)``: a list of ``Parameter``
    objects in argument order, and a dict mapping generated names to the
    ad hoc values.

    Raises ``ValueError`` for an ad hoc array and ``TypeError`` for an
    argument of any other kind.
    """
    parameters = []
    adhoc_values = {}
    next_adhoc_id = IdGen('_adhoc')

    for arg in args:
        if isinstance(arg, KernelArgument):
            annotation = self._get_annotation(arg.name)
        elif hasattr(arg, 'shape') and hasattr(arg, 'dtype'):
            if len(arg.shape) > 0:
                raise ValueError(
                    "Arrays are not allowed as ad hoc arguments")

            # A persistent scalar is deliberately not created via _scalar():
            # kernel compilation may still fail, and then the plan state
            # would have to be rolled back. These values are local to this
            # kernel, so they need not be registered in the plan.
            name = self._translator(next_adhoc_id())
            adhoc_values[name] = arg
            annotation = Annotation(Type(arg.dtype))
            arg = KernelArgument(name, annotation.type)
        else:
            raise TypeError("Unknown argument type: " + str(type(arg)))

        parameters.append(Parameter(arg.name, annotation))

    return parameters, adhoc_values
def _connect(self, ntr):
    """
    Attach the transformation described by ``ntr`` to the tree.

    ``ntr`` is assumed to describe a valid connection; all sanity checks
    are performed in ``connect()``. For each transformation parameter the
    leaf parameters are updated, nodes are created for names not seen
    before, and finally the connector node is replaced by the result of
    its ``connect(ntr)``.
    """
    leaves = self.leaf_parameters
    connector_name = ntr.connector_node_name

    for tr_param in ntr.trf.signature.parameters.values():
        node_name = ntr.node_from_tr[tr_param.name]

        if node_name == connector_name:
            ann = leaves[node_name].annotation
            if ann.input and ann.output:
                # Splitting the 'io' leaf: the side taken by the
                # transformation is removed from its role.
                # It is an array parameter, so there is no default value
                # to preserve (it can't have one).
                remaining_role = 'i' if ntr.output else 'o'
                leaves[node_name] = Parameter(
                    node_name, Annotation(ann.type, role=remaining_role))
            else:
                # An 'i' or 'o' leaf is hidden by the transformation.
                del leaves[node_name]
            continue

        is_known_array = (
            node_name in leaves and leaves[node_name].annotation.array)
        if is_known_array:
            ann = leaves[node_name].annotation
            joins_paths = (
                (ann.input and ntr.output) or (ann.output and not ntr.output))
            if joins_paths:
                # Joining 'i' and 'o' paths into an 'io' leaf.
                # It is an array parameter, so there is no default value
                # to preserve (it can't have one).
                leaves[node_name] = Parameter(
                    node_name, Annotation(ann.type, role='io'))
        else:
            leaves[node_name] = tr_param.rename(node_name)

        if node_name not in self.nodes:
            self.nodes[node_name] = Node()

    self.nodes[connector_name] = self.nodes[connector_name].connect(ntr)
def dropout(ctx, mat, rand, probability):
    """
    Zero the elements of ``mat`` whose ``rand`` value is below ``probability``.

    ``rand`` is read at the same position as the element of ``mat`` being
    processed; elements whose random value is not below the threshold are
    left untouched. ``probability`` is converted to ``numpy.float32`` and
    passed to the kernel as a scalar argument.
    """
    kernel_cache = ctx.kernel_cache
    probability = numpy.float32(probability)
    thread = ctx.thread

    # The compiled kernel depends on the type of ``rand`` as well, so its
    # dtype is part of the cache key.
    key = (dropout, mat.dtype, mat.shape, rand.dtype)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        kernel = PureParallel([
            Parameter('mat', Annotation(mat, 'o')),
            # Annotate from ``rand`` itself; this used to be built from
            # ``mat``, so the kernel was typed with the wrong array.
            Parameter('rand', Annotation(rand, 'i')),
            Parameter('probability', Annotation(probability)),
        ], """
        ${rand.ctype} r = ${rand.load_same};
        if (r < ${probability}) {
            ${mat.store_same}(0.0f);
        }
        """, guiding_array='mat')
        kernel_cache[key] = kernel.compile(thread)

    kernel_cache[key](mat, rand, probability)
def scale(self, mat, scalar):
    """
    Multiply every element of ``mat`` by ``scalar``, in place.

    ``scalar`` is converted to ``numpy.float32`` and passed to the kernel
    as a scalar argument. The compiled kernel is cached per (dtype, shape)
    of ``mat``.
    """
    cache = self.kernel_cache
    scalar = numpy.float32(scalar)
    compile_thread = self.thread

    key = (self.scale, mat.dtype, mat.shape)
    if key not in cache:
        log.info("compiling " + str(key))
        arguments = [
            Parameter('mat', Annotation(mat, 'io')),
            Parameter('scalar', Annotation(scalar)),
        ]
        code = """
        // Delta ( for the output layer )
        ${mat.ctype} m = ${mat.load_same};
        ${mat.ctype} s = ${scalar};
        m *= s;
        ${mat.store_same}(m);
        """
        computation = PureParallel(arguments, code, guiding_array='mat')
        cache[key] = computation.compile(compile_thread)

    cache[key](mat, scalar)
def copy_minibatch(self, array, indices, minibatch):
    """
    Gather entries of ``array`` along its first axis into ``minibatch``.

    For every position in ``minibatch`` the first index is replaced by
    ``indices[first index]`` and the remaining indices are reused as they
    are when reading from ``array``. ``indices`` must be of dtype
    ``numpy.int32`` and have as many entries as ``minibatch`` has along its
    first axis (both asserted when the kernel is first compiled).

    The trailing index list is generated from the rank of ``array``, so
    arrays of any rank are handled (previously only ranks 1 to 3 produced
    a matching load).
    # NOTE(review): assumes ``array`` and ``minibatch`` have the same rank.
    """
    kernel_cache, thread = self.kernel_cache, self.thread

    key = (self.copy_minibatch, minibatch.dtype, minibatch.shape, array.shape)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        assert minibatch.shape[0] == indices.shape[0]
        assert indices.dtype == numpy.int32
        dimensions = len(array.shape)
        kernel = PureParallel([
            Parameter('array', Annotation(array, 'i')),
            Parameter('indices', Annotation(indices, 'i')),
            Parameter('minibatch', Annotation(minibatch, 'o')),
        ], """
        SIZE_T idx = ${indices.load_idx}(${idxs[0]});
        ${minibatch.store_same}(${array.load_idx}(idx
        %for dim in range(1, dimensions):
            , ${idxs[dim]}
        %endfor
        ));
        """, guiding_array='minibatch',
            render_kwds=dict(dimensions=dimensions))
        log.info(array.shape)
        log.info(indices.shape)
        log.info(minibatch.shape)
        kernel_cache[key] = kernel.compile(thread)

    kernel_cache[key](array, indices, minibatch)
def lwta(ctx, mat, lwta_size):
    """
    Apply local winner-take-all to ``mat`` in place.

    Along the second axis, units are processed in consecutive groups of
    ``lwta_size``. Within each group only the element with the largest
    value is kept; all others are set to zero. A trailing group that runs
    past the end of the axis is cut short.

    ``lwta_size`` is truncated to an integer before being rendered into
    the kernel, so that the group size is an integer literal rather than a
    float assigned to a ``SIZE_T``.

    Raises ``ValueError`` if the resulting group size is smaller than 1,
    which would otherwise lead to a modulo by zero inside the kernel.
    """
    kernel_cache = ctx.kernel_cache
    thread = ctx.thread

    group_size = int(float(lwta_size))
    if group_size < 1:
        raise ValueError(
            "lwta_size must be at least 1, got " + repr(lwta_size))

    key = (lwta, mat.dtype, mat.shape, group_size)
    if key not in kernel_cache:
        num_units = mat.shape[1]
        log.info("compiling " + str(key))
        kernel = PureParallel([
            Parameter('mat', Annotation(mat, 'io')),
        ], """
        SIZE_T this_idx = ${idxs[1]};
        SIZE_T group_size = ${lwta_size};

        // only the first thread per group computes anything
        if (this_idx % group_size == 0) {
            SIZE_T argmax = ${idxs[1]};
            SIZE_T candidate_idx;
            ${mat.ctype} ma = ${mat.load_same};
            ${mat.ctype} candidate_value;

            // find the argmax in the group
            for (SIZE_T i=1; i < group_size; i++) {
                candidate_idx = this_idx + i;
                if (candidate_idx >= ${num_units}) break;
                candidate_value = ${mat.load_idx}(${idxs[0]}, candidate_idx);
                if ( candidate_value > ma) {
                    ma = candidate_value;
                    argmax = candidate_idx;
                }
            }

            // second pass: zero all except argmax
            for (SIZE_T i=0; i < group_size; i++) {
                candidate_idx = this_idx + i;
                if (candidate_idx >= ${num_units}) break;
                if ( candidate_idx != argmax ) {
                    ${mat.store_idx}(${idxs[0]}, candidate_idx, 0.0f);
                }
            }
        }
        """, guiding_array='mat',
            render_kwds=dict(lwta_size=group_size, num_units=num_units))
        kernel_cache[key] = kernel.compile(thread)

    kernel_cache[key](mat)
def __init__(self, root_parameters):
    """
    Set up the tree from the given root parameters.

    A name may occur more than once in ``root_parameters``. When a repeated
    parameter differs from the one already stored, both are merged into a
    single 'io' parameter; types and default values must agree (asserted).
    """
    # Names of the root parameters in their original order; repeats are kept.
    self.root_names = []
    # Whole parameters are stored to preserve the default values (if any).
    self.root_parameters = {}
    # All nodes of the tree.
    self.nodes = {}
    # Nodes available for connection.
    self.leaf_parameters = {}

    for param in root_parameters:
        self.root_names.append(param.name)

        conflicts_with_stored = (
            param.name in self.root_parameters
            and param != self.root_parameters[param.name])

        if not conflicts_with_stored:
            self.nodes[param.name] = Node()
            self.root_parameters[param.name] = param
            self.leaf_parameters[param.name] = param
            continue

        # Could be an 'io' parameter used for separate 'i' and 'o'
        # parameters in a nested computation: check types and merge.
        stored = self.root_parameters[param.name]
        incoming_ann = param.annotation
        stored_ann = stored.annotation

        # FIXME: Not sure when these can be raised
        assert stored_ann.type == incoming_ann.type
        assert stored.default == param.default

        # Since the two parameters differ, the roles can only be
        # 'i' and 'o', 'i' and 'io', or 'o' and 'io';
        # the merged role is 'io' in every case.
        merged = Parameter(
            param.name, Annotation(incoming_ann.type, 'io'),
            default=param.default)
        self.root_parameters[param.name] = merged
        self.leaf_parameters[param.name] = merged
def sarprop_kernel(ctx, weights, gradient, last_gradient, step_sizes, noise,
                   parameters):
    """
    SARPROP update kernel.

    Updates ``weights``, ``last_gradient`` and ``step_sizes`` in place, one
    element at a time:

    * if the gradient and the last gradient have the same sign, the step
      size is multiplied by ``reward_factor`` (capped at ``max_step_size``)
      and the weight is moved by ``step size * |noise|`` in the direction
      of the gradient's sign;
    * otherwise the step size is multiplied by ``punish_factor`` (floored
      at ``min_step_size``) and the weight is not moved;
    * L1 and L2 weight decay are applied when ``l1_decay`` / ``l2_decay``
      are greater than zero;
    * the current gradient is stored as the last gradient.

    ``parameters`` is a dict whose values are rendered into the kernel
    source (keys used by the template: ``reward_factor``,
    ``max_step_size``, ``punish_factor``, ``min_step_size``, ``l1_decay``,
    ``l2_decay``); its values are also part of the kernel cache key.
    """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread
    assert weights.shape == gradient.shape == last_gradient.shape == step_sizes.shape

    key = (sarprop_kernel, weights.shape, thread._context) + tuple(
        parameters.values())
    if key not in kernel_cache:
        logging.info("compiling " + str(key))
        kernel = PureParallel([
            Parameter('weights', Annotation(weights, 'io')),
            Parameter('gradient', Annotation(gradient, 'i')),
            Parameter('last_gradient', Annotation(last_gradient, 'io')),
            Parameter('step_sizes', Annotation(step_sizes, 'io')),
            # Annotate from ``noise`` itself; this used to be built from
            # ``step_sizes``, so the kernel was typed with the wrong array.
            Parameter('noise', Annotation(noise, 'i')),
        ], """
        ${weights.ctype} w = ${weights.load_same};
        ${gradient.ctype} g = ${gradient.load_same};
        ${last_gradient.ctype} lg = ${last_gradient.load_same};
        ${step_sizes.ctype} s = ${step_sizes.load_same};
        ${noise.ctype} n = ${noise.load_same};

        n = fabs(n);

        // Adapt step size
        if (g * lg > 0.0f) {
            s = min(${reward_factor}f * s, ${max_step_size}f);
            // Apply update
            if (g < 0.0f) {
                w = w - s*n;
            }
            if (g > 0.0f) {
                w = w + s*n;
            }
        } else {
            // punish step size
            s = max(${punish_factor}f * s, ${min_step_size}f);
        }

        // If l1 weight decay is greater zero, apply it
        % if l1_decay > 0.0:
        if (w > 0.0f) {
            w = max(0.0f, w - ${l1_decay}f);
        }
        if (w < 0.0f) {
            w = min(0.0f, w + ${l1_decay}f);
        }
        % endif;

        // If l2 weight decay is greater zero, apply it
        % if l2_decay > 0.0:
        w *= ${1.0 - l2_decay}f;
        % endif;

        // Save last gradient
        lg = g;

        ${weights.store_same}(w);
        ${last_gradient.store_same}(lg);
        ${step_sizes.store_same}(s);
        """, guiding_array='weights', render_kwds=parameters)
        kernel_cache[key] = kernel.compile(thread)

    # Run kernel
    kernel_cache[key](weights, gradient, last_gradient, step_sizes, noise)