import cudamat as cm


def grad(X, Y, act, params, grads, aux):
    """Forward/backward pass through a fully connected net; returns the loss.

    Weight and bias gradients are written into `grads`; `aux` holds
    preallocated device buffers for activations, error terms and the
    per-element loss.
    """
    H, bh = params
    _H, _bh = grads
    a, eh, loss = aux

    # forward pass
    a[0].assign(X)
    n_layers = len(eh)
    for i in range(n_layers):
        # a[i+1] = sigmoid( a[i]*H[i] + bh[i] )
        a[i].dot(H[i], target=a[i + 1])
        a[i + 1].add_row_vec(bh[i])
        if i < n_layers - 1:
            cm.sigmoid(a[i + 1])
        else:  # last layer: apply the output nonlinearity
            if act == 'logistic':
                cm.sigmoid(a[i + 1])
            elif act == 'softmax':
                # cm.softmax normalizes columns, so transpose around it
                a_t = a[i + 1].transpose()
                cm.softmax(a_t)
                a_t.transpose(target=a[i + 1])
                a_t.free_device_memory()
            else:
                pass  # 'linear': leave the pre-activations as the output

    # backward pass: error term of the last layer, eh[-1] = a[-1] - Y
    a[-1].subtract(Y, target=eh[-1])

    for i in range(n_layers - 1, -1, -1):
        # weight and bias derivatives for layer i
        _H[i].assign(0.0)
        _H[i].add_dot(a[i].T, eh[i])
        eh[i].sum(axis=0, target=_bh[i])
        # propagate the error term to the previous layer:
        # eh[i-1] = sigmoid'(a[i]) x ( eh[i]*H[i]' )
        if i > 0:
            eh[i].dot(H[i].T, target=eh[i - 1])
            eh[i - 1].apply_logistic_deriv(a[i])

    if act == 'logistic':
        cm.cross_entropy_bernoulli(Y, a[n_layers], target=loss)
    elif act == 'softmax':
        loss = cm.cross_entropy(Y, a[n_layers], target=loss)
    elif act == 'linear':
        # squared error on the residual eh[-1] = a[-1] - Y
        eh[-1].mult(eh[-1], target=loss)
    return loss.sum()
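# Usage sketch (illustrative, not part of the original code): builds the
# parameter/gradient/auxiliary buffers that grad() indexes and runs one
# plain SGD step. Layer sizes, batch size and learning rate are assumptions;
# buffer shapes are inferred from how grad() uses them.
import numpy as np

cm.cublas_init()

dims = [784, 256, 10]  # layer sizes (hypothetical)
batch_size = 100

# one weight matrix and bias row per layer, plus matching gradient buffers
H = [cm.CUDAMatrix(0.01 * np.random.randn(m, n))
     for m, n in zip(dims[:-1], dims[1:])]
bh = [cm.CUDAMatrix(np.zeros((1, n))) for n in dims[1:]]
_H = [cm.empty(w.shape) for w in H]
_bh = [cm.empty(b.shape) for b in bh]

# auxiliary buffers: activations (one per layer plus the input),
# error terms (one per layer) and the per-element loss
a = [cm.empty((batch_size, n)) for n in dims]
eh = [cm.empty((batch_size, n)) for n in dims[1:]]
loss = cm.empty((batch_size, dims[-1]))

X = cm.CUDAMatrix(np.random.rand(batch_size, dims[0]))
Y = cm.CUDAMatrix(np.random.rand(batch_size, dims[-1]))

err = grad(X, Y, 'logistic', (H, bh), (_H, _bh), (a, eh, loss))
for w, g in zip(H + bh, _H + _bh):
    w.subtract_mult(g, 1e-3 / batch_size)  # SGD step (hypothetical rate)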
def grad(X, Y, act_type, rho, params, grads, aux):
    """Gradient for a single-hidden-layer autoencoder with an optional
    sparsity penalty (weight rho) on the mean hidden activation."""
    H, O, bh, bo = params
    _H, _O, _bh, _bo = grads
    a, z, eh, eo, loss, s, s_m = aux

    # clear gradient buffers; watch out for redundant accumulations
    _H.assign(0.0)
    _O.assign(0.0)
    _bh.assign(0.0)
    _bo.assign(0.0)

    ### FORWARD PASS ###
    # a = sigmoid( x*H + bh )
    X.dot(H, target=a)
    a.add_row_vec(bh)
    cm.sigmoid(a)
    # z = sigmoid( a*O + bo )
    # a.dot(H.T, target=z)  # use tied weights
    a.dot(O, target=z)
    z.add_row_vec(bo)
    if act_type == 'logistic':
        cm.sigmoid(z)

    ### BACKWARD PASS ###
    # eo = z - y
    z.subtract(Y, target=eo)
    # eh = sigmoid'(a) x ( eo*O' + (rho-1)/(s-1) - rho/s )
    eo.dot(O.T, target=eh)
    if rho > 0:
        # s = mean hidden activation over the batch
        a.sum(axis=0, target=s)
        s.mult(1.0 / a.shape[0])    # normalize by batch size
        s.reciprocal()
        s.mult(rho)                 # rho/s
        a.sum(axis=0, target=s_m)   # TODO: remove this redundancy
        s_m.mult(1.0 / a.shape[0])  # normalize by batch size
        s_m.subtract(1.0)
        s_m.reciprocal()
        s_m.mult(rho - 1)           # (rho-1)/(s-1)
        s.subtract(s_m)             # rho/s - (rho-1)/(s-1)
        eh.add_row_mult(s, -1.0)
    eh.apply_logistic_deriv(a)

    ### COMPUTE GRADIENTS ###
    _O.add_dot(a.T, eo)
    _H.add_dot(X.T, eh)
    _bo.add_sums(eo, axis=0)
    _bh.add_sums(eh, axis=0)

    ### COMPUTE ERROR ###
    if act_type == 'logistic':
        cm.cross_entropy_bernoulli(Y, z, target=loss)
    elif act_type == 'linear':
        eo.mult(eo, target=loss)
    else:
        raise ValueError("Activation function '%s' is unknown" % act_type)

    err = loss.sum()
    return err
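# Sanity check for the sparsity term above (illustrative, NumPy only):
# assuming the penalty is the usual KL divergence KL(rho || s) between the
# target sparsity rho and the mean hidden activation s, its derivative is
#   dKL/ds = -rho/s + (1-rho)/(1-s) = -( rho/s - (rho-1)/(s-1) ),
# which is what the code adds to eh via add_row_mult(s, -1.0), since the
# s buffer holds rho/s - (rho-1)/(s-1) at that point.
import numpy as np

rho = 0.05
s = np.random.uniform(0.01, 0.99, size=10)  # mean hidden activations

def kl(s):
    # KL(rho || s), summed over hidden units
    return np.sum(rho * np.log(rho / s) +
                  (1 - rho) * np.log((1 - rho) / (1 - s)))

analytic = -rho / s + (rho - 1) / (s - 1)
eps = 1e-6
numeric = np.array([(kl(s + eps * np.eye(10)[i]) -
                     kl(s - eps * np.eye(10)[i])) / (2 * eps)
                    for i in range(10)])
assert np.allclose(analytic, numeric, atol=1e-4)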
def GetLoss(self, get_deriv=False):
    """Compute the loss and, if asked for, its derivative.

    Targets should be in self.data, predictions in self.state.

    Args:
        get_deriv: If True, compute the derivative w.r.t. the loss function
            and put it in self.deriv.
    """
    perf = deepnet_pb2.Metrics()
    perf.MergeFrom(self.proto.performance_stats)
    perf.count = self.batchsize
    tiny = self.tiny
    if self.loss_function == deepnet_pb2.Layer.CROSS_ENTROPY:
        if self.activation == deepnet_pb2.Hyperparams.LOGISTIC:
            data = self.data
            state = self.state
            deriv = self.deriv
            temp3 = self.dimsize
            unitcell = self.unitcell
            cm.cross_entropy_bernoulli(data, state, target=deriv, tiny=tiny)
            deriv.sum(axis=1, target=temp3)
            temp3.sum(axis=0, target=unitcell)
            cross_entropy = unitcell.euclid_norm()
            cm.correct_preds(data, state, target=deriv, cutoff=0.5)
            deriv.sum(axis=1, target=temp3)
            temp3.sum(axis=0, target=unitcell)
            correct_preds = unitcell.euclid_norm()
            if get_deriv:
                self.state.subtract(self.data, target=self.deriv)
            perf.cross_entropy = cross_entropy
            perf.correct_preds = correct_preds
        elif self.activation == deepnet_pb2.Hyperparams.SOFTMAX:
            temp2 = self.temp2
            temp = self.temp
            batchsize = self.batchsize
            dimensions = self.dimensions
            numlabels = self.numlabels
            state = self.state
            data = self.data
            unitcell = self.unitcell
            indices = self.indices

            # Optimized for space to handle a large number of labels
            # in a softmax.
            data.reshape((1, batchsize * dimensions))
            data.add(self.rowshift, target=indices)
            state.reshape((numlabels, dimensions * batchsize))
            state.max(axis=0, target=temp2)
            state.reshape((1, batchsize * numlabels * dimensions))
            state.select_columns(indices, temp)
            temp2.subtract(temp)
            temp2.sign(target=temp2)
            temp2.sum(axis=1, target=unitcell)
            correct_preds = batchsize - unitcell.euclid_norm()
            if get_deriv:
                temp.subtract(1, target=temp2)
                state.set_selected_columns(indices, temp2)
                state.reshape((numlabels * dimensions, batchsize))
                self.deriv.assign(self.state)
            state.reshape((numlabels * dimensions, batchsize))
            temp.add(tiny)
            cm.log(temp)
            temp.sum(axis=1, target=unitcell)
            cross_entropy = unitcell.euclid_norm()
            perf.cross_entropy = cross_entropy
            perf.correct_preds = correct_preds
    elif self.loss_function == deepnet_pb2.Layer.SQUARED_LOSS:
        if self.activation == deepnet_pb2.Hyperparams.REPLICATED_SOFTMAX:
            if self.hyperparams.normalize_error:
                self.data.sum(axis=0, target=self.temp)
                self.temp.add(self.tiny)
                self.data.div_by_row(self.temp, target=self.deriv)
                self.state.div_by_row(self.NN, target=self.expanded_batch)
                self.deriv.subtract(self.expanded_batch)
            else:
                self.data.sum(axis=0, target=self.temp)
                self.temp.add(self.tiny)
                self.state.div_by_row(self.temp, target=self.deriv)
                self.deriv.subtract(self.data)
        elif self.activation == deepnet_pb2.Hyperparams.SOFTMAX:
            self.expansion_matrix.select_columns(self.data,
                                                 target=self.expanded_batch)
            self.state.subtract(self.expanded_batch, target=self.deriv)
        else:
            if 'precision' in self.params:
                self.data.mult_by_col(self.params['precision'],
                                      target=self.deriv)
                self.deriv.subtract(self.state)
            else:
                self.state.subtract(self.data, target=self.deriv)
        error = self.deriv.euclid_norm()**2
        perf.error = error
        if self.activation != deepnet_pb2.Hyperparams.SOFTMAX and \
           self.activation != deepnet_pb2.Hyperparams.REPLICATED_SOFTMAX:
            self.ComputeDeriv()
    else:
        raise Exception('Unknown loss function.')
    return perf
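# CPU reference for the CROSS_ENTROPY / LOGISTIC branch above (illustrative
# sketch, not part of deepnet): shapes follow deepnet's (num_units, batchsize)
# layout, and `tiny` guards the logs the way the tiny argument of
# cm.cross_entropy_bernoulli does. `logistic_metrics` is a hypothetical name.
import numpy as np

def logistic_metrics(data, state, tiny=1e-10):
    # Bernoulli cross-entropy, summed over units and batch
    cross_entropy = -np.sum(data * np.log(state + tiny) +
                            (1 - data) * np.log(1 - state + tiny))
    # a prediction counts as correct when it lands on the same side of 0.5
    # as its target (the cutoff=0.5 passed to cm.correct_preds)
    correct_preds = np.sum((state > 0.5) == (data > 0.5))
    # derivative of the loss w.r.t. the pre-sigmoid inputs: state - data
    deriv = state - data
    return cross_entropy, correct_preds, deriv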