def grad(self, inp, grads):
    x, dy, scale, x_mean, x_invstd, epsilon = inp
    ddinputs, ddscale, ddbias = grads

    x_diff = x - x_mean
    mean_dy_x_diff = T.mean(dy * x_diff, axis=self.axes, keepdims=True)

    # compute gradients given each of the output gradients
    g_wrt_x = 0
    g_wrt_dy = 0
    g_wrt_scale = 0
    g_wrt_x_mean = 0
    g_wrt_x_invstd = 0

    if not isinstance(ddinputs.type, theano.gradient.DisconnectedType):
        ccc = scale * (ddinputs - T.mean(ddinputs, axis=self.axes,
                                         keepdims=True))
        ddd = (x_invstd**3) * (
            ccc * T.mean(dy * x_diff, axis=self.axes, keepdims=True) +
            dy * T.mean(ccc * x_diff, axis=self.axes, keepdims=True))

        g_wrt_x = g_wrt_x - ddd
        g_wrt_dy = g_wrt_dy + ((ccc * x_invstd) - (
            (x_invstd**3) * x_diff * T.mean(ccc * x_diff, axis=self.axes,
                                            keepdims=True)))

        eee = (dy * x_invstd) - ((x_invstd**3) * x_diff * mean_dy_x_diff)
        g_wrt_scale = g_wrt_scale + T.sum(
            ddinputs * (eee - T.mean(eee, axis=self.axes, keepdims=True)),
            axis=self.axes, keepdims=True)

        g_wrt_x_mean = g_wrt_x_mean + T.sum(
            ddd, axis=self.axes, keepdims=True)
        g_wrt_x_invstd = g_wrt_x_invstd + T.sum(
            ccc * (dy - 3 * (x_invstd**2) * x_diff * mean_dy_x_diff),
            axis=self.axes, keepdims=True)

    if not isinstance(ddscale.type, theano.gradient.DisconnectedType):
        g_wrt_x = g_wrt_x + (x_invstd * ddscale * dy)
        g_wrt_dy = g_wrt_dy + (x_invstd * ddscale * x_diff)
        g_wrt_x_mean = g_wrt_x_mean - (
            x_invstd * ddscale * T.sum(dy, axis=self.axes, keepdims=True))
        g_wrt_x_invstd = g_wrt_x_invstd + (
            ddscale * T.sum(dy * x_diff, axis=self.axes, keepdims=True))

    if not isinstance(ddbias.type, theano.gradient.DisconnectedType):
        g_wrt_dy = g_wrt_dy + T.fill(dy, ddbias)

    # depending on which output gradients are given,
    # some inputs should be disconnected
    results = [
        g_wrt_x, g_wrt_dy, g_wrt_scale, g_wrt_x_mean, g_wrt_x_invstd,
        theano.gradient.DisconnectedType()()
    ]
    return [
        theano.gradient.DisconnectedType()() if r is 0 else r
        for r in results
    ]

def local_abstract_batch_norm_train_grad(node):
    if not isinstance(node.op, AbstractBatchNormTrainGrad):
        return None

    x, dy, scale, x_mean, x_invstd, epsilon = node.inputs
    axes = node.op.axes
    if min(axes) < 0 or max(axes) > x.ndim:
        return None
    if not isinstance(x.type, TensorType) or \
       not isinstance(dy.type, TensorType) or \
       not isinstance(scale.type, TensorType) or \
       not isinstance(x_mean.type, TensorType) or \
       not isinstance(x_invstd.type, TensorType) or \
       not isinstance(epsilon.type, TensorType):
        return None

    x_diff = x - x_mean
    mean_dy_x_diff = T.mean(dy * x_diff, axis=axes, keepdims=True)
    c = (dy * x_invstd) - x_diff * (mean_dy_x_diff * (x_invstd ** 3))

    g_wrt_inputs = scale * (c - T.mean(c, axis=axes, keepdims=True))
    g_wrt_scale = T.sum(dy * x_invstd * x_diff, axis=axes, keepdims=True)
    g_wrt_bias = T.sum(dy, axis=axes, keepdims=True)
    results = [g_wrt_inputs, g_wrt_scale, g_wrt_bias]

    results = [T.patternbroadcast(r, r_orig.broadcastable)
               for (r, r_orig) in zip(results, node.outputs)]

    for var in theano.gof.graph.variables(node.inputs, results):
        if var not in node.inputs:
            copy_stack_trace(node.outputs[0], var)
    return results

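# Sketch of a numeric cross-check (assumed test code, not part of the
# optimizer above): theano.grad applied to a hand-written batch-norm graph
# should agree with the closed-form g_wrt_inputs expression used here.
# The helper name _check_bn_train_grad is hypothetical.
def _check_bn_train_grad():
    import numpy as np
    import theano
    import theano.tensor as T

    x = T.dmatrix('x')
    scale = T.drow('scale')
    bias = T.drow('bias')
    axes = (0,)
    mean = T.mean(x, axis=axes, keepdims=True)
    invstd = 1.0 / T.sqrt(T.var(x, axis=axes, keepdims=True) + 1e-5)
    out = (x - mean) * (scale * invstd) + bias
    g_x = theano.grad(out.sum(), x)

    # closed-form expression from the optimizer above, with dy = ones
    dy = T.ones_like(out)
    x_diff = x - mean
    mean_dy_x_diff = T.mean(dy * x_diff, axis=axes, keepdims=True)
    c = (dy * invstd) - x_diff * (mean_dy_x_diff * (invstd ** 3))
    ref_g_x = scale * (c - T.mean(c, axis=axes, keepdims=True))

    f = theano.function([x, scale, bias], [g_x, ref_g_x])
    a, b = f(np.random.rand(4, 3), np.ones((1, 3)), np.zeros((1, 3)))
    assert np.allclose(a, b)
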
def grad(self, inp, grads):
    dy, sm = inp
    g, = grads

    tmp = g + tensor.neg(tensor.sum(g * sm, axis=1).dimshuffle((0, 'x')))
    g_dy = tmp * sm

    tmp2 = tensor.sum(dy * sm, axis=1).dimshuffle((0, 'x'))
    g_sm = tmp * dy - g * tmp2

    return g_dy, g_sm

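# One way to exercise this second-derivative path (assumed test snippet, not
# part of the class above): differentiate the gradient of a scalar function
# of softmax, which goes through SoftmaxGrad.grad, and verify it numerically.
import numpy as np
import theano
import theano.tensor as tensor

def _softmax_grad_of_loss(x):
    # d/dx of sum(softmax(x)**2); differentiating this again uses the
    # grad method above
    return theano.grad(tensor.sum(tensor.nnet.softmax(x) ** 2), x)

theano.gradient.verify_grad(_softmax_grad_of_loss,
                            [np.random.rand(3, 4)],
                            rng=np.random.RandomState(42))
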
def grad(self, inputs, output_gradients):
    W, b, d, H, RShape = inputs
    dCdR, = output_gradients
    dCdH = conv3D(dCdR, W, T.zeros_like(H[0, 0, 0, 0, :]), d)
    WShape = W.shape
    dCdW = convGrad3D(dCdR, d, WShape, H)
    dCdb = T.sum(dCdR, axis=(0, 1, 2, 3))
    dCdd = None  # not differentiable, since d is not continuous
    dCdRShape = None  # not differentiable, since RShape is not continuous

    if 'name' in dir(dCdR) and dCdR.name is not None:
        dCdR_name = dCdR.name
    else:
        dCdR_name = 'anon'

    if 'name' in dir(H) and H.name is not None:
        H_name = H.name
    else:
        H_name = 'anon'

    if 'name' in dir(W) and W.name is not None:
        W_name = W.name
    else:
        W_name = 'anon'

    if 'name' in dir(b) and b.name is not None:
        b_name = b.name
    else:
        b_name = 'anon'

    dCdW.name = ('ConvTransp3D_dCdW.H=' + H_name + ',dCdR=' + dCdR_name +
                 ',W=' + W_name)
    dCdb.name = ('ConvTransp3D_dCdb.H=' + H_name + ',dCdR=' + dCdR_name +
                 ',W=' + W_name + ',b=' + b_name)
    dCdH.name = 'ConvTransp3D_dCdH.H=' + H_name + ',dCdR=' + dCdR_name

    return [dCdW, dCdb, dCdd, dCdH, dCdRShape]

def hierarchical_categorical_crossentropy(coding_dist, true_dist, hierarchy,
                                          inv_hierarchy, level_list):
    r"""
    Return the cross-entropy between an approximating distribution and a true
    distribution, taking into account a hierarchy of classes.

    Mathematically it is defined as follows:

    .. math::

        H(p,q) = - \sum_x p(x) \log(q(x))

    Parameters
    ----------
    coding_dist : a dense matrix
        Each slice along axis represents one distribution.
    true_dist : a dense matrix or sparse matrix or integer vector
        In the case of a matrix argument, each slice along axis represents one
        distribution. In the case of an integer vector argument, each element
        represents the position of the '1' in a 1-of-N encoding.

    Returns
    -------
    tensor of rank one-less-than `coding_dist`
        The cross entropy between each coding and true distribution.

    Notes
    -----
    axis : int
        The dimension over which each distribution runs
        (1 for row distributions, 0 for column distributions).

    """
    if true_dist.ndim == coding_dist.ndim:
        return -tensor.sum(true_dist * tensor.log(coding_dist),
                           axis=coding_dist.ndim - 1)
    elif true_dist.ndim == coding_dist.ndim - 1:
        return hierarchical_categorical_crossentropy_1hot(
            coding_dist, true_dist, hierarchy, inv_hierarchy, level_list)
    else:
        raise TypeError('rank mismatch between coding and true distributions')

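# Minimal numeric illustration of the formula in the docstring (assumed
# example values, not part of the library): H(p, q) = -sum_x p(x) log q(x),
# computed row-wise for a batch of distributions.
import numpy as np

_p = np.array([[1.0, 0.0, 0.0],
               [0.0, 0.5, 0.5]])      # true distributions (one per row)
_q = np.array([[0.7, 0.2, 0.1],
               [0.1, 0.45, 0.45]])    # coding distributions (one per row)
_H = -np.sum(_p * np.log(_q), axis=1)  # one cross-entropy per row
# _H[0] == -log(0.7); _H[1] == -(0.5*log(0.45) + 0.5*log(0.45))
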
def Generate_theta_p(x, s, I, N, K, prod_f):
    # Assumes symbolic helpers (dot, transpose, exp, sum, concatenate, stack)
    # are in scope, e.g. via `from theano.tensor import *`, and that one_hot
    # is a user-defined encoding helper.
    # print(s, N, len(x))
    T = len(x)
    D = len(x[0])
    s = [one_hot(ss, N) for ss in s]
    # x = [[xt for _ in range(N)] for xt in x]
    x = np.array(x)
    s = np.array(s)

    model = pm.Model()
    with model:
        # Priors for unknown model parameters
        theta = pm.Normal("theta", mu=0, sigma=1, shape=(D, K)) / np.sqrt(
            K * D)

        p_list = []
        for t in range(T):
            # print(prod_f)
            # print(np.transpose(theta))
            wt = dot(dot(prod_f, transpose(theta)), x[t])
            swt = s[t] * wt
            sum_sw = sum(swt)
            p = exp(swt) / (1 + sum_sw)
            p0 = 1 / (1 + sum_sw)
            p_list.append(concatenate(([p0], p)))

        I_obs = pm.Categorical("I_obs", p=stack(p_list, axis=0), observed=I)

    with model:
        step = pm.Metropolis()
        trace1 = pm.sample(tune=2000, chains=1, step=step)

    return trace1["theta"][-1]

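# The one_hot helper called above is not defined in this snippet; a minimal
# numpy version, under the assumed convention (class index -> length-N
# indicator vector), might look like:
import numpy as np

def one_hot(index, N):
    v = np.zeros(N)
    v[int(index)] = 1.0
    return v
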
def grad(self, inp, grads):
    x, dy, scale, x_mean, x_invstd, epsilon = inp
    ddinputs, ddscale, ddbias = grads

    x_diff = x - x_mean
    mean_dy_x_diff = T.mean(dy * x_diff, axis=self.axes, keepdims=True)

    # compute gradients given each of the output gradients
    g_wrt_x = 0
    g_wrt_dy = 0
    g_wrt_scale = 0
    g_wrt_x_mean = 0
    g_wrt_x_invstd = 0

    if not isinstance(ddinputs.type, theano.gradient.DisconnectedType):
        ccc = scale * (ddinputs - T.mean(ddinputs, axis=self.axes,
                                         keepdims=True))
        ddd = (x_invstd ** 3) * (
            ccc * T.mean(dy * x_diff, axis=self.axes, keepdims=True) +
            dy * T.mean(ccc * x_diff, axis=self.axes, keepdims=True))

        g_wrt_x = g_wrt_x - ddd
        g_wrt_dy = g_wrt_dy + ((ccc * x_invstd) -
                               ((x_invstd ** 3) * x_diff *
                                T.mean(ccc * x_diff, axis=self.axes,
                                       keepdims=True)))

        eee = (dy * x_invstd) - ((x_invstd ** 3) * x_diff * mean_dy_x_diff)
        g_wrt_scale = g_wrt_scale + T.sum(
            ddinputs * (eee - T.mean(eee, axis=self.axes, keepdims=True)),
            axis=self.axes, keepdims=True)

        g_wrt_x_mean = g_wrt_x_mean + T.sum(ddd, axis=self.axes,
                                            keepdims=True)
        g_wrt_x_invstd = g_wrt_x_invstd + T.sum(
            ccc * (dy - 3 * (x_invstd ** 2) * x_diff * mean_dy_x_diff),
            axis=self.axes, keepdims=True)

    if not isinstance(ddscale.type, theano.gradient.DisconnectedType):
        g_wrt_x = g_wrt_x + (x_invstd * ddscale * dy)
        g_wrt_dy = g_wrt_dy + (x_invstd * ddscale * x_diff)
        g_wrt_x_mean = g_wrt_x_mean - (
            x_invstd * ddscale * T.sum(dy, axis=self.axes, keepdims=True))
        g_wrt_x_invstd = g_wrt_x_invstd + (
            ddscale * T.sum(dy * x_diff, axis=self.axes, keepdims=True))

    if not isinstance(ddbias.type, theano.gradient.DisconnectedType):
        g_wrt_dy = g_wrt_dy + T.fill(dy, ddbias)

    # depending on which output gradients are given,
    # some inputs should be disconnected
    results = [g_wrt_x, g_wrt_dy, g_wrt_scale, g_wrt_x_mean, g_wrt_x_invstd,
               theano.gradient.DisconnectedType()()]
    return [theano.gradient.DisconnectedType()() if r is 0 else r
            for r in results]

def norm(x, ord):
    x = as_tensor_variable(x)
    ndim = x.ndim
    if ndim == 0:
        raise ValueError("'axis' entry is out of bounds.")
    elif ndim == 1:
        if ord is None:
            return tensor.sum(x**2)**0.5
        elif ord == 'inf':
            return tensor.max(abs(x))
        elif ord == '-inf':
            return tensor.min(abs(x))
        elif ord == 0:
            return x[x.nonzero()].shape[0]
        else:
            try:
                z = tensor.sum(abs(x**ord))**(1. / ord)
            except TypeError:
                raise ValueError("Invalid norm order for vectors.")
            return z
    elif ndim == 2:
        if ord is None or ord == 'fro':
            return tensor.sum(abs(x**2))**(0.5)
        elif ord == 'inf':
            return tensor.max(tensor.sum(abs(x), 1))
        elif ord == '-inf':
            return tensor.min(tensor.sum(abs(x), 1))
        elif ord == 1:
            return tensor.max(tensor.sum(abs(x), 0))
        elif ord == -1:
            return tensor.min(tensor.sum(abs(x), 0))
        else:
            raise ValueError(0)
    elif ndim > 2:
        raise NotImplementedError("We don't support norm with ndim > 2")

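# Quick consistency check (assumed test snippet): the symbolic norm above
# should match numpy.linalg.norm for a few vector orders. This assumes the
# surrounding module already provides `as_tensor_variable` and `tensor`.
import numpy as np
import theano

_v = theano.tensor.dvector('v')
_a = np.array([3.0, -4.0, 0.0])
for _ord in (None, 1, 2, 'inf'):
    _f = theano.function([_v], norm(_v, _ord))
    _ref = np.linalg.norm(_a, np.inf if _ord == 'inf' else _ord)
    assert np.allclose(_f(_a), _ref)
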
def norm(x, ord):
    x = as_tensor_variable(x)
    ndim = x.ndim
    if ndim == 0:
        raise ValueError("'axis' entry is out of bounds.")
    elif ndim == 1:
        if ord is None:
            return tensor.sum(x**2)**0.5
        elif ord == 'inf':
            return tensor.max(abs(x))
        elif ord == '-inf':
            return tensor.min(abs(x))
        elif ord == 0:
            return x[x.nonzero()].shape[0]
        else:
            try:
                z = tensor.sum(abs(x**ord))**(1. / ord)
            except TypeError:
                raise ValueError("Invalid norm order for vectors.")
            return z
    elif ndim == 2:
        if ord is None or ord == 'fro':
            return tensor.sum(abs(x**2))**(0.5)
        elif ord == 'inf':
            return tensor.max(tensor.sum(abs(x), 1))
        elif ord == '-inf':
            return tensor.min(tensor.sum(abs(x), 1))
        elif ord == 1:
            return tensor.max(tensor.sum(abs(x), 0))
        elif ord == -1:
            return tensor.min(tensor.sum(abs(x), 0))
        else:
            raise ValueError(0)
    elif ndim > 2:
        raise NotImplementedError("We don't support norm with ndim > 2")

def partially_linear(true_dist, coding_dist):
    loss = 0
    TIME = 150
    N_C = 21
    batch = 32
    for t in range(TIME):
        term1 = true_dist[:, t] * tensor.log(coding_dist[:, t] + 0.0000001)
        term2 = (1 - true_dist[:, t]) * tensor.log(1 - coding_dist[:, t] +
                                                   0.0000001)
        loss = loss + np.double(1) / N_C * tensor.sum(
            term1 + term2 * np.double(t) / TIME, axis=1)
    return -loss / batch

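# Hypothetical usage sketch (the variable names and the [batch, time, class]
# layout are assumptions): compile the loss for 3-D inputs matching the
# hard-coded TIME=150, N_C=21, batch=32 above. The negative-class term is
# scaled by t / TIME, so late time steps are penalized more for false
# positives than early ones.
import theano
import theano.tensor as tensor

_y_true = tensor.dtensor3('y_true')   # shape (batch, TIME, N_C)
_y_pred = tensor.dtensor3('y_pred')   # shape (batch, TIME, N_C)
_loss_fn = theano.function([_y_true, _y_pred],
                           partially_linear(_y_true, _y_pred))
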
def grad(self, inputs, output_gradients):
    V, W, b, d = inputs
    dCdH, = output_gradients
    # make all of these ops support broadcasting of scalar b to vector b and
    # replace the zeros_like in all their grads
    # print dCdH.broadcastable
    # print "dCdH.broadcastable"
    # quit(-1)
    # dCdH = printing.Print("dCdH = ",["shape"])

    # Make sure the broadcasting pattern of the gradient is the same
    # as the initial variable
    dCdV = theano.tensor.nnet.convTransp3D(W, T.zeros_like(V[0, 0, 0, 0, :]),
                                           d, dCdH, V.shape[1:4])
    dCdV = T.patternbroadcast(dCdV, V.broadcastable)
    WShape = W.shape
    dCdW = theano.tensor.nnet.convGrad3D(V, d, WShape, dCdH)
    dCdW = T.patternbroadcast(dCdW, W.broadcastable)
    dCdb = T.sum(dCdH, axis=(0, 1, 2, 3))
    dCdb = T.patternbroadcast(dCdb, b.broadcastable)
    dCdd = grad_undefined(
        self, 3, inputs[3],
        "The gradient of Conv3D with respect to the convolution"
        " stride is undefined because Conv3D is only defined for"
        " integer strides.")

    if 'name' in dir(dCdH) and dCdH.name is not None:
        dCdH_name = dCdH.name
    else:
        dCdH_name = 'anon_dCdH'

    if 'name' in dir(V) and V.name is not None:
        V_name = V.name
    else:
        V_name = 'anon_V'

    if 'name' in dir(W) and W.name is not None:
        W_name = W.name
    else:
        W_name = 'anon_W'

    if 'name' in dir(b) and b.name is not None:
        b_name = b.name
    else:
        b_name = 'anon_b'

    dCdV.name = 'Conv3D_dCdV(dCdH=' + dCdH_name + ',V=' + V_name + ')'
    dCdW.name = ('Conv3D_dCdW(dCdH=' + dCdH_name + ',V=' + V_name + ',W=' +
                 W_name + ')')
    dCdb.name = ('Conv3D_dCdb(dCdH=' + dCdH_name + ',V=' + V_name + ',W=' +
                 W_name + ',b=' + b_name + ')')

    return [dCdV, dCdW, dCdb, dCdd]

def functions(self, sequence_length):
    key = (sequence_length)
    if key not in self.cache:
        logging.info("Need to construct graph for sequence_length=%d..." %
                     (sequence_length))
        # creating network input variable nodes
        correct_inputs = t.ftensor3("correct input")
        noise_inputs = t.ftensor3("noise input")
        learning_rate = t.fscalar("learning rate")
        # creating op nodes for firing the network
        correct_score, correct_prehidden = self.score(correct_inputs)
        noise_score, noise_prehidden = self.score(noise_inputs)
        # creating op nodes for the pairwise ranking cost function
        loss = t.clip(1 - correct_score + noise_score, 0, 1e999)
        total_loss = t.sum(loss)
        # the necessary cost function gradients
        parameters_gradient = grad(total_loss, list(self.parameters))
        correct_inputs_gradient = grad(total_loss, correct_inputs)
        noise_inputs_gradient = grad(total_loss, noise_inputs)
        # setting network inputs
        predict_inputs = [correct_inputs]
        train_inputs = [correct_inputs, noise_inputs, learning_rate]
        verbose_predict_inputs = predict_inputs
        # setting network outputs
        predict_outputs = [correct_score]
        train_outputs = [correct_inputs_gradient, noise_inputs_gradient,
                         loss, correct_score, noise_score]
        verbose_predict_outputs = [correct_score, correct_prehidden]

        nnodes = len(theano.gof.graph.ops(predict_inputs, predict_outputs))
        logging.info("About to compile prediction function over %d ops "
                     "[nodes]..." % nnodes)
        predict = theano.function(predict_inputs, predict_outputs,
                                  mode=COMPILE_MODE)
        logging.info("...done constructing graph for sequence_length=%d" %
                     (sequence_length))

        nnodes = len(theano.gof.graph.ops(verbose_predict_inputs,
                                          verbose_predict_outputs))
        logging.info("About to compile verbose prediction function over "
                     "%d ops [nodes]..." % nnodes)
        verbose_predict = theano.function(verbose_predict_inputs,
                                          verbose_predict_outputs,
                                          mode=COMPILE_MODE)
        logging.info("...done constructing graph for sequence_length=%d" %
                     (sequence_length))

        nnodes = len(theano.gof.graph.ops(train_inputs, train_outputs))
        logging.info("About to compile training function over %d ops "
                     "[nodes]..." % nnodes)
        train = theano.function(
            train_inputs, train_outputs, mode=COMPILE_MODE,
            updates=[(p, p - learning_rate * gp)
                     for p, gp in zip(list(self.parameters),
                                      parameters_gradient)])
        logging.info("...done constructing graph for sequence_length=%d" %
                     (sequence_length))

        self.cache[key] = (predict, train, verbose_predict)
    return self.cache[key]

def grad(self, inputs, output_gradients):
    V, W, b, d = inputs
    dCdH, = output_gradients
    # make all of these ops support broadcasting of scalar b to vector b and
    # replace the zeros_like in all their grads
    # print dCdH.broadcastable
    # print "dCdH.broadcastable"
    # quit(-1)
    # dCdH = printing.Print("dCdH = ",["shape"])

    # Make sure the broadcasting pattern of the gradient is the same
    # as the initial variable
    dCdV = theano.tensor.nnet.convTransp3D(
        W, T.zeros_like(V[0, 0, 0, 0, :]), d, dCdH, V.shape[1:4])
    dCdV = T.patternbroadcast(dCdV, V.broadcastable)
    WShape = W.shape
    dCdW = theano.tensor.nnet.convGrad3D(V, d, WShape, dCdH)
    dCdW = T.patternbroadcast(dCdW, W.broadcastable)
    dCdb = T.sum(dCdH, axis=(0, 1, 2, 3))
    dCdb = T.patternbroadcast(dCdb, b.broadcastable)
    dCdd = grad_undefined(
        self, 3, inputs[3],
        "The gradient of Conv3D with respect to the convolution"
        " stride is undefined because Conv3D is only defined for"
        " integer strides.")

    if 'name' in dir(dCdH) and dCdH.name is not None:
        dCdH_name = dCdH.name
    else:
        dCdH_name = 'anon_dCdH'

    if 'name' in dir(V) and V.name is not None:
        V_name = V.name
    else:
        V_name = 'anon_V'

    if 'name' in dir(W) and W.name is not None:
        W_name = W.name
    else:
        W_name = 'anon_W'

    if 'name' in dir(b) and b.name is not None:
        b_name = b.name
    else:
        b_name = 'anon_b'

    dCdV.name = 'Conv3D_dCdV(dCdH=' + dCdH_name + ',V=' + V_name + ')'
    dCdW.name = ('Conv3D_dCdW(dCdH=' + dCdH_name + ',V=' + V_name + ',W=' +
                 W_name + ')')
    dCdb.name = ('Conv3D_dCdb(dCdH=' + dCdH_name + ',V=' + V_name + ',W=' +
                 W_name + ',b=' + b_name + ')')

    return [dCdV, dCdW, dCdb, dCdd]

def acc_weighted_cross_entropy(pred, targets):
    ''' loss only counting misclassified '''
    # weights = np.not_equal(pred, targets).astype("float")
    # return -np.sum(weights * targets * tensor.log(pred), axis=pred.ndim - 1)
    if targets.ndim == pred.ndim:
        weights = tensor.neq(pred, targets)
        return -tensor.sum(weights * targets * tensor.log(pred),
                           axis=pred.ndim - 1)
    else:
        print(targets.ndim, " ", pred.ndim)
        raise TypeError('rank mismatch between coding and true distributions')

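# Small illustration (assumed example values): entries where pred already
# equals the target contribute nothing, and for a one-hot target only the
# target-class term of a mismatched prediction survives.
import numpy as np
import theano
import theano.tensor as tensor

_pred = tensor.dmatrix('pred')
_targets = tensor.dmatrix('targets')
_loss_fn = theano.function([_pred, _targets],
                           acc_weighted_cross_entropy(_pred, _targets))

print(_loss_fn(np.array([[0.2, 0.6, 0.2]]), np.array([[0.0, 1.0, 0.0]])))
# ~ [0.5108]  (-log(0.6): only the mismatched target-class term contributes)
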
def grad(self, inputs, output_gradients):
    V, W, b, d = inputs
    dCdH, = output_gradients
    # make all of these ops support broadcasting of scalar b to vector b and
    # replace the zeros_like in all their grads
    # print dCdH.broadcastable
    # print "dCdH.broadcastable"
    # quit(-1)
    # dCdH = printing.Print("dCdH = ",["shape"])

    # Make sure the broadcasting pattern of the gradient is the same
    # as the initial variable
    dCdV = ConvTransp3D.convTransp3D(W, T.zeros_like(V[0, 0, 0, 0, :]), d,
                                     dCdH, V.shape[1:4])
    dCdV = T.patternbroadcast(dCdV, V.broadcastable)
    WShape = W.shape
    dCdW = ConvGrad3D.convGrad3D(V, d, WShape, dCdH)
    dCdW = T.patternbroadcast(dCdW, W.broadcastable)
    dCdb = T.sum(dCdH, axis=(0, 1, 2, 3))
    dCdb = T.patternbroadcast(dCdb, b.broadcastable)
    dCdd = None  # not differentiable, since d is not continuous

    if 'name' in dir(dCdH) and dCdH.name is not None:
        dCdH_name = dCdH.name
    else:
        dCdH_name = 'anon'

    if 'name' in dir(V) and V.name is not None:
        V_name = V.name
    else:
        V_name = 'anon'

    if 'name' in dir(W) and W.name is not None:
        W_name = W.name
    else:
        W_name = 'anon'

    if 'name' in dir(b) and b.name is not None:
        b_name = b.name
    else:
        b_name = 'anon'

    dCdV.name = 'Conv3D_dCdV.dCdH=' + dCdH_name + ',V=' + V_name
    dCdW.name = ('Conv3D_dCdW.dCdH=' + dCdH_name + ',V=' + V_name +
                 ',W=' + W_name)
    dCdb.name = ('Conv3D_dCdb.dCdH=' + dCdH_name + ',V=' + V_name +
                 ',W=' + W_name + ',b=' + b_name)

    return [dCdV, dCdW, dCdb, dCdd]

def grad(self, inputs, output_gradients):
    V, W, b, d = inputs
    dCdH, = output_gradients
    # make all of these ops support broadcasting of scalar b to vector b and
    # replace the zeros_like in all their grads
    # print dCdH.broadcastable
    # print "dCdH.broadcastable"
    # quit(-1)
    # dCdH = printing.Print("dCdH = ",["shape"])
    dCdV = ConvTransp3D.convTransp3D(W, T.zeros_like(V[0, 0, 0, 0, :]),
                                     d, dCdH, V.shape[1:4])
    WShape = W.shape
    dCdW = ConvGrad3D.convGrad3D(V, d, WShape, dCdH)
    dCdb = T.sum(dCdH, axis=(0, 1, 2, 3))
    dCdd = None  # not differentiable, since d is not continuous

    if 'name' in dir(dCdH) and dCdH.name is not None:
        dCdH_name = dCdH.name
    else:
        dCdH_name = 'anon'

    if 'name' in dir(V) and V.name is not None:
        V_name = V.name
    else:
        V_name = 'anon'

    if 'name' in dir(W) and W.name is not None:
        W_name = W.name
    else:
        W_name = 'anon'

    if 'name' in dir(b) and b.name is not None:
        b_name = b.name
    else:
        b_name = 'anon'

    dCdV.name = 'Conv3D_dCdV.dCdH=' + dCdH_name + ',V=' + V_name
    dCdW.name = ('Conv3D_dCdW.dCdH=' + dCdH_name + ',V=' + V_name +
                 ',W=' + W_name)
    dCdb.name = ('Conv3D_dCdb.dCdH=' + dCdH_name + ',V=' + V_name +
                 ',W=' + W_name + ',b=' + b_name)

    return [dCdV, dCdW, dCdb, dCdd]

def grad(self, inputs, output_gradients):
    W, b, d, H, RShape = inputs
    dCdR, = output_gradients
    dCdH = theano.tensor.nnet.conv3D(dCdR, W,
                                     T.zeros_like(H[0, 0, 0, 0, :]), d)
    WShape = W.shape
    dCdW = theano.tensor.nnet.convGrad3D(dCdR, d, WShape, H)
    dCdb = T.sum(dCdR, axis=(0, 1, 2, 3))
    # not differentiable, since d affects the output elements
    dCdd = grad_undefined(self, 2, d)
    # disconnected, since RShape just determines the output shape
    dCdRShape = DisconnectedType()()

    if 'name' in dir(dCdR) and dCdR.name is not None:
        dCdR_name = dCdR.name
    else:
        dCdR_name = 'anon_dCdR'

    if 'name' in dir(H) and H.name is not None:
        H_name = H.name
    else:
        H_name = 'anon_H'

    if 'name' in dir(W) and W.name is not None:
        W_name = W.name
    else:
        W_name = 'anon_W'

    if 'name' in dir(b) and b.name is not None:
        b_name = b.name
    else:
        b_name = 'anon_b'

    dCdW.name = ('ConvTransp3D_dCdW.H=' + H_name + ',dCdR=' + dCdR_name +
                 ',W=' + W_name)
    dCdb.name = ('ConvTransp3D_dCdb.H=' + H_name + ',dCdR=' + dCdR_name +
                 ',W=' + W_name + ',b=' + b_name)
    dCdH.name = 'ConvTransp3D_dCdH.H=' + H_name + ',dCdR=' + dCdR_name

    return [dCdW, dCdb, dCdd, dCdH, dCdRShape]

def grad(self, inputs, output_gradients):
    W, b, d, H, RShape = inputs
    dCdR, = output_gradients
    dCdH = conv3D(dCdR, W, T.zeros_like(H[0, 0, 0, 0, :]), d)
    WShape = W.shape
    dCdW = convGrad3D(dCdR, d, WShape, H)
    dCdb = T.sum(dCdR, axis=(0, 1, 2, 3))
    dCdd = None  # not differentiable, since d is not continuous
    dCdRShape = None  # not differentiable, since RShape is not continuous

    if 'name' in dir(dCdR) and dCdR.name is not None:
        dCdR_name = dCdR.name
    else:
        dCdR_name = 'anon'

    if 'name' in dir(H) and H.name is not None:
        H_name = H.name
    else:
        H_name = 'anon'

    if 'name' in dir(W) and W.name is not None:
        W_name = W.name
    else:
        W_name = 'anon'

    if 'name' in dir(b) and b.name is not None:
        b_name = b.name
    else:
        b_name = 'anon'

    dCdW.name = ('ConvTransp3D_dCdW.H=' + H_name + ',dCdR=' + dCdR_name +
                 ',W=' + W_name)
    dCdb.name = ('ConvTransp3D_dCdb.H=' + H_name + ',dCdR=' + dCdR_name +
                 ',W=' + W_name + ',b=' + b_name)
    dCdH.name = 'ConvTransp3D_dCdH.H=' + H_name + ',dCdR=' + dCdR_name

    return [dCdW, dCdb, dCdd, dCdH, dCdRShape]

def functions(self, sequence_length):
    key = (sequence_length)
    if key not in self.cache:
        logging.info("Need to construct graph for sequence_length=%d..." %
                     (sequence_length))
        # creating network input variable nodes
        correct_inputs = t.ftensor3("correct input")
        noise_inputs = t.ftensor3("noise input")
        learning_rate = t.fscalar("learning rate")
        # creating op nodes for firing the network
        correct_score, correct_prehidden = self.score(correct_inputs)
        noise_score, noise_prehidden = self.score(noise_inputs)
        # creating op nodes for the pairwise ranking cost function
        loss = t.clip(1 - correct_score + noise_score, 0, 1e999)
        total_loss = t.sum(loss)
        # the necessary cost function gradients
        parameters_gradient = grad(total_loss, list(self.parameters))
        correct_inputs_gradient = grad(total_loss, correct_inputs)
        noise_inputs_gradient = grad(total_loss, noise_inputs)
        # setting network inputs
        predict_inputs = [correct_inputs]
        train_inputs = [correct_inputs, noise_inputs, learning_rate]
        verbose_predict_inputs = predict_inputs
        # setting network outputs
        predict_outputs = [correct_score]
        train_outputs = [correct_inputs_gradient, noise_inputs_gradient,
                         loss, correct_score, noise_score]
        verbose_predict_outputs = [correct_score, correct_prehidden]

        nnodes = len(theano.gof.graph.ops(predict_inputs, predict_outputs))
        logging.info("About to compile prediction function over %d ops "
                     "[nodes]..." % nnodes)
        predict = theano.function(predict_inputs, predict_outputs,
                                  mode=COMPILE_MODE)
        logging.info("...done constructing graph for sequence_length=%d" %
                     (sequence_length))

        nnodes = len(theano.gof.graph.ops(verbose_predict_inputs,
                                          verbose_predict_outputs))
        logging.info("About to compile verbose prediction function over "
                     "%d ops [nodes]..." % nnodes)
        verbose_predict = theano.function(verbose_predict_inputs,
                                          verbose_predict_outputs,
                                          mode=COMPILE_MODE)
        logging.info("...done constructing graph for sequence_length=%d" %
                     (sequence_length))

        nnodes = len(theano.gof.graph.ops(train_inputs, train_outputs))
        logging.info("About to compile training function over %d ops "
                     "[nodes]..." % nnodes)
        train = theano.function(
            train_inputs, train_outputs, mode=COMPILE_MODE,
            updates=[(p, p - learning_rate * gp)
                     for p, gp in zip(list(self.parameters),
                                      parameters_gradient)])
        logging.info("...done constructing graph for sequence_length=%d" %
                     (sequence_length))

        self.cache[key] = (predict, train, verbose_predict)
    return self.cache[key]
