def grad(self, inputs, output_gradients):
    C, d, WShape, B = inputs
    dLdA, = output_gradients
    z = T.zeros_like(C[0, 0, 0, 0, :])
    dLdC = convTransp3D(dLdA, z, d, B, C.shape[1:4])
    dLdd = None  # not differentiable, since d is not continuous
    dLdWShape = None  # not differentiable, since WShape is not continuous
    dLdB = conv3D(C, dLdA, T.zeros_like(B[0, 0, 0, 0, :]), d)
    return [dLdC, dLdd, dLdWShape, dLdB]
def grad(self, inputs, output_gradients):
    C, d, WShape, B = inputs
    dLdA, = output_gradients
    z = T.zeros_like(C[0, 0, 0, 0, :])
    dLdC = convTransp3D(dLdA, z, d, B, C.shape[1:4])
    # d actually does affect the outputs, so it's not disconnected
    dLdd = grad_undefined(self, 1, d)
    # The shape of the weights doesn't affect the output elements
    dLdWShape = DisconnectedType()()
    dLdB = conv3D(C, dLdA, T.zeros_like(B[0, 0, 0, 0, :]), d)
    return [dLdC, dLdd, dLdWShape, dLdB]
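# Both grad variants above compute the same symbolic derivatives; a
# quick way to sanity-check any such implementation is Theano's
# finite-difference checker. A minimal sketch, using a stand-in
# differentiable function rather than the real ConvGrad3D op, and
# made-up shapes:
import numpy
import theano
import theano.tensor as T

def quadratic(X):
    # any differentiable symbolic expression works as the test target
    return T.sqr(X).sum()

pt = [numpy.random.rand(2, 3).astype(theano.config.floatX)]
# compares the symbolic gradient against finite differences
theano.gradient.verify_grad(quadratic, pt, rng=numpy.random.RandomState(42))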
def grad(self, inputs, output_gradients):
    W, b, d, H, RShape = inputs
    dCdR, = output_gradients
    dCdH = conv3D(dCdR, W, T.zeros_like(H[0, 0, 0, 0, :]), d)
    WShape = W.shape
    dCdW = convGrad3D(dCdR, d, WShape, H)
    dCdb = T.sum(dCdR, axis=(0, 1, 2, 3))
    dCdd = None  # not differentiable, since d is not continuous
    dCdRShape = None  # not differentiable, since RShape is not continuous
    if 'name' in dir(dCdR) and dCdR.name is not None:
        dCdR_name = dCdR.name
    else:
        dCdR_name = 'anon'
    if 'name' in dir(H) and H.name is not None:
        H_name = H.name
    else:
        H_name = 'anon'
    if 'name' in dir(W) and W.name is not None:
        W_name = W.name
    else:
        W_name = 'anon'
    if 'name' in dir(b) and b.name is not None:
        b_name = b.name
    else:
        b_name = 'anon'
    dCdW.name = ('ConvTransp3D_dCdW.H=' + H_name +
                 ',dCdR=' + dCdR_name + ',W=' + W_name)
    dCdb.name = ('ConvTransp3D_dCdb.H=' + H_name + ',dCdR=' + dCdR_name +
                 ',W=' + W_name + ',b=' + b_name)
    dCdH.name = 'ConvTransp3D_dCdH.H=' + H_name + ',dCdR=' + dCdR_name
    return [dCdW, dCdb, dCdd, dCdH, dCdRShape]
def make_rule(self, local_particle, local_acc_updates, global_particle):
    """Make Downpour rule.

    All particles along with the global particle start from the same
    position. According to this rule, each local particle executes
    descent normally, but its parameter updates are accumulated (e.g.
    by moving average) into a variable. Every N iterations, the local
    accumulated updates are added together and applied to the global
    particle, and each local particle restarts from the global
    particle's position.

    Parameters
    ----------
    local_particle : {:ref:`theano.compile.SharedVariable`, list of :ref:`theano.compile.SharedVariable`}
        A particle's position in parameter space doing local SGD.
    local_acc_updates : {:ref:`theano.compile.SharedVariable`, list of :ref:`theano.compile.SharedVariable`}
        Shared variable accumulating local parameter updates.
    global_particle : {:ref:`theano.compile.SharedVariable`, list of :ref:`theano.compile.SharedVariable`}
        A particle whose position is updated only by the Downpour
        process and which resets the position of the local particles.

    .. seealso:: Notes on :meth:`GlobalDynamics.make_rule`

    """
    import theano
    from theano.tensor import basic
    if isinstance(local_particle, theano.compile.SharedVariable):
        local_particle = [local_particle]
    if isinstance(local_acc_updates, theano.compile.SharedVariable):
        local_acc_updates = [local_acc_updates]
    if isinstance(global_particle, theano.compile.SharedVariable):
        global_particle = [global_particle]
    new_global = []
    new_local = []
    new_acc_updates = []
    for lp, lau, gp in zip(local_particle, local_acc_updates,
                           global_particle):
        # Sum the locally accumulated updates across all workers.
        global_acc_updates = AllReduceSum(lau, inplace=True)
        if self.average:
            global_acc_updates /= self.worker.global_size
        new_global.append(gp + global_acc_updates)
        # Local particles restart from the new global position.
        new_local.append(new_global[-1])
        # Reset the local accumulators.
        new_acc_updates.append(basic.zeros_like(lau))
    updates = list(zip(local_particle, new_local)) + \
        list(zip(local_acc_updates, new_acc_updates)) + \
        list(zip(global_particle, new_global))
    self._fn = theano.function([], [], updates=updates,
                               accept_inplace=True)
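# Hypothetical single-process numpy sketch of the Downpour cycle that
# make_rule builds above: the worker list, sync_every, learning rate,
# and toy gradient are all invented for illustration, and the
# AllReduceSum across processes is simulated by summing over workers.
import numpy as np

global_particle = np.zeros(4)
workers = [{'particle': global_particle.copy(),
            'acc_updates': np.zeros(4)} for _ in range(2)]
sync_every = 5
lr = 0.1

def toy_grad(x):
    # gradient of 0.5 * ||x - 1||^2
    return x - 1.0

for step in range(1, 21):
    for w in workers:
        update = -lr * toy_grad(w['particle'])
        w['particle'] += update      # normal local descent step
        w['acc_updates'] += update   # accumulate the local updates
    if step % sync_every == 0:
        # "all-reduce": add every worker's accumulated updates together
        total = sum(w['acc_updates'] for w in workers)
        global_particle += total     # apply them to the global particle
        for w in workers:
            w['particle'] = global_particle.copy()  # restart from global
            w['acc_updates'][:] = 0.0               # reset accumulator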
def grad(self, inputs, output_gradients):
    V, W, b, d = inputs
    dCdH, = output_gradients
    # TODO: make all of these ops support broadcasting of scalar b to
    # vector b, and replace the zeros_like in all their grads.
    # Make sure the broadcasting pattern of the gradient is the same
    # as that of the initial variable.
    dCdV = theano.tensor.nnet.convTransp3D(W,
                                           T.zeros_like(V[0, 0, 0, 0, :]),
                                           d, dCdH, V.shape[1:4])
    dCdV = T.patternbroadcast(dCdV, V.broadcastable)
    WShape = W.shape
    dCdW = theano.tensor.nnet.convGrad3D(V, d, WShape, dCdH)
    dCdW = T.patternbroadcast(dCdW, W.broadcastable)
    dCdb = T.sum(dCdH, axis=(0, 1, 2, 3))
    dCdb = T.patternbroadcast(dCdb, b.broadcastable)
    dCdd = grad_undefined(
        self, 3, inputs[3],
        "The gradient of Conv3D with respect to the convolution"
        " stride is undefined because Conv3D is only defined for"
        " integer strides.")
    if 'name' in dir(dCdH) and dCdH.name is not None:
        dCdH_name = dCdH.name
    else:
        dCdH_name = 'anon_dCdH'
    if 'name' in dir(V) and V.name is not None:
        V_name = V.name
    else:
        V_name = 'anon_V'
    if 'name' in dir(W) and W.name is not None:
        W_name = W.name
    else:
        W_name = 'anon_W'
    if 'name' in dir(b) and b.name is not None:
        b_name = b.name
    else:
        b_name = 'anon_b'
    dCdV.name = 'Conv3D_dCdV(dCdH=' + dCdH_name + ',V=' + V_name + ')'
    dCdW.name = ('Conv3D_dCdW(dCdH=' + dCdH_name + ',V=' + V_name +
                 ',W=' + W_name + ')')
    dCdb.name = ('Conv3D_dCdb(dCdH=' + dCdH_name + ',V=' + V_name +
                 ',W=' + W_name + ',b=' + b_name + ')')
    return [dCdV, dCdW, dCdb, dCdd]
def grad(self, inputs, output_gradients):
    V, W, b, d = inputs
    dCdH, = output_gradients
    # TODO: make all of these ops support broadcasting of scalar b to
    # vector b, and replace the zeros_like in all their grads.
    # Make sure the broadcasting pattern of the gradient is the same
    # as that of the initial variable.
    dCdV = ConvTransp3D.convTransp3D(W, T.zeros_like(V[0, 0, 0, 0, :]),
                                     d, dCdH, V.shape[1:4])
    dCdV = T.patternbroadcast(dCdV, V.broadcastable)
    WShape = W.shape
    dCdW = ConvGrad3D.convGrad3D(V, d, WShape, dCdH)
    dCdW = T.patternbroadcast(dCdW, W.broadcastable)
    dCdb = T.sum(dCdH, axis=(0, 1, 2, 3))
    dCdb = T.patternbroadcast(dCdb, b.broadcastable)
    dCdd = None  # not differentiable, since d is not continuous
    if 'name' in dir(dCdH) and dCdH.name is not None:
        dCdH_name = dCdH.name
    else:
        dCdH_name = 'anon'
    if 'name' in dir(V) and V.name is not None:
        V_name = V.name
    else:
        V_name = 'anon'
    if 'name' in dir(W) and W.name is not None:
        W_name = W.name
    else:
        W_name = 'anon'
    if 'name' in dir(b) and b.name is not None:
        b_name = b.name
    else:
        b_name = 'anon'
    dCdV.name = 'Conv3D_dCdV.dCdH=' + dCdH_name + ',V=' + V_name
    dCdW.name = ('Conv3D_dCdW.dCdH=' + dCdH_name + ',V=' + V_name +
                 ',W=' + W_name)
    dCdb.name = ('Conv3D_dCdb.dCdH=' + dCdH_name + ',V=' + V_name +
                 ',W=' + W_name + ',b=' + b_name)
    return [dCdV, dCdW, dCdb, dCdd]
def grad(self, inputs, output_gradients):
    V, W, b, d = inputs
    dCdH, = output_gradients
    # TODO: make all of these ops support broadcasting of scalar b to
    # vector b, and replace the zeros_like in all their grads.
    dCdV = ConvTransp3D.convTransp3D(W, T.zeros_like(V[0, 0, 0, 0, :]),
                                     d, dCdH, V.shape[1:4])
    WShape = W.shape
    dCdW = ConvGrad3D.convGrad3D(V, d, WShape, dCdH)
    dCdb = T.sum(dCdH, axis=(0, 1, 2, 3))
    dCdd = None  # not differentiable, since d is not continuous
    if 'name' in dir(dCdH) and dCdH.name is not None:
        dCdH_name = dCdH.name
    else:
        dCdH_name = 'anon'
    if 'name' in dir(V) and V.name is not None:
        V_name = V.name
    else:
        V_name = 'anon'
    if 'name' in dir(W) and W.name is not None:
        W_name = W.name
    else:
        W_name = 'anon'
    if 'name' in dir(b) and b.name is not None:
        b_name = b.name
    else:
        b_name = 'anon'
    dCdV.name = 'Conv3D_dCdV.dCdH=' + dCdH_name + ',V=' + V_name
    dCdW.name = ('Conv3D_dCdW.dCdH=' + dCdH_name + ',V=' + V_name +
                 ',W=' + W_name)
    dCdb.name = ('Conv3D_dCdb.dCdH=' + dCdH_name + ',V=' + V_name +
                 ',W=' + W_name + ',b=' + b_name)
    return [dCdV, dCdW, dCdb, dCdd]
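# The patternbroadcast calls in the Conv3D grads above reassert a known
# broadcastable pattern on a gradient whose pattern an intermediate op
# may have dropped. A minimal sketch of the mechanism, with made-up
# shapes:
import numpy
import theano
import theano.tensor as T

x = T.matrix('x')                        # broadcastable == (False, False)
xb = T.patternbroadcast(x, (True, False))
print(xb.broadcastable)                  # (True, False)

# xb now broadcasts along axis 0; the shape is checked at runtime,
# so passing an input with shape[0] != 1 would raise an error.
f = theano.function([x], xb + T.zeros((3, 2)))
print(f(numpy.ones((1, 2))))             # (3, 2) array of ones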
def grad(self, inputs, output_gradients):
    W, b, d, H, RShape = inputs
    dCdR, = output_gradients
    dCdH = theano.tensor.nnet.conv3D(dCdR, W,
                                     T.zeros_like(H[0, 0, 0, 0, :]), d)
    WShape = W.shape
    dCdW = theano.tensor.nnet.convGrad3D(dCdR, d, WShape, H)
    dCdb = T.sum(dCdR, axis=(0, 1, 2, 3))
    # not differentiable, since d affects the output elements
    dCdd = grad_undefined(self, 2, d)
    # disconnected, since RShape just determines the output shape
    dCdRShape = DisconnectedType()()
    if 'name' in dir(dCdR) and dCdR.name is not None:
        dCdR_name = dCdR.name
    else:
        dCdR_name = 'anon_dCdR'
    if 'name' in dir(H) and H.name is not None:
        H_name = H.name
    else:
        H_name = 'anon_H'
    if 'name' in dir(W) and W.name is not None:
        W_name = W.name
    else:
        W_name = 'anon_W'
    if 'name' in dir(b) and b.name is not None:
        b_name = b.name
    else:
        b_name = 'anon_b'
    dCdW.name = ('ConvTransp3D_dCdW.H=' + H_name +
                 ',dCdR=' + dCdR_name + ',W=' + W_name)
    dCdb.name = ('ConvTransp3D_dCdb.H=' + H_name + ',dCdR=' + dCdR_name +
                 ',W=' + W_name + ',b=' + b_name)
    dCdH.name = 'ConvTransp3D_dCdH.H=' + H_name + ',dCdR=' + dCdR_name
    return [dCdW, dCdb, dCdd, dCdH, dCdRShape]
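# The DisconnectedType()() returned for RShape above tells theano.grad
# that the cost is structurally independent of that input. A minimal
# sketch of the caller-side behavior, with made-up variables:
import theano
import theano.tensor as T

x = T.vector('x')
z = T.vector('z')                  # never used in the cost
cost = (x ** 2).sum()
# the default disconnected_inputs='raise' raises DisconnectedInputError;
# 'ignore' returns a zero gradient for the disconnected input instead
g = theano.grad(cost, z, disconnected_inputs='ignore')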