def fprop(self, input):

    # we reduce the precision of parameters for the computations
    self.w_comp = apply_format(self.format, self.W, self.comp_precision, self.w_range)
    self.b_comp = apply_format(self.format, self.b, self.comp_precision, self.b_range)

    input = input.reshape(self.image_shape)

    # convolution
    input_shuffled = input.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
    filters_shuffled = self.w_comp.dimshuffle(1, 2, 3, 0) * self.scale  # bc01 to c01b
    conv_op = FilterActs(stride=self.filter_stride, partial_sum=self.partial_sum, pad=self.zero_pad)
    contiguous_input = gpu_contiguous(input_shuffled)
    contiguous_filters = gpu_contiguous(filters_shuffled)
    conv_out_shuffled = conv_op(contiguous_input, contiguous_filters)

    # downsample each feature map individually, using maxpooling
    # pooled_out = downsample.max_pool_2d(input=conv_out,
    #                                     ds=poolsize, ignore_border=True)
    pool_op = MaxPool(ds=self.pool_shape, stride=self.pool_stride)
    pooled_out_shuffled = pool_op(conv_out_shuffled)
    pooled_out = pooled_out_shuffled.dimshuffle(3, 0, 1, 2)  # c01b to bc01

    # bias
    pooled_out = apply_format(self.format,
                              pooled_out + self.b_comp.dimshuffle('x', 0, 'x', 'x') * self.scale,
                              self.comp_precision, self.z_range)

    # activation
    pooled_out = self.activation(pooled_out)
    pooled_out = apply_format(self.format, pooled_out.flatten(2), self.comp_precision, self.y_range)

    return pooled_out
def dropout_fprop(self, input):

    # we reduce the precision of parameters for the computations
    self.fixed_W = apply_format(self.format, self.W, self.comp_precision, self.w_range)
    self.fixed_b = apply_format(self.format, self.b, self.comp_precision, self.b_range)

    # create the dropout mask
    # The cast is important because
    # int * float32 = float64 which pulls things off the gpu
    srng = T.shared_randomstreams.RandomStreams(self.rng.randint(999999))
    self.mask = T.cast(srng.binomial(n=1, p=self.p, size=T.shape(input)), theano.config.floatX)

    # apply the mask
    self.fixed_x = input * self.mask

    # weighted sum
    self.z = T.dot(self.fixed_x, self.fixed_W) + self.fixed_b
    self.fixed_z = apply_format(self.format, self.z, self.comp_precision, self.z_range)

    # activation
    self.y = self.activation(self.fixed_z)
    self.fixed_y = apply_format(self.format, self.y, self.comp_precision, self.y_range)

    # return the output
    return self.fixed_y
def parameter_updates(self, LR, M):

    # compute updates
    new_update_W = apply_format(self.format,
                                M * self.update_W - LR * self.w_LR_scale * self.fixed_dEdW,
                                self.comp_precision, self.update_w_range)
    new_update_b = apply_format(self.format,
                                M * self.update_b - LR * self.b_LR_scale * self.fixed_dEdb,
                                self.comp_precision, self.update_b_range)

    # compute new parameters. Note that we use a better precision than the other operations
    new_W = apply_format(self.format, self.W + new_update_W, self.update_precision, self.w_range)
    new_b = apply_format(self.format, self.b + new_update_b, self.update_precision, self.b_range)

    # L2 column constraint on W
    col_norms = T.sqrt(T.sum(T.sqr(new_W), axis=0))
    # col_norms = T.max(new_W, axis=0)
    desired_norms = T.clip(col_norms, 0, self.max_col_norm)  # clip = saturate below min and beyond max
    new_W = apply_format(self.format, new_W * (desired_norms / (1e-7 + col_norms)),
                         self.update_precision, self.w_range)
    # for some reason, works better than
    # new_W = new_W * (desired_norms / col_norms)
    # It may be a kind of regularization

    # return the updates of shared variables
    updates = []
    updates.append((self.W, new_W))
    updates.append((self.b, new_b))
    updates.append((self.update_W, new_update_W))
    updates.append((self.update_b, new_update_b))

    return updates
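# Hedged usage sketch: the per-layer update lists returned by parameter_updates
# would typically be concatenated across layers and handed to theano.function
# to build a single training step. The names `mlp`, `x`, `t` and `cost` below
# are hypothetical placeholders, not taken from this file.
#
# lr = T.scalar('lr')    # learning rate
# mom = T.scalar('mom')  # momentum
# updates = []
# for layer in mlp.layers:
#     updates += layer.parameter_updates(lr, mom)
# train_step = theano.function(inputs=[x, t, lr, mom], outputs=cost, updates=updates)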
def bprop(self, dEdy):

    self.fixed_dEdy = apply_format(self.format, dEdy, self.comp_precision, self.dEdy_range)

    # activation
    self.activation_bprop()

    # compute gradients of parameters
    self.fixed_dEdW = apply_format(self.format,
                                   T.grad(cost=None, wrt=[self.fixed_W],
                                          known_grads={self.z: self.fixed_dEdz})[0],
                                   self.comp_precision, self.dEdw_range)
    self.fixed_dEdb = apply_format(self.format,
                                   T.grad(cost=None, wrt=[self.fixed_b],
                                          known_grads={self.z: self.fixed_dEdz})[0],
                                   self.comp_precision, self.dEdb_range)

    # weighted sum
    dEdx = T.grad(cost=None, wrt=[self.fixed_x], known_grads={self.z: self.fixed_dEdz})[0]

    # apply mask
    dEdx = self.mask * dEdx

    return dEdx
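# Minimal stand-alone illustration (hypothetical names, not from this file) of the
# known_grads mechanism used in bprop above: instead of differentiating a scalar
# cost, T.grad is given a pre-computed (here, precision-reduced) gradient for an
# intermediate node and backpropagates from that node to parameters and inputs.

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
W = theano.shared(np.zeros((3, 4), dtype=theano.config.floatX), name='W')
z = T.dot(x, W)                                    # intermediate node
dEdz = T.matrix('dEdz')                            # externally supplied gradient w.r.t. z
dEdW = T.grad(cost=None, wrt=[W], known_grads={z: dEdz})[0]
dEdx = T.grad(cost=None, wrt=[x], known_grads={z: dEdz})[0]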
def dropout_fprop(self, input):

    # we reduce the precision of parameters for the computations
    self.fixed_W = apply_format(self.format, self.W, self.comp_precision, self.w_range)
    self.fixed_b = apply_format(self.format, self.b, self.comp_precision, self.b_range)

    # create the dropout mask
    # The cast is important because
    # int * float32 = float64 which pulls things off the gpu
    srng = T.shared_randomstreams.RandomStreams(self.rng.randint(999999))
    self.mask = T.cast(srng.binomial(n=1, p=self.p, size=T.shape(input)), theano.config.floatX)

    input = input * self.mask
    self.fixed_x = input.reshape(self.image_shape)

    # convolution
    input_shuffled = self.fixed_x.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
    filters_shuffled = self.fixed_W.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
    conv_op = FilterActs(stride=self.filter_stride, partial_sum=self.partial_sum, pad=self.zero_pad)
    # augment partial sum -> use less memory but slower
    contiguous_input = gpu_contiguous(input_shuffled)
    contiguous_filters = gpu_contiguous(filters_shuffled)
    conv_out_shuffled = conv_op(contiguous_input, contiguous_filters)

    self.z = conv_out_shuffled.dimshuffle(3, 0, 1, 2)  # c01b to bc01
    self.fixed_z = apply_format(self.format, self.z, self.comp_precision, self.z_range)
    conv_out_shuffled = self.fixed_z.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
    conv_out_shuffled = gpu_contiguous(conv_out_shuffled)

    # downsample each feature map individually, using maxpooling
    # pooled_out = downsample.max_pool_2d(input=conv_out,
    #                                     ds=poolsize, ignore_border=True)
    pool_op = MaxPool(ds=self.pool_shape, stride=self.pool_stride)
    pooled_out_shuffled = pool_op(conv_out_shuffled)
    pooled_out = pooled_out_shuffled.dimshuffle(3, 0, 1, 2)  # c01b to bc01

    # bias
    self.u = pooled_out + self.fixed_b.dimshuffle('x', 0, 'x', 'x')
    self.fixed_u = apply_format(self.format, self.u, self.comp_precision, self.z_range)

    # activation
    self.y = self.activation(self.fixed_u).flatten(2)
    self.fixed_y = apply_format(self.format, self.y, self.comp_precision, self.y_range)

    return self.fixed_y
def fprop(self, input):

    # we reduce the precision of parameters for the computations
    self.w_comp = apply_format(self.format, self.W, self.comp_precision, self.w_range)
    self.b_comp = apply_format(self.format, self.b, self.comp_precision, self.b_range)

    # scaled weighted sum
    self.z = apply_format(self.format,
                          T.dot(input, self.w_comp * self.scale) + self.b_comp * self.scale,
                          self.comp_precision, self.z_range)

    # activation
    self.y = apply_format(self.format, self.activation(self.z), self.comp_precision, self.y_range)

    # return the output
    return self.y
def bprop(self, dEdy):

    self.fixed_dEdy = apply_format(self.format, dEdy.reshape(self.output_shape),
                                   self.comp_precision, self.dEdy_range)

    fixed_dEdu = apply_format(self.format,
                              T.grad(cost=None, wrt=[self.fixed_u],
                                     known_grads={self.y: self.fixed_dEdy})[0],
                              self.comp_precision, self.dEdz_range)
    self.fixed_dEdb = apply_format(self.format,
                                   T.grad(cost=None, wrt=[self.fixed_b],
                                          known_grads={self.u: fixed_dEdu})[0],
                                   self.comp_precision, self.dEdb_range)
    self.fixed_dEdz = apply_format(self.format,
                                   T.grad(cost=None, wrt=[self.fixed_z],
                                          known_grads={self.u: fixed_dEdu})[0],
                                   self.comp_precision, self.dEdz_range)
    self.fixed_dEdW = apply_format(self.format,
                                   T.grad(cost=None, wrt=[self.fixed_W],
                                          known_grads={self.z: self.fixed_dEdz})[0],
                                   self.comp_precision, self.dEdw_range)

    dEdx = T.grad(cost=None, wrt=[self.fixed_x], known_grads={self.z: self.fixed_dEdz})[0]
    dEdx = T.reshape(self.mask, T.shape(dEdx)) * dEdx

    return dEdx
def activation_bprop(self):

    self.fixed_dEdz = apply_format(self.format,
                                   T.grad(cost=None, wrt=[self.fixed_z],
                                          known_grads={self.y: self.fixed_dEdy})[0],
                                   self.comp_precision, self.dEdz_range)
def activation_bprop(self):

    # the output gradient is passed through unchanged,
    # apart from the conversion to the dEdz format
    self.fixed_dEdz = apply_format(self.format, self.fixed_dEdy, self.comp_precision, self.dEdz_range)
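# apply_format is used throughout the methods above but is defined elsewhere in
# the repository. Below is a minimal sketch of what such a precision-reduction
# helper could look like, assuming `precision` is the fixed-point word length in
# bits and `range` the number of integer bits; this is an assumption for
# illustration, not the repository's actual implementation.
def apply_format_sketch(format, X, precision, range):
    if format == 'FLP':                       # plain float: no quantization
        return X
    step = 2. ** (range - precision)          # smallest representable increment
    max_val = 2. ** range - step              # saturation bound
    X = T.clip(X, -max_val, max_val)          # saturate out-of-range values
    return T.round(X / step) * step           # round to the nearest multiple of step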