    def fprop(self, input):

        # we reduce the precision of parameters for the computations
        self.w_comp = apply_format(self.format, self.W, self.comp_precision,
                                   self.w_range)
        self.b_comp = apply_format(self.format, self.b, self.comp_precision,
                                   self.b_range)

        input = input.reshape(self.image_shape)

        # convolution
        input_shuffled = input.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        filters_shuffled = self.w_comp.dimshuffle(1, 2, 3, 0) * self.scale  # bc01 to c01b
        conv_op = FilterActs(stride=self.filter_stride,
                             partial_sum=self.partial_sum,
                             pad=self.zero_pad)
        contiguous_input = gpu_contiguous(input_shuffled)
        contiguous_filters = gpu_contiguous(filters_shuffled)
        conv_out_shuffled = conv_op(contiguous_input, contiguous_filters)

        # downsample each feature map individually, using maxpooling
        # pooled_out = downsample.max_pool_2d(input=conv_out,
        #                                     ds=poolsize, ignore_border=True)
        pool_op = MaxPool(ds=self.pool_shape, stride=self.pool_stride)
        pooled_out_shuffled = pool_op(conv_out_shuffled)
        pooled_out = pooled_out_shuffled.dimshuffle(3, 0, 1, 2)  # c01b to bc01

        # bias
        pooled_out = apply_format(
            self.format,
            pooled_out + self.b_comp.dimshuffle('x', 0, 'x', 'x') * self.scale,
            self.comp_precision, self.z_range)

        # activation
        pooled_out = self.activation(pooled_out)
        pooled_out = apply_format(self.format, pooled_out.flatten(2),
                                  self.comp_precision, self.y_range)

        return pooled_out
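# apply_format is defined outside this listing. Judging from its call sites it
# takes a number format, a tensor, a precision (bit count) and a per-quantity
# range, and returns the tensor quantized to that reduced-precision
# representation. Below is a minimal fixed-point sketch consistent with that
# usage; the function name, argument meanings and rounding scheme are
# assumptions for illustration, not the project's actual implementation.
import theano.tensor as T

def fixed_point(X, n_bits, int_bits):
    # Hypothetical stand-in for apply_format with a fixed-point format:
    # keep (n_bits - int_bits) fractional bits, then saturate to the
    # representable range.
    step = 2. ** (int_bits - n_bits)        # size of one quantization step
    max_val = 2. ** int_bits - step         # largest representable magnitude
    X_q = T.round(X / step) * step          # round onto the fixed-point grid
    return T.clip(X_q, -max_val, max_val)   # saturate on overflow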
    def dropout_fprop(self, input):

        # we reduce the precision of parameters for the computations
        self.fixed_W = apply_format(self.format, self.W, self.comp_precision,
                                    self.w_range)
        self.fixed_b = apply_format(self.format, self.b, self.comp_precision,
                                    self.b_range)

        # create the dropout mask
        # The cast is important because
        # int * float32 = float64 which pulls things off the gpu
        srng = T.shared_randomstreams.RandomStreams(self.rng.randint(999999))
        self.mask = T.cast(srng.binomial(n=1, p=self.p, size=T.shape(input)),
                           theano.config.floatX)

        # apply the mask
        self.fixed_x = input * self.mask

        # weighted sum
        self.z = T.dot(self.fixed_x, self.fixed_W) + self.fixed_b
        self.fixed_z = apply_format(self.format, self.z, self.comp_precision,
                                    self.z_range)

        # activation
        self.y = self.activation(self.fixed_z)
        self.fixed_y = apply_format(self.format, self.y, self.comp_precision,
                                    self.y_range)

        # return the output
        return self.fixed_y
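# Stand-alone illustration of the mask construction above, and of why the cast
# matters on a float32/GPU pipeline; the seed and shapes here are arbitrary.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

# binomial samples come back as integers, so without the cast the product
# with a float32 input would be promoted to float64 (and leave the GPU)
srng = RandomStreams(seed=42)
x = T.matrix('x')
mask = T.cast(srng.binomial(n=1, p=0.5, size=T.shape(x)), theano.config.floatX)
f = theano.function([x], x * mask)

data = np.arange(8, dtype=theano.config.floatX).reshape(2, 4)
print(f(data))  # roughly half of the entries zeroed out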
    def parameter_updates(self, LR, M):

        # compute updates
        new_update_W = apply_format(
            self.format,
            M * self.update_W - LR * self.w_LR_scale * self.fixed_dEdW,
            self.comp_precision, self.update_w_range)
        new_update_b = apply_format(
            self.format,
            M * self.update_b - LR * self.b_LR_scale * self.fixed_dEdb,
            self.comp_precision, self.update_b_range)

        # compute new parameters. Note that we use a better precision than the other operations
        new_W = apply_format(self.format, self.W + new_update_W,
                             self.update_precision, self.w_range)
        new_b = apply_format(self.format, self.b + new_update_b,
                             self.update_precision, self.b_range)

        # L2 column constraint on W
        col_norms = T.sqrt(T.sum(T.sqr(new_W), axis=0))
        # col_norms = T.max(new_W, axis=0)
        desired_norms = T.clip(col_norms, 0, self.max_col_norm)  # clip = saturate below min and beyond max
        new_W = apply_format(self.format,
                             new_W * (desired_norms / (1e-7 + col_norms)),
                             self.update_precision, self.w_range)
        # for some reason, works better than
        # new_W = new_W * (desired_norms / col_norms)
        # It may be a kind of regularization

        # return the updates of shared variables
        updates = []
        updates.append((self.W, new_W))
        updates.append((self.b, new_b))
        updates.append((self.update_W, new_update_W))
        updates.append((self.update_b, new_update_b))

        return updates
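# The column-norm clipping above is a standard max-norm constraint, and the
# (shared_variable, new_value) pairs returned are intended for
# theano.function(updates=...). Below is a self-contained sketch of just those
# two mechanisms; the shapes and the max_col_norm value are arbitrary.
import numpy as np
import theano
import theano.tensor as T

max_col_norm = 1.0
W = theano.shared(np.random.randn(4, 3).astype(theano.config.floatX))

# rescale any column whose L2 norm exceeds max_col_norm
col_norms = T.sqrt(T.sum(T.sqr(W), axis=0))
desired_norms = T.clip(col_norms, 0, max_col_norm)
new_W = W * (desired_norms / (1e-7 + col_norms))

# the updates list assigns new_W to the shared variable W on each call
constrain = theano.function([], new_W, updates=[(W, new_W)])
constrain()
# every column norm is now at most max_col_norm (up to the 1e-7 fudge factor)
print(np.sqrt((W.get_value() ** 2).sum(axis=0)))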
    def bprop(self, dEdy):

        self.fixed_dEdy = apply_format(self.format, dEdy, self.comp_precision,
                                       self.dEdy_range)

        # activation
        self.activation_bprop()

        # compute gradients of parameters
        self.fixed_dEdW = apply_format(
            self.format,
            T.grad(cost=None,
                   wrt=[self.fixed_W],
                   known_grads={self.z: self.fixed_dEdz})[0],
            self.comp_precision, self.dEdw_range)
        self.fixed_dEdb = apply_format(
            self.format,
            T.grad(cost=None,
                   wrt=[self.fixed_b],
                   known_grads={self.z: self.fixed_dEdz})[0],
            self.comp_precision, self.dEdb_range)

        # weighted sum
        dEdx = T.grad(cost=None,
                      wrt=[self.fixed_x],
                      known_grads={self.z: self.fixed_dEdz})[0]

        # apply mask
        dEdx = self.mask * dEdx

        return dEdx
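# The gradient calls above rely on T.grad's known_grads mechanism: with
# cost=None, Theano back-propagates an externally supplied gradient at an
# intermediate node instead of differentiating a scalar loss, which is what
# lets each layer consume the quantized fixed_dEdz. A minimal stand-alone
# example:
import theano
import theano.tensor as T

x = T.vector('x')
w = T.vector('w')
z = x * w                    # intermediate node
dEdz = T.vector('dEdz')      # gradient w.r.t. z supplied from outside

# propagate dEdz back to x without ever defining a scalar cost
dEdx = T.grad(cost=None, wrt=[x], known_grads={z: dEdz})[0]
f = theano.function([x, w, dEdz], dEdx, allow_input_downcast=True)
print(f([1., 2.], [3., 4.], [1., 1.]))  # [3., 4.], i.e. dEdz * dz/dx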
    def dropout_fprop(self, input):

        # we reduce the precision of parameters for the computations
        self.fixed_W = apply_format(self.format, self.W, self.comp_precision,
                                    self.w_range)
        self.fixed_b = apply_format(self.format, self.b, self.comp_precision,
                                    self.b_range)

        # create the dropout mask
        # The cast is important because
        # int * float32 = float64 which pulls things off the gpu

        srng = T.shared_randomstreams.RandomStreams(self.rng.randint(999999))
        self.mask = T.cast(srng.binomial(n=1, p=self.p, size=T.shape(input)),
                           theano.config.floatX)
        input = input * self.mask

        self.fixed_x = input.reshape(self.image_shape)

        # convolution
        input_shuffled = self.fixed_x.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        filters_shuffled = self.fixed_W.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        conv_op = FilterActs(
            stride=self.filter_stride,
            partial_sum=self.partial_sum,
            pad=self.zero_pad
        )  # augment partial sum -> use less memory but slower
        contiguous_input = gpu_contiguous(input_shuffled)
        contiguous_filters = gpu_contiguous(filters_shuffled)
        conv_out_shuffled = conv_op(contiguous_input, contiguous_filters)

        self.z = conv_out_shuffled.dimshuffle(3, 0, 1, 2)  # c01b to bc01
        self.fixed_z = apply_format(self.format, self.z, self.comp_precision,
                                    self.z_range)

        conv_out_shuffled = self.fixed_z.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        conv_out_shuffled = gpu_contiguous(conv_out_shuffled)

        # downsample each feature map individually, using maxpooling
        # pooled_out = downsample.max_pool_2d(input=conv_out,
        #                                     ds=poolsize, ignore_border=True)
        pool_op = MaxPool(ds=self.pool_shape, stride=self.pool_stride)
        pooled_out_shuffled = pool_op(conv_out_shuffled)
        pooled_out = pooled_out_shuffled.dimshuffle(3, 0, 1, 2)  # c01b to bc01

        # bias
        self.u = pooled_out + self.fixed_b.dimshuffle('x', 0, 'x', 'x')
        self.fixed_u = apply_format(self.format, self.u, self.comp_precision,
                                    self.z_range)

        # activation
        self.y = self.activation(self.fixed_u).flatten(2)
        self.fixed_y = apply_format(self.format, self.y, self.comp_precision,
                                    self.y_range)

        return self.fixed_y
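# The convolutional paths above use the pylearn2 cuda-convnet wrappers, which
# work in the channel-first 'c01b' layout and require contiguous GPU arrays;
# hence the dimshuffle calls and gpu_contiguous wrappers around the inputs and
# filters. The imports are omitted from this listing; something along these
# lines is assumed (exact module paths depend on the pylearn2/Theano versions):
import theano
import theano.tensor as T
from theano.sandbox.cuda.basic_ops import gpu_contiguous
from pylearn2.sandbox.cuda_convnet.filter_acts import FilterActs
from pylearn2.sandbox.cuda_convnet.pool import MaxPool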
    def fprop(self, input):

        # we reduce the precision of parameters for the computations
        self.w_comp = apply_format(self.format, self.W, self.comp_precision,
                                   self.w_range)
        self.b_comp = apply_format(self.format, self.b, self.comp_precision,
                                   self.b_range)

        # scaled weighted sum
        self.z = apply_format(
            self.format,
            T.dot(input, self.w_comp * self.scale) + self.b_comp * self.scale,
            self.comp_precision, self.z_range)

        # activation
        self.y = apply_format(self.format, self.activation(self.z),
                              self.comp_precision, self.y_range)

        # return the output
        return self.y
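# Unlike dropout_fprop, this fprop applies no mask but multiplies W and b by
# self.scale; presumably self.scale is tied to the dropout retention
# probability, making this the rescaled, deterministic path (e.g. for inference).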
    def bprop(self, dEdy):

        self.fixed_dEdy = apply_format(self.format,
                                       dEdy.reshape(self.output_shape),
                                       self.comp_precision, self.dEdy_range)

        fixed_dEdu = apply_format(
            self.format,
            T.grad(cost=None, wrt=[self.fixed_u],
                   known_grads={self.y: self.fixed_dEdy})[0],
            self.comp_precision, self.dEdz_range)

        self.fixed_dEdb = apply_format(
            self.format,
            T.grad(cost=None, wrt=[self.fixed_b],
                   known_grads={self.u: fixed_dEdu})[0],
            self.comp_precision, self.dEdb_range)

        self.fixed_dEdz = apply_format(
            self.format,
            T.grad(cost=None, wrt=[self.fixed_z],
                   known_grads={self.u: fixed_dEdu})[0],
            self.comp_precision, self.dEdz_range)

        self.fixed_dEdW = apply_format(
            self.format,
            T.grad(cost=None, wrt=[self.fixed_W],
                   known_grads={self.z: self.fixed_dEdz})[0],
            self.comp_precision, self.dEdw_range)

        dEdx = T.grad(cost=None, wrt=[self.fixed_x],
                      known_grads={self.z: self.fixed_dEdz})[0]

        dEdx = T.reshape(self.mask, T.shape(dEdx)) * dEdx

        return dEdx
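# Note that in this convolutional bprop the dropout mask was drawn on the
# flattened input before it was reshaped to image_shape, so it is reshaped to
# match dEdx before being applied; the fully-connected bprop above multiplies
# by the mask directly.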
    def activation_bprop(self):

        self.fixed_dEdz = apply_format(
            self.format,
            T.grad(cost=None,
                   wrt=[self.fixed_z],
                   known_grads={self.y: self.fixed_dEdy})[0],
            self.comp_precision, self.dEdz_range)
    def activation_bprop(self):

        self.fixed_dEdz = apply_format(self.format, self.fixed_dEdy,
                                       self.comp_precision, self.dEdz_range)
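# This second activation_bprop variant simply re-quantizes dEdy into the dEdz
# range without a gradient call, i.e. the case where the activation is the
# identity (dy/dz = 1), presumably used by a linear layer.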