Example #1
def compute_feedforward(Ws, BNva, X, cache, train_mode):
    dropout_cache, batchnorm_cache = cache
    N = X.shape[0]
    As = []
    Hs = []
    Zs = []

    A = dropout(X, dropout_cache[0], train_mode = train_mode)
    A = add_bias_unit(A)
    if train_mode:
        As.append(A)  

    for layer in range(len(Ws)):     
        H = A.dot(Ws[layer])
        Z, batchnorm_cache[layer] = batchnorm_forward(H, BNva[layer], batchnorm_cache[layer])

        if layer == len(Ws)-1: #last layer
            A = Z
        else:
            A = activation_function(Z)
            A = dropout(A, dropout_cache[layer+1], train_mode = train_mode)
            A = add_bias_unit(A)           

        if train_mode or layer == len(Ws) - 1:
            Hs.append(copy.deepcopy(H))
            Zs.append(copy.deepcopy(Z))
            As.append(copy.deepcopy(A)) 
        # if DEBUG:
            # print layer, ': ', A

    cache = dropout_cache, batchnorm_cache
    if train_mode:
        return As, Hs, Zs, cache
    else:
        return As[0]
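The `dropout`, `add_bias_unit`, `batchnorm_forward`, and `activation_function` helpers are not shown in Example #1. A minimal NumPy sketch of the two dropout-related helpers, assuming inverted dropout and a dict-style cache entry that stores the most recent mask (the names and cache convention are assumptions, not the original implementation):

import numpy as np

def dropout(A, cache, drop_prob=0.5, train_mode=True):
    # Inverted dropout: zero units with probability drop_prob and rescale the
    # survivors, so no extra scaling is needed at test time. The real helper
    # presumably also stores the mask in `cache` so the backward pass
    # (Example #6) can reapply it; here `cache` just holds the latest mask.
    if not train_mode or drop_prob == 0.0:
        return A
    mask = (np.random.rand(*A.shape) >= drop_prob) / (1.0 - drop_prob)
    cache['mask'] = mask
    return A * mask

def add_bias_unit(A):
    # Prepend a column of ones so the bias row of each W is applied by A.dot(W).
    return np.hstack([np.ones((A.shape[0], 1)), A])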
Example #2
    def __call__(self, inputs, state, scope=None):
        if self._input_keep_prob < 1.0:
            inputs = dropout(inputs, self._input_keep_prob, seed=self._seed)
        output, new_state = self._cell(inputs, state, scope)
        if self._output_keep_prob < 1.0:
            output = dropout(output, self._output_keep_prob, seed=self._seed)
        return output, new_state
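Example #2 closely resembles TensorFlow 1.x's `tf.nn.rnn_cell.DropoutWrapper`, which applies dropout to the inputs and outputs of a wrapped RNN cell. A hedged usage sketch (TF 1.x assumed; the cell size, keep probabilities, and placeholder shape are illustrative):

import tensorflow as tf  # TensorFlow 1.x

inputs = tf.placeholder(tf.float32, shape=(None, 20, 50))  # (batch, time, features)
cell = tf.nn.rnn_cell.BasicLSTMCell(128)
cell = tf.nn.rnn_cell.DropoutWrapper(cell,
                                     input_keep_prob=0.8,
                                     output_keep_prob=0.5,
                                     seed=42)
outputs, state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)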
Example #3
    def run(self, x, dropout=True):
        if x.ndim > 2:
            # x isn't a matrix, make it one.
            x = x.flatten(2)
        if not hasattr(self, 'dropout'):
            d = 0
        else:
            d = self.dropout

        if not dropout:
            d = 0
        out = self.activation(T.dot(x, self.w) + self.b)
        return drp.dropout(srng, out, d, (self.batch_size, self.hidden_size))
Example #4
    def run(self, x, dropout=True):
        if x.ndim > 2:
            # x isn't a matrix, make it one.
            x = x.flatten(2)
        #d = self.dropout
        if not hasattr(self, 'dropout'):
            d = 0
        else:
            d = self.dropout

        if not dropout:
            d = 0
        out = self.activation(T.dot(x, self.w) + self.b)
        return drp.dropout(srng, out, d, (out.shape[0], self.hidden_size))
Example #5
    def run(self, x, dropout=True):
        if x.ndim > 2:
            x = x.flatten(2)
        if not hasattr(self, 'dropout'):
            d = 0
        else:
            d = self.dropout

        if not dropout:
            d = 0
        out = self.activation(T.dot(x, self.w) + self.b)
        out = drp.dropout(srng, out, d, (out.shape[0], self.classes))
        prob = T.nnet.softmax(out)
        pred = T.argmax(prob, axis=1)
        return prob, pred
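The `drp.dropout(srng, x, p, shape)` helper used by the `run()` methods in Examples #3-#5 is not shown. A minimal Theano stand-in, assuming `p` is the drop probability, `srng` is a Theano random stream, and inverted scaling is applied at train time (the real module may differ):

import theano
from theano.sandbox.rng_mrg import MRG_RandomStreams

srng = MRG_RandomStreams(seed=1234)

def dropout(srng, x, p, shape):
    # Zero each unit with probability p and rescale survivors by 1/(1-p),
    # keeping the expected activation unchanged (inverted dropout).
    if p <= 0:
        return x
    retain = 1.0 - p
    mask = srng.binomial(size=shape, n=1, p=retain, dtype=theano.config.floatX)
    return x * mask / retain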
Example #6
def compute_feedbackward(W, BNva, As, Hs, Zs, Y, caches):
    dropout_cache, batchnorm_cache = caches
 
    sz = Y.shape[0]
    
    delta = As[-1] - Y
    dz = delta #no activation function at last layer
    dh, bn_grad = batchnorm_backward(Hs[-1], BNva[-1], dz, batchnorm_cache[-1])
    grad = As[-2].T.dot(dh) / sz
    w_grads = [grad]
    bn_grads = [bn_grad]

    for l in range(1, len(W)):
        delta = dh.dot(W[-l][1:].T)
        delta = dropout(delta, dropout_cache[-l], train_mode = True)
        dz = np.multiply(delta, compute_grad_actfunc(Zs[-l]))
        dh, bn_grad = batchnorm_backward(Hs[-l-1], BNva[-l-1], dz, batchnorm_cache[-l-1])
        bn_grads.append(bn_grad)
        w_grad = As[-l-2].T.dot(dh) / sz
        w_grads.append(w_grad)

    return w_grads, bn_grads
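`compute_feedbackward` builds `w_grads` and `bn_grads` from the last layer backwards. A minimal training-step sketch tying Examples #1 and #6 together (hypothetical glue code; the plain-SGD update and the handling of `bn_grads` are assumptions):

def train_step(Ws, BNva, X, Y, cache, lr=0.01):
    # Forward pass in train mode returns activations plus refreshed caches.
    As, Hs, Zs, cache = compute_feedforward(Ws, BNva, X, cache, train_mode=True)
    # Backward pass; the gradient lists come back last-layer-first.
    w_grads, bn_grads = compute_feedbackward(Ws, BNva, As, Hs, Zs, Y, cache)
    for layer, w_grad in enumerate(reversed(w_grads)):
        Ws[layer] -= lr * w_grad
    # bn_grads would update BNva analogously; the exact layout depends on
    # batchnorm_backward, which is not shown here.
    return Ws, BNva, cache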
Example #7
    def __init__(self, inp, n_labels, n_hidden_previous, update_fn,
                 training=None, keep_prob=None):
        if type(inp) == list:
            self.input = T.concatenate(inp)
            input_size = len(inp) * n_hidden_previous
        else:
            self.input = inp
            input_size = n_hidden_previous

        if training is not None:
            assert keep_prob is not None
            self.input = dropout(self.input, training, keep_prob)

        self.update_fn = update_fn

        # input -> hidden (sized somewhere between size of input & softmax)
        n_hidden = int(math.sqrt(input_size * n_labels))
        print("concat sizing %s -> %s -> %s" % (input_size, n_hidden, n_labels))
        self.Wih = util.sharedMatrix(input_size, n_hidden, 'Wih')
        self.bh = util.shared(util.zeros((1, n_hidden)), 'bh')
        # hidden -> softmax
        self.Whs = util.sharedMatrix(n_hidden, n_labels, 'Whs')
        self.bs = util.shared(util.zeros((1, n_labels)), 'bs')
Example #8
    def create_feedforward_classifier_model(self):
        """
        Creates:
        self.input_images
        self.labels
        self.lr
        self.preds - pre-softmax predictions
        self.loss
        self.accuracy
        self.grads
        self.train_op
        self.change_weights_op
        """
        ## input placeholders
        self.input_images = tf.placeholder(tf.float32, shape=(None,self.cfg.input_dim), name='input_images')
        self.labels       = tf.placeholder(tf.int32,   shape=(None,), name='labels')
        self.lr           = tf.placeholder(tf.float32, shape=(), name='lr')
        self.keep_prob    = tf.placeholder(tf.float32, shape=(), name='keep_prob')
        self.use_past_bt  = tf.placeholder(tf.bool,    shape=(), name='use_past_bt') # to pass previous dropout mask
        self.h1_past_bt   = tf.placeholder(tf.float32, shape=(None, self.cfg.h1_dim), name='h1_past_bt')
        self.h2_past_bt   = tf.placeholder(tf.float32, shape=(None, self.cfg.h2_dim), name='h2_past_bt')

        ## forward pass, note how this is pre-softmax
        h1 = layers.fully_connected(self.input_images, num_outputs=self.cfg.h1_dim, activation_fn=tf.nn.relu,
                                    biases_initializer=layers.initializers.xavier_initializer(), scope='h1')
        h1, self.h1_binary_tensor = tf.cond(self.use_past_bt, lambda: [math_ops.div(h1,self.cfg.keep_prob)*self.h1_past_bt, self.h1_past_bt],
                                            lambda: dropout(h1, keep_prob=self.cfg.keep_prob))
        #h1, self.h1_binary_tensor = tf.cond(self.use_past_bt, lambda: [math_ops.div(h1,self.keep_prob)*self.h1_past_bt, self.h1_past_bt],
        #                                    lambda: dropout(h1, keep_prob=self.keep_prob))
        h2 = layers.fully_connected(h1, num_outputs=self.cfg.h2_dim, activation_fn=tf.nn.relu,
                                    biases_initializer=layers.initializers.xavier_initializer(), scope='h2')
        h2, self.h2_binary_tensor = tf.cond(self.use_past_bt, lambda: [math_ops.div(h2,self.cfg.keep_prob)*self.h2_past_bt, self.h2_past_bt],
                                            lambda: dropout(h2, keep_prob=self.cfg.keep_prob))
        #h2, self.h2_binary_tensor = tf.cond(self.use_past_bt, lambda: [math_ops.div(h2,self.keep_prob)*self.h2_past_bt, self.h2_past_bt],
        #                                    lambda: dropout(h2, keep_prob=self.keep_prob))
        self.h2 = h2
        self.preds = layers.fully_connected(h2, num_outputs=self.cfg.output_dim, activation_fn=None,
                                            biases_initializer=layers.initializers.xavier_initializer(), scope='preds')

        ## loss and accuracy
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.preds, labels=self.labels)
        self.loss = tf.reduce_mean(loss, name='loss', axis=None)
        self.accuracy = tf.contrib.metrics.accuracy(labels=self.labels, predictions=tf.to_int32(tf.argmax(self.preds, axis=1)))

        ## training op
        if self.cfg.optimizer=='kalpit':
            optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.lr) # can set lr every minibatch
        elif self.cfg.optimizer=='sgd':
            optimizer = tf.train.MomentumOptimizer(learning_rate=self.cfg.learning_rate, momentum=self.cfg.momentum, 
                                                   use_nesterov=self.cfg.nesterov)
        elif self.cfg.optimizer=='adam':
            optimizer = tf.train.AdamOptimizer(learning_rate=self.cfg.learning_rate, beta1=self.cfg.beta1, beta2=self.cfg.beta2)
        elif self.cfg.optimizer=='adadelta':
            optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.cfg.learning_rate, rho=self.cfg.rho)
        gvs = optimizer.compute_gradients(self.loss, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))
        self.grads, vrbs = zip(*gvs)
        self.train_op = optimizer.apply_gradients(gvs)

        ### op to just apply passed gradients
        self.h1_W_grad = tf.placeholder(tf.float32, shape=(self.cfg.input_dim,self.cfg.h1_dim), name='h1_W_grad')
        self.h1_b_grad = tf.placeholder(tf.float32, shape=(self.cfg.h1_dim,), name='h1_b_grad')
        self.h2_W_grad = tf.placeholder(tf.float32, shape=(self.cfg.h1_dim,self.cfg.h2_dim), name='h2_W_grad')
        self.h2_b_grad = tf.placeholder(tf.float32, shape=(self.cfg.h2_dim,), name='h2_b_grad')
        self.preds_W_grad = tf.placeholder(tf.float32, shape=(self.cfg.h2_dim,self.cfg.output_dim), name='preds_W_grad')
        self.preds_b_grad = tf.placeholder(tf.float32, shape=(self.cfg.output_dim,), name='preds_b_grad')
        passed_grads = [self.h1_W_grad, self.h1_b_grad,
                        self.h2_W_grad, self.h2_b_grad,
                        self.preds_W_grad, self.preds_b_grad]
        passed_gvs = zip(passed_grads, vrbs)
        self.change_weights_op = optimizer.apply_gradients(passed_gvs)
    
        ## do L2
        target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)[-2:]
        assignments = []
        assignments.append(target_vars[0].assign(self.preds_W_grad))# abusing use of preds_W_grad
        assignments.append(target_vars[1].assign(self.preds_b_grad))# abusing use of preds_b_grad
        self.assign_last_layer = tf.group(*assignments)
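The `dropout` call in this graph apparently returns both the dropped-out tensor and its binary mask, and the `tf.cond` on `use_past_bt` replays a mask saved from an earlier pass. A NumPy illustration of that mask-reuse idea (not the TF graph above; the function name is illustrative):

import numpy as np

def dropout_with_mask(h, keep_prob, past_bt=None):
    # Fresh pass: sample a binary mask; replay pass: reuse the saved one.
    # Either way, rescale by 1/keep_prob (inverted dropout) and return the mask.
    if past_bt is None:
        past_bt = (np.random.rand(*h.shape) < keep_prob).astype(h.dtype)
    return h / keep_prob * past_bt, past_bt

h1 = np.random.randn(4, 8).astype(np.float32)
out_a, bt = dropout_with_mask(h1, keep_prob=0.5)             # samples a new mask
out_b, _ = dropout_with_mask(h1, keep_prob=0.5, past_bt=bt)  # replays the same mask
assert np.allclose(out_a, out_b)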
Example #9
    def create_convnet_classifier_model(self):
        """
        Creates:
        self.input_images
        self.labels
        self.lr
        self.preds - pre-softmax predictions
        self.loss
        self.accuracy
        self.grads
        self.train_op
        self.change_weights_op
        """
        ## input placeholders
        self.input_images = tf.placeholder(tf.float32, shape=(None,self.cfg.input_height,self.cfg.input_width,self.cfg.input_nchannels), 
                                           name='input_images')
        self.labels       = tf.placeholder(tf.int32,   shape=(None,), name='labels')
        self.lr           = tf.placeholder(tf.float32, shape=(), name='lr')
        self.use_past_bt   = tf.placeholder(tf.bool, shape=(), name='use_past_bt')
        self.input_past_bt = tf.placeholder(tf.float32, shape=(None,self.cfg.input_height,self.cfg.input_width,self.cfg.input_nchannels),
                                            name='input_past_bt') # past binary tensor
        self.fc4_past_bt   = tf.placeholder(tf.float32, shape=(None,1000),
                                            name='fc4_past_bt') # past binary tensor
        
        ## forward pass, note how this is pre-softmax
        dropout_input_images, self.input_binary_tensor = tf.cond(self.use_past_bt, 
                                               lambda: [math_ops.div(self.input_images,self.cfg.keep_prob)*self.input_past_bt, self.input_past_bt],
                                               lambda: dropout(self.input_images, keep_prob=self.cfg.keep_prob))
        conv1 = layers.convolution2d(dropout_input_images, num_outputs=64, kernel_size=(5,5), stride=(1,1), 
                                     padding='SAME', biases_initializer=layers.initializers.xavier_initializer(), 
                                     activation_fn=tf.nn.relu, scope='conv1')
        pool1 = tf.nn.max_pool(conv1, ksize=(1,3,3,1), strides=(1,2,2,1), padding='SAME')
        conv2 = layers.convolution2d(pool1, num_outputs=64, kernel_size=(5,5), stride=(1,1), 
                                     padding='SAME', biases_initializer=layers.initializers.xavier_initializer(), 
                                     activation_fn=tf.nn.relu, scope='conv2')
        pool2 = tf.nn.max_pool(conv2, ksize=(1,3,3,1), strides=(1,2,2,1), padding='SAME')
        conv3 = layers.convolution2d(pool2, num_outputs=128, kernel_size=(5,5), stride=(1,1), 
                                     padding='SAME', biases_initializer=layers.initializers.xavier_initializer(), 
                                     activation_fn=tf.nn.relu, scope='conv3')
        pool3 = tf.nn.max_pool(conv3, ksize=(1,3,3,1), strides=(1,2,2,1), padding='SAME')
        pool3_flat = layers.flatten(pool3)
    
        fc4 = layers.fully_connected(pool3_flat, num_outputs=1000, activation_fn=tf.nn.relu,
                                     biases_initializer=layers.initializers.xavier_initializer(), scope='fc4')
        fc4, self.fc4_binary_tensor = tf.cond(self.use_past_bt, 
                                              lambda: [math_ops.div(fc4,self.cfg.keep_prob)*self.fc4_past_bt, self.fc4_past_bt],
                                              lambda: dropout(fc4, keep_prob=self.cfg.keep_prob))
        fc5 = layers.fully_connected(fc4, num_outputs=self.cfg.output_dim, activation_fn=None,
                                     biases_initializer=layers.initializers.xavier_initializer(), scope='fc5')
        self.preds = fc5

        ## loss and accuracy
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.preds, labels=self.labels)
        self.loss = tf.reduce_mean(loss, name='loss', axis=None)
        self.accuracy = tf.contrib.metrics.accuracy(labels=self.labels, predictions=tf.to_int32(tf.argmax(self.preds, axis=1)))

        ## training op
        if self.cfg.optimizer=='kalpit':
            #optimizer = tf.train.MomentumOptimizer(learning_rate=self.lr, momentum=self.cfg.momentum, 
            #                                       use_nesterov=self.cfg.nesterov)
            optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.lr) # can set lr every minibatch
        elif self.cfg.optimizer=='sgd':
            optimizer = tf.train.MomentumOptimizer(learning_rate=self.cfg.learning_rate, momentum=self.cfg.momentum, 
                                                   use_nesterov=self.cfg.nesterov)
        elif self.cfg.optimizer=='adam':
            optimizer = tf.train.AdamOptimizer(learning_rate=self.cfg.learning_rate, beta1=self.cfg.beta1, beta2=self.cfg.beta2,
                                               epsilon=self.cfg.epsilon)
        elif self.cfg.optimizer=='adadelta':
            optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.cfg.learning_rate, rho=self.cfg.rho)
        gvs = optimizer.compute_gradients(self.loss, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))
        self.grads, vrbs = zip(*gvs)
        self.train_op = optimizer.apply_gradients(gvs)

        ### op to just apply passed gradients
        self.conv1_W_grad = tf.placeholder(tf.float32, shape=(5,5,self.cfg.input_nchannels,64), name='conv1_W_grad')
        self.conv1_b_grad = tf.placeholder(tf.float32, shape=(64,), name='conv1_b_grad')
        self.conv2_W_grad = tf.placeholder(tf.float32, shape=(5,5,64,64), name='conv2_W_grad')
        self.conv2_b_grad = tf.placeholder(tf.float32, shape=(64,), name='conv2_b_grad')
        self.conv3_W_grad = tf.placeholder(tf.float32, shape=(5,5,64,128), name='conv3_W_grad')
        self.conv3_b_grad = tf.placeholder(tf.float32, shape=(128,), name='conv3_b_grad')
        self.fc4_W_grad = tf.placeholder(tf.float32, shape=(2048,1000), name='fc4_W_grad')
        self.fc4_b_grad = tf.placeholder(tf.float32, shape=(1000,), name='fc4_b_grad')
        self.fc5_W_grad = tf.placeholder(tf.float32, shape=(1000,self.cfg.output_dim), name='fc5_W_grad')
        self.fc5_b_grad = tf.placeholder(tf.float32, shape=(self.cfg.output_dim,), name='fc5_b_grad')

        passed_grads = [self.conv1_W_grad, self.conv1_b_grad,
                        self.conv2_W_grad, self.conv2_b_grad,
                        self.conv3_W_grad, self.conv3_b_grad,
                        self.fc4_W_grad,   self.fc4_b_grad,
                        self.fc5_W_grad,   self.fc5_b_grad]
        passed_gvs = zip(passed_grads, vrbs)
        self.change_weights_op = optimizer.apply_gradients(passed_gvs)
Example #10
    def create_feedforward_classifier_model(self):
        """
        Creates:
        self.input_images
        self.labels
        self.lr
        self.preds - pre-softmax predictions
        self.loss
        self.accuracy
        self.grads
        self.train_op
        self.change_weights_op
        """
        ## input placeholders
        self.input_images = tf.placeholder(tf.float32, shape=(None,self.cfg.input_dim), name='input_images')
        self.labels       = tf.placeholder(tf.int32,   shape=(None,), name='labels')
        self.lr           = tf.placeholder(tf.float32, shape=(), name='lr')
        self.max_lr       = tf.placeholder(tf.float32, shape=(), name='max_lr')
        self.keep_prob    = tf.placeholder(tf.float32, shape=(), name='keep_prob')
        self.use_past_bt  = tf.placeholder(tf.bool,    shape=(), name='use_past_bt') # to pass previous dropout mask
        self.h1_past_bt   = tf.placeholder(tf.float32, shape=(None, self.cfg.h1_dim), name='h1_past_bt')
        self.h2_past_bt   = tf.placeholder(tf.float32, shape=(None, self.cfg.h2_dim), name='h2_past_bt')

        ## forward pass, note how this is pre-softmax
        h1 = layers.fully_connected(self.input_images, num_outputs=self.cfg.h1_dim, activation_fn=tf.nn.relu,
                                    biases_initializer=layers.initializers.xavier_initializer(), scope='h1')
        h1, self.h1_binary_tensor = tf.cond(self.use_past_bt, lambda: [math_ops.div(h1,self.keep_prob)*self.h1_past_bt, self.h1_past_bt],
                                            lambda: dropout(h1, keep_prob=self.keep_prob))
        h2 = layers.fully_connected(h1, num_outputs=self.cfg.h2_dim, activation_fn=tf.nn.relu,
                                    biases_initializer=layers.initializers.xavier_initializer(), scope='h2')
        h2, self.h2_binary_tensor = tf.cond(self.use_past_bt, lambda: [math_ops.div(h2,self.keep_prob)*self.h2_past_bt, self.h2_past_bt],
                                            lambda: dropout(h2, keep_prob=self.keep_prob))
        self.h2 = h2
        self.preds = layers.fully_connected(h2, num_outputs=self.cfg.output_dim, activation_fn=None,
                                            biases_initializer=layers.initializers.xavier_initializer(), scope='preds')

        ## loss and accuracy
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.preds, labels=self.labels)
        self.loss = tf.reduce_mean(loss, name='loss', axis=None)
        self.accuracy = tf.contrib.metrics.accuracy(labels=self.labels, predictions=tf.to_int32(tf.argmax(self.preds, axis=1)))

        ## training op
        if self.cfg.optimizer=='kalpit':
            optimizer = tf.train.MomentumOptimizer(learning_rate=1.0, momentum=self.cfg.momentum, use_nesterov=False) # can set lr every minibatch
            self.initialize_directions()
        elif self.cfg.optimizer=='sgd':
            optimizer = tf.train.MomentumOptimizer(learning_rate=self.cfg.learning_rate, momentum=self.cfg.momentum, 
                                                   use_nesterov=self.cfg.nesterov)
        elif self.cfg.optimizer=='adam':
            optimizer = tf.train.AdamOptimizer(learning_rate=self.cfg.learning_rate, beta1=self.cfg.beta1, beta2=self.cfg.beta2)
        elif self.cfg.optimizer=='adadelta':
            optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.cfg.learning_rate, rho=self.cfg.rho)
        gvs = optimizer.compute_gradients(self.loss, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))
        self.grads, vrbs = zip(*gvs)
        self.train_op = optimizer.apply_gradients(gvs)

        ## dixit training op
        if self.cfg.optimizer=='kalpit':
            moms = [optimizer.get_slot(x, 'momentum') for x in tf.trainable_variables()]
            self.direction = [self.cfg.momentum*moms[i] + self.grads[i] for i in range(len(moms))]
            gT_g = tf.add_n([tf.reduce_sum(tf.square(g)) for g in self.grads])
            gT_d = tf.add_n([tf.reduce_sum(tf.multiply(self.grads[i], self.direction[i])) for i in range(len(self.grads))])
            self.gT_d = gT_d
            dT_d = tf.add_n([tf.reduce_sum(tf.square(d)) for d in self.direction])
            #self.lr = self.loss / tf.sqrt(gT_g) / tf.sqrt(dT_d)
            #self.lr = tf.minimum(self.lr, self.max_lr)
            self.lr = (1-self.max_lr)*self.loss/tf.sqrt(gT_g)/tf.sqrt(dT_d)*tf.sign(gT_d) # we want loss-->gamma*loss
            #self.lr = (1-self.max_lr)*self.loss/gT_d # we want loss-->gamma*loss
            self.dixit_train_op = optimizer.apply_gradients(zip([self.lr*d for d in self.direction], vrbs))
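For the 'kalpit' optimizer the learning rate is recomputed every minibatch from the current loss, the gradient, and the momentum slot. A NumPy sketch of that step-size formula for flattened parameter vectors (an illustration of the arithmetic only, not the TF graph):

import numpy as np

def kalpit_lr(loss, g, m, momentum, max_lr):
    # Search direction combines the momentum slot with the fresh gradient.
    d = momentum * m + g
    gT_g = np.sum(g * g)
    gT_d = np.sum(g * d)
    dT_d = np.sum(d * d)
    # Same expression as self.lr above: to first order, a step of this size
    # along d reduces the loss by roughly (1 - max_lr) * loss.
    return (1.0 - max_lr) * loss / np.sqrt(gT_g) / np.sqrt(dT_d) * np.sign(gT_d)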
Example #11
    def create_convnet_classifier_model(self):
        """
        Creates:
        self.input_images
        self.labels
        self.lr
        self.preds - pre-softmax predictions
        self.loss
        self.accuracy
        self.grads
        self.train_op
        self.change_weights_op
        """
        ## input placeholders
        self.input_images  = tf.placeholder(tf.float32, shape=(None,self.cfg.input_height,self.cfg.input_width,self.cfg.input_nchannels), 
                                           name='input_images')
        self.labels        = tf.placeholder(tf.int32,   shape=(None,), name='labels')
        self.lr            = tf.placeholder(tf.float32, shape=(), name='lr')
        self.max_lr        = tf.placeholder(tf.float32, shape=(), name='max_lr')
        self.keep_prob    = tf.placeholder(tf.float32, shape=(), name='keep_prob')
        self.use_past_bt   = tf.placeholder(tf.bool, shape=(), name='use_past_bt')
        self.input_past_bt = tf.placeholder(tf.float32, shape=(None,self.cfg.input_height,self.cfg.input_width,self.cfg.input_nchannels),
                                            name='input_past_bt') # past binary tensor
        self.fc4_past_bt   = tf.placeholder(tf.float32, shape=(None,1000),
                                            name='fc4_past_bt') # past binary tensor
        
        ## forward pass, note how this is pre-softmax
        dropout_input_images, self.input_binary_tensor = tf.cond(self.use_past_bt, 
                                               lambda: [math_ops.div(self.input_images,self.keep_prob)*self.input_past_bt, self.input_past_bt],
                                               lambda: dropout(self.input_images, keep_prob=self.keep_prob))
        conv1 = layers.convolution2d(dropout_input_images, num_outputs=64, kernel_size=(5,5), stride=(1,1), 
                                     padding='SAME', biases_initializer=layers.initializers.xavier_initializer(), 
                                     activation_fn=tf.nn.relu, scope='conv1')
        pool1 = tf.nn.max_pool(conv1, ksize=(1,3,3,1), strides=(1,2,2,1), padding='SAME')
        conv2 = layers.convolution2d(pool1, num_outputs=64, kernel_size=(5,5), stride=(1,1), 
                                     padding='SAME', biases_initializer=layers.initializers.xavier_initializer(), 
                                     activation_fn=tf.nn.relu, scope='conv2')
        pool2 = tf.nn.max_pool(conv2, ksize=(1,3,3,1), strides=(1,2,2,1), padding='SAME')
        conv3 = layers.convolution2d(pool2, num_outputs=128, kernel_size=(5,5), stride=(1,1), 
                                     padding='SAME', biases_initializer=layers.initializers.xavier_initializer(), 
                                     activation_fn=tf.nn.relu, scope='conv3')
        pool3 = tf.nn.max_pool(conv3, ksize=(1,3,3,1), strides=(1,2,2,1), padding='SAME')
        pool3_flat = layers.flatten(pool3)
    
        fc4 = layers.fully_connected(pool3_flat, num_outputs=1000, activation_fn=tf.nn.relu,
                                     biases_initializer=layers.initializers.xavier_initializer(), scope='fc4')
        fc4, self.fc4_binary_tensor = tf.cond(self.use_past_bt, 
                                              lambda: [math_ops.div(fc4,self.keep_prob)*self.fc4_past_bt, self.fc4_past_bt],
                                              lambda: dropout(fc4, keep_prob=self.keep_prob))
        fc5 = layers.fully_connected(fc4, num_outputs=self.cfg.output_dim, activation_fn=None,
                                     biases_initializer=layers.initializers.xavier_initializer(), scope='fc5')
        self.preds = fc5

        ## loss and accuracy
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.preds, labels=self.labels)
        self.loss = tf.reduce_mean(loss, name='loss', axis=None)
        self.accuracy = tf.contrib.metrics.accuracy(labels=self.labels, predictions=tf.to_int32(tf.argmax(self.preds, axis=1)))

        ## training op
        if self.cfg.optimizer=='kalpit':
            optimizer = tf.train.MomentumOptimizer(learning_rate=1.0, momentum=self.cfg.momentum, use_nesterov=False) # can set lr every minibatch
        elif self.cfg.optimizer=='sgd':
            optimizer = tf.train.MomentumOptimizer(learning_rate=self.cfg.learning_rate, momentum=self.cfg.momentum, 
                                                   use_nesterov=self.cfg.nesterov)
        elif self.cfg.optimizer=='adam':
            optimizer = tf.train.AdamOptimizer(learning_rate=self.cfg.learning_rate, beta1=self.cfg.beta1, beta2=self.cfg.beta2,
                                               epsilon=self.cfg.epsilon)
        elif self.cfg.optimizer=='adadelta':
            optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.cfg.learning_rate, rho=self.cfg.rho)
        gvs = optimizer.compute_gradients(self.loss, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))
        self.grads, vrbs = zip(*gvs)
        self.train_op = optimizer.apply_gradients(gvs)

        ## dixit training op
        if self.cfg.optimizer=='kalpit':
            moms = [optimizer.get_slot(x, 'momentum') for x in tf.trainable_variables()]
            self.direction = [self.cfg.momentum*moms[i] + self.grads[i] for i in range(len(moms))]
            gT_g = tf.add_n([tf.reduce_sum(tf.square(g)) for g in self.grads])
            gT_d = tf.add_n([tf.reduce_sum(tf.multiply(self.grads[i], self.direction[i])) for i in range(len(self.grads))])
            self.gT_d = gT_d
            dT_d = tf.add_n([tf.reduce_sum(tf.square(d)) for d in self.direction])
            #self.lr = self.loss / tf.sqrt(gT_g) / tf.sqrt(dT_d)
            #self.lr = tf.minimum(self.lr, self.max_lr)
            self.lr = (1-self.max_lr)*self.loss/tf.sqrt(gT_g)/tf.sqrt(dT_d)*tf.sign(gT_d) # we want loss-->gamma*loss
            #self.lr = (1-self.max_lr)*self.loss/gT_d # we want loss-->gamma*loss
            self.dixit_train_op = optimizer.apply_gradients(zip([self.lr*d for d in self.direction], vrbs))