def compute_feedforward(Ws, BNva, X, cache, train_mode):
    # Forward pass: dropout -> affine -> batchnorm -> activation per layer.
    # Dropout masks and batchnorm statistics live in `cache` so the backward
    # pass can reuse them.
    dropout_cache, batchnorm_cache = cache
    N = X.shape[0]
    As = []
    Hs = []
    Zs = []
    A = dropout(X, dropout_cache[0], train_mode=train_mode)
    A = add_bias_unit(A)
    if train_mode:
        As.append(A)
    for layer in range(len(Ws)):
        H = A.dot(Ws[layer])
        Z, batchnorm_cache[layer] = batchnorm_forward(H, BNva[layer], batchnorm_cache[layer])
        if layer == len(Ws) - 1:  # last layer: no activation, no dropout
            A = Z
        else:
            A = activation_function(Z)
            A = dropout(A, dropout_cache[layer + 1], train_mode=train_mode)
            A = add_bias_unit(A)
        if train_mode or (layer == len(Ws) - 1):
            Hs.append(copy.deepcopy(H))
            Zs.append(copy.deepcopy(Z))
            As.append(copy.deepcopy(A))
        # if DEBUG:
        #     print layer, ': ', A
    cache = dropout_cache, batchnorm_cache
    if train_mode:
        return As, Hs, Zs, cache
    else:
        # In eval mode only the last layer's output was appended above.
        return As[0]
def __call__(self, inputs, state, scope=None):
    if self._input_keep_prob < 1.0:
        inputs = dropout(inputs, self._input_keep_prob, seed=self._seed)
    output, new_state = self._cell(inputs, state, scope)
    if self._output_keep_prob < 1.0:
        output = dropout(output, self._output_keep_prob, seed=self._seed)
    return output, new_state
def run(self, x, dropout=True):
    if x.ndim > 2:
        # x isn't a matrix, make it one.
        x = x.flatten(2)
    if not hasattr(self, 'dropout'):
        d = 0
    else:
        d = self.dropout
    if not dropout:
        d = 0
    out = self.activation(T.dot(x, self.w) + self.b)
    return drp.dropout(srng, out, d, (self.batch_size, self.hidden_size))
def run(self, x, dropout=True):
    if x.ndim > 2:
        # x isn't a matrix, make it one.
        x = x.flatten(2)
    #d = self.dropout
    if not hasattr(self, 'dropout'):
        d = 0
    else:
        d = self.dropout
    if not dropout:
        d = 0
    out = self.activation(T.dot(x, self.w) + self.b)
    return drp.dropout(srng, out, d, (out.shape[0], self.hidden_size))
def run(self, x, dropout=True):
    if x.ndim > 2:
        x = x.flatten(2)
    if not hasattr(self, 'dropout'):
        d = 0
    else:
        d = self.dropout
    if not dropout:
        d = 0
    out = self.activation(T.dot(x, self.w) + self.b)
    out = drp.dropout(srng, out, d, (out.shape[0], self.classes))
    prob = T.nnet.softmax(out)
    pred = T.argmax(prob, axis=1)
    return prob, pred
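# The layers above call drp.dropout(srng, out, d, shape); the drp module itself
# is not shown here. A minimal sketch of such a helper, assuming d is the drop
# probability and inverted scaling is applied at train time (an assumption, not
# the project's actual implementation):
import theano

def dropout(srng, x, p, shape):
    """Zero units with probability p and rescale survivors by 1/(1-p)."""
    if p <= 0:
        return x
    mask = srng.binomial(size=shape, n=1, p=1.0 - p,
                         dtype=theano.config.floatX)
    return x * mask / (1.0 - p)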
def compute_feedbackward(W, BNva, As, Hs, Zs, Y, caches):
    # Backward pass mirroring compute_feedforward: reuse the cached dropout
    # masks and batchnorm statistics from the forward pass.
    dropout_cache, batchnorm_cache = caches
    sz = Y.shape[0]
    delta = As[-1] - Y
    dz = delta  # no activation function at the last layer
    dh, bn_grad = batchnorm_backward(Hs[-1], BNva[-1], dz, batchnorm_cache[-1])
    grad = As[-2].T.dot(dh) / sz
    w_grads = [grad]
    bn_grads = [bn_grad]
    for l in range(1, len(W)):
        delta = dh.dot(W[-l][1:].T)  # strip the bias row before backpropagating
        delta = dropout(delta, dropout_cache[-l], train_mode=True)
        dz = np.multiply(delta, compute_grad_actfunc(Zs[-l]))
        dh, bn_grad = batchnorm_backward(Hs[-l - 1], BNva[-l - 1], dz, batchnorm_cache[-l - 1])
        bn_grads.append(bn_grad)
        w_grad = As[-l - 2].T.dot(dh) / sz
        w_grads.append(w_grad)
    return w_grads, bn_grads
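# Minimal sketch (not the project's actual `dropout` helper, whose cache layout
# isn't shown): inverted dropout where the mask sampled in the forward pass is
# kept and reused when backpropagating, which is the behaviour that
# compute_feedforward / compute_feedbackward rely on.
import numpy as np

def dropout_forward(A, keep_prob, rng=np.random):
    """Sample a mask once and scale by 1/keep_prob, so eval needs no rescaling."""
    mask = (rng.rand(*A.shape) < keep_prob) / keep_prob
    return A * mask, mask

def dropout_backward(dA, mask):
    """Backprop through the exact mask applied in the forward pass."""
    return dA * mask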
def __init__(self, inp, n_labels, n_hidden_previous, update_fn, training=None, keep_prob=None):
    if isinstance(inp, list):
        self.input = T.concatenate(inp)
        input_size = len(inp) * n_hidden_previous
    else:
        self.input = inp
        input_size = n_hidden_previous
    if training is not None:
        assert keep_prob is not None
        self.input = dropout(self.input, training, keep_prob)
    self.update_fn = update_fn
    # input -> hidden (sized somewhere between size of input & softmax)
    n_hidden = int(math.sqrt(input_size * n_labels))
    print "concat sizing %s -> %s -> %s" % (input_size, n_hidden, n_labels)
    self.Wih = util.sharedMatrix(input_size, n_hidden, 'Wih')
    self.bh = util.shared(util.zeros((1, n_hidden)), 'bh')
    # hidden -> softmax
    self.Whs = util.sharedMatrix(n_hidden, n_labels, 'Whs')
    self.bs = util.shared(util.zeros((1, n_labels)), 'bs')
def create_feedforward_classifier_model(self):
    """
    Creates:
        self.input_images
        self.labels
        self.lr
        self.preds - pre-softmax predictions
        self.loss
        self.accuracy
        self.grads
        self.train_op
        self.change_weights_op
    """
    ## input placeholders
    self.input_images = tf.placeholder(tf.float32, shape=(None, self.cfg.input_dim), name='input_images')
    self.labels = tf.placeholder(tf.int32, shape=(None,), name='labels')
    self.lr = tf.placeholder(tf.float32, shape=(), name='lr')
    self.keep_prob = tf.placeholder(tf.float32, shape=(), name='keep_prob')
    self.use_past_bt = tf.placeholder(tf.bool, shape=(), name='use_past_bt')  # to pass previous dropout mask
    self.h1_past_bt = tf.placeholder(tf.float32, shape=(None, self.cfg.h1_dim), name='h1_past_bt')
    self.h2_past_bt = tf.placeholder(tf.float32, shape=(None, self.cfg.h2_dim), name='h2_past_bt')

    ## forward pass, note how this is pre-softmax
    # NOTE: `dropout` here is expected to return both the dropped activations and
    # the binary mask, so that both tf.cond branches yield the same structure.
    h1 = layers.fully_connected(self.input_images, num_outputs=self.cfg.h1_dim, activation_fn=tf.nn.relu,
                                biases_initializer=layers.initializers.xavier_initializer(), scope='h1')
    h1, self.h1_binary_tensor = tf.cond(self.use_past_bt,
                                        lambda: [math_ops.div(h1, self.cfg.keep_prob) * self.h1_past_bt, self.h1_past_bt],
                                        lambda: dropout(h1, keep_prob=self.cfg.keep_prob))
    # h1, self.h1_binary_tensor = tf.cond(self.use_past_bt,
    #                                     lambda: [math_ops.div(h1, self.keep_prob) * self.h1_past_bt, self.h1_past_bt],
    #                                     lambda: dropout(h1, keep_prob=self.keep_prob))
    h2 = layers.fully_connected(h1, num_outputs=self.cfg.h2_dim, activation_fn=tf.nn.relu,
                                biases_initializer=layers.initializers.xavier_initializer(), scope='h2')
    h2, self.h2_binary_tensor = tf.cond(self.use_past_bt,
                                        lambda: [math_ops.div(h2, self.cfg.keep_prob) * self.h2_past_bt, self.h2_past_bt],
                                        lambda: dropout(h2, keep_prob=self.cfg.keep_prob))
    # h2, self.h2_binary_tensor = tf.cond(self.use_past_bt,
    #                                     lambda: [math_ops.div(h2, self.keep_prob) * self.h2_past_bt, self.h2_past_bt],
    #                                     lambda: dropout(h2, keep_prob=self.keep_prob))
    self.h2 = h2
    self.preds = layers.fully_connected(h2, num_outputs=self.cfg.output_dim, activation_fn=None,
                                        biases_initializer=layers.initializers.xavier_initializer(), scope='preds')

    ## loss and accuracy
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.preds, labels=self.labels)
    self.loss = tf.reduce_mean(loss, name='loss')
    self.accuracy = tf.contrib.metrics.accuracy(labels=self.labels,
                                                predictions=tf.to_int32(tf.argmax(self.preds, axis=1)))

    ## training op
    if self.cfg.optimizer == 'kalpit':
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.lr)  # can set lr every minibatch
    elif self.cfg.optimizer == 'sgd':
        optimizer = tf.train.MomentumOptimizer(learning_rate=self.cfg.learning_rate, momentum=self.cfg.momentum,
                                               use_nesterov=self.cfg.nesterov)
    elif self.cfg.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=self.cfg.learning_rate, beta1=self.cfg.beta1,
                                           beta2=self.cfg.beta2)
    elif self.cfg.optimizer == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.cfg.learning_rate, rho=self.cfg.rho)
    gvs = optimizer.compute_gradients(self.loss, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))
    self.grads, vrbs = zip(*gvs)
    self.train_op = optimizer.apply_gradients(gvs)

    ### op to just apply passed gradients
    self.h1_W_grad = tf.placeholder(tf.float32, shape=(self.cfg.input_dim, self.cfg.h1_dim), name='h1_W_grad')
    self.h1_b_grad = tf.placeholder(tf.float32, shape=(self.cfg.h1_dim,), name='h1_b_grad')
    self.h2_W_grad = tf.placeholder(tf.float32, shape=(self.cfg.h1_dim, self.cfg.h2_dim), name='h2_W_grad')
    self.h2_b_grad = tf.placeholder(tf.float32, shape=(self.cfg.h2_dim,), name='h2_b_grad')
    self.preds_W_grad = tf.placeholder(tf.float32, shape=(self.cfg.h2_dim, self.cfg.output_dim), name='preds_W_grad')
    self.preds_b_grad = tf.placeholder(tf.float32, shape=(self.cfg.output_dim,), name='preds_b_grad')
    passed_grads = [self.h1_W_grad, self.h1_b_grad, self.h2_W_grad, self.h2_b_grad,
                    self.preds_W_grad, self.preds_b_grad]
    passed_gvs = zip(passed_grads, vrbs)
    self.change_weights_op = optimizer.apply_gradients(passed_gvs)

    ## do L2
    target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)[-2:]
    assignments = []
    assignments.append(target_vars[0].assign(self.preds_W_grad))  # abusing use of preds_W_grad
    assignments.append(target_vars[1].assign(self.preds_b_grad))  # abusing use of preds_b_grad
    self.assign_last_layer = tf.group(*assignments)
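# The TensorFlow models in this section unpack dropout(...) into (dropped
# activations, binary mask); stock tf.nn.dropout returns only the activations,
# so the dropout symbol used above is presumably a modified version. A minimal
# sketch of a wrapper with that contract (dropout_with_mask is a hypothetical
# name, not the project's actual helper):
import tensorflow as tf

def dropout_with_mask(x, keep_prob):
    """Inverted dropout that also returns the sampled 0/1 mask."""
    binary_mask = tf.floor(tf.random_uniform(tf.shape(x)) + keep_prob)
    dropped = tf.div(x, keep_prob) * binary_mask
    # Return a list so it matches the [tensor, tensor] structure of the other
    # tf.cond branch in the models above.
    return [dropped, binary_mask]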
def create_convnet_classifier_model(self):
    """
    Creates:
        self.input_images
        self.labels
        self.lr
        self.preds - pre-softmax predictions
        self.loss
        self.accuracy
        self.grads
        self.train_op
        self.change_weights_op
    """
    ## input placeholders
    self.input_images = tf.placeholder(tf.float32,
                                       shape=(None, self.cfg.input_height, self.cfg.input_width, self.cfg.input_nchannels),
                                       name='input_images')
    self.labels = tf.placeholder(tf.int32, shape=(None,), name='labels')
    self.lr = tf.placeholder(tf.float32, shape=(), name='lr')
    self.use_past_bt = tf.placeholder(tf.bool, shape=(), name='use_past_bt')
    self.input_past_bt = tf.placeholder(tf.float32,
                                        shape=(None, self.cfg.input_height, self.cfg.input_width, self.cfg.input_nchannels),
                                        name='input_past_bt')  # past binary tensor
    self.fc4_past_bt = tf.placeholder(tf.float32, shape=(None, 1000), name='fc4_past_bt')  # past binary tensor

    ## forward pass, note how this is pre-softmax
    dropout_input_images, self.input_binary_tensor = tf.cond(self.use_past_bt,
        lambda: [math_ops.div(self.input_images, self.cfg.keep_prob) * self.input_past_bt, self.input_past_bt],
        lambda: dropout(self.input_images, keep_prob=self.cfg.keep_prob))
    conv1 = layers.convolution2d(dropout_input_images, num_outputs=64, kernel_size=(5, 5), stride=(1, 1), padding='SAME',
                                 biases_initializer=layers.initializers.xavier_initializer(),
                                 activation_fn=tf.nn.relu, scope='conv1')
    pool1 = tf.nn.max_pool(conv1, ksize=(1, 3, 3, 1), strides=(1, 2, 2, 1), padding='SAME')
    conv2 = layers.convolution2d(pool1, num_outputs=64, kernel_size=(5, 5), stride=(1, 1), padding='SAME',
                                 biases_initializer=layers.initializers.xavier_initializer(),
                                 activation_fn=tf.nn.relu, scope='conv2')
    pool2 = tf.nn.max_pool(conv2, ksize=(1, 3, 3, 1), strides=(1, 2, 2, 1), padding='SAME')
    conv3 = layers.convolution2d(pool2, num_outputs=128, kernel_size=(5, 5), stride=(1, 1), padding='SAME',
                                 biases_initializer=layers.initializers.xavier_initializer(),
                                 activation_fn=tf.nn.relu, scope='conv3')
    pool3 = tf.nn.max_pool(conv3, ksize=(1, 3, 3, 1), strides=(1, 2, 2, 1), padding='SAME')
    pool3_flat = layers.flatten(pool3)
    fc4 = layers.fully_connected(pool3_flat, num_outputs=1000, activation_fn=tf.nn.relu,
                                 biases_initializer=layers.initializers.xavier_initializer(), scope='fc4')
    fc4, self.fc4_binary_tensor = tf.cond(self.use_past_bt,
        lambda: [math_ops.div(fc4, self.cfg.keep_prob) * self.fc4_past_bt, self.fc4_past_bt],
        lambda: dropout(fc4, keep_prob=self.cfg.keep_prob))
    fc5 = layers.fully_connected(fc4, num_outputs=self.cfg.output_dim, activation_fn=None,
                                 biases_initializer=layers.initializers.xavier_initializer(), scope='fc5')
    self.preds = fc5

    ## loss and accuracy
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.preds, labels=self.labels)
    self.loss = tf.reduce_mean(loss, name='loss')
    self.accuracy = tf.contrib.metrics.accuracy(labels=self.labels,
                                                predictions=tf.to_int32(tf.argmax(self.preds, axis=1)))

    ## training op
    if self.cfg.optimizer == 'kalpit':
        # optimizer = tf.train.MomentumOptimizer(learning_rate=self.lr, momentum=self.cfg.momentum,
        #                                        use_nesterov=self.cfg.nesterov)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.lr)  # can set lr every minibatch
    elif self.cfg.optimizer == 'sgd':
        optimizer = tf.train.MomentumOptimizer(learning_rate=self.cfg.learning_rate, momentum=self.cfg.momentum,
                                               use_nesterov=self.cfg.nesterov)
    elif self.cfg.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=self.cfg.learning_rate, beta1=self.cfg.beta1,
                                           beta2=self.cfg.beta2, epsilon=self.cfg.epsilon)
    elif self.cfg.optimizer == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.cfg.learning_rate, rho=self.cfg.rho)
    gvs = optimizer.compute_gradients(self.loss, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))
    self.grads, vrbs = zip(*gvs)
    self.train_op = optimizer.apply_gradients(gvs)

    ### op to just apply passed gradients
    self.conv1_W_grad = tf.placeholder(tf.float32, shape=(5, 5, self.cfg.input_nchannels, 64), name='conv1_W_grad')
    self.conv1_b_grad = tf.placeholder(tf.float32, shape=(64,), name='conv1_b_grad')
    self.conv2_W_grad = tf.placeholder(tf.float32, shape=(5, 5, 64, 64), name='conv2_W_grad')
    self.conv2_b_grad = tf.placeholder(tf.float32, shape=(64,), name='conv2_b_grad')
    self.conv3_W_grad = tf.placeholder(tf.float32, shape=(5, 5, 64, 128), name='conv3_W_grad')
    self.conv3_b_grad = tf.placeholder(tf.float32, shape=(128,), name='conv3_b_grad')
    self.fc4_W_grad = tf.placeholder(tf.float32, shape=(2048, 1000), name='fc4_W_grad')
    self.fc4_b_grad = tf.placeholder(tf.float32, shape=(1000,), name='fc4_b_grad')
    self.fc5_W_grad = tf.placeholder(tf.float32, shape=(1000, self.cfg.output_dim), name='fc5_W_grad')
    self.fc5_b_grad = tf.placeholder(tf.float32, shape=(self.cfg.output_dim,), name='fc5_b_grad')
    passed_grads = [self.conv1_W_grad, self.conv1_b_grad, self.conv2_W_grad, self.conv2_b_grad,
                    self.conv3_W_grad, self.conv3_b_grad, self.fc4_W_grad, self.fc4_b_grad,
                    self.fc5_W_grad, self.fc5_b_grad]
    passed_gvs = zip(passed_grads, vrbs)
    self.change_weights_op = optimizer.apply_gradients(passed_gvs)
def create_feedforward_classifier_model(self):
    """
    Creates:
        self.input_images
        self.labels
        self.lr
        self.preds - pre-softmax predictions
        self.loss
        self.accuracy
        self.grads
        self.train_op
        self.change_weights_op
    """
    ## input placeholders
    self.input_images = tf.placeholder(tf.float32, shape=(None, self.cfg.input_dim), name='input_images')
    self.labels = tf.placeholder(tf.int32, shape=(None,), name='labels')
    self.lr = tf.placeholder(tf.float32, shape=(), name='lr')
    self.max_lr = tf.placeholder(tf.float32, shape=(), name='max_lr')
    self.keep_prob = tf.placeholder(tf.float32, shape=(), name='keep_prob')
    self.use_past_bt = tf.placeholder(tf.bool, shape=(), name='use_past_bt')  # to pass previous dropout mask
    self.h1_past_bt = tf.placeholder(tf.float32, shape=(None, self.cfg.h1_dim), name='h1_past_bt')
    self.h2_past_bt = tf.placeholder(tf.float32, shape=(None, self.cfg.h2_dim), name='h2_past_bt')

    ## forward pass, note how this is pre-softmax
    h1 = layers.fully_connected(self.input_images, num_outputs=self.cfg.h1_dim, activation_fn=tf.nn.relu,
                                biases_initializer=layers.initializers.xavier_initializer(), scope='h1')
    h1, self.h1_binary_tensor = tf.cond(self.use_past_bt,
                                        lambda: [math_ops.div(h1, self.keep_prob) * self.h1_past_bt, self.h1_past_bt],
                                        lambda: dropout(h1, keep_prob=self.keep_prob))
    h2 = layers.fully_connected(h1, num_outputs=self.cfg.h2_dim, activation_fn=tf.nn.relu,
                                biases_initializer=layers.initializers.xavier_initializer(), scope='h2')
    h2, self.h2_binary_tensor = tf.cond(self.use_past_bt,
                                        lambda: [math_ops.div(h2, self.keep_prob) * self.h2_past_bt, self.h2_past_bt],
                                        lambda: dropout(h2, keep_prob=self.keep_prob))
    self.h2 = h2
    self.preds = layers.fully_connected(h2, num_outputs=self.cfg.output_dim, activation_fn=None,
                                        biases_initializer=layers.initializers.xavier_initializer(), scope='preds')

    ## loss and accuracy
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.preds, labels=self.labels)
    self.loss = tf.reduce_mean(loss, name='loss')
    self.accuracy = tf.contrib.metrics.accuracy(labels=self.labels,
                                                predictions=tf.to_int32(tf.argmax(self.preds, axis=1)))

    ## training op
    if self.cfg.optimizer == 'kalpit':
        optimizer = tf.train.MomentumOptimizer(learning_rate=1.0, momentum=self.cfg.momentum,
                                               use_nesterov=False)  # can set lr every minibatch
        self.initialize_directions()
    elif self.cfg.optimizer == 'sgd':
        optimizer = tf.train.MomentumOptimizer(learning_rate=self.cfg.learning_rate, momentum=self.cfg.momentum,
                                               use_nesterov=self.cfg.nesterov)
    elif self.cfg.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=self.cfg.learning_rate, beta1=self.cfg.beta1,
                                           beta2=self.cfg.beta2)
    elif self.cfg.optimizer == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.cfg.learning_rate, rho=self.cfg.rho)
    gvs = optimizer.compute_gradients(self.loss, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))
    self.grads, vrbs = zip(*gvs)
    self.train_op = optimizer.apply_gradients(gvs)

    ## dixit training op
    if self.cfg.optimizer == 'kalpit':
        moms = [optimizer.get_slot(x, 'momentum') for x in tf.trainable_variables()]
        self.direction = [self.cfg.momentum * moms[i] + self.grads[i] for i in range(len(moms))]
        gT_g = tf.add_n([tf.reduce_sum(tf.square(g)) for g in self.grads])
        gT_d = tf.add_n([tf.reduce_sum(tf.multiply(self.grads[i], self.direction[i])) for i in range(len(self.grads))])
        self.gT_d = gT_d
        dT_d = tf.add_n([tf.reduce_sum(tf.square(d)) for d in self.direction])
        # self.lr = self.loss / tf.sqrt(gT_g) / tf.sqrt(dT_d)
        # self.lr = tf.minimum(self.lr, self.max_lr)
        # note: this overwrites the self.lr placeholder defined above with a computed tensor
        self.lr = (1 - self.max_lr) * self.loss / tf.sqrt(gT_g) / tf.sqrt(dT_d) * tf.sign(gT_d)  # we want loss --> gamma*loss
        # self.lr = (1 - self.max_lr) * self.loss / gT_d  # we want loss --> gamma*loss
        self.dixit_train_op = optimizer.apply_gradients(zip([self.lr * d for d in self.direction], vrbs))
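# Hedged usage sketch for the feedforward model above; `sess`, `model`,
# `batch_x`, and `batch_y` are assumed to exist and are not defined here.
# The first run samples fresh dropout masks and fetches them; the second run
# replays exactly those masks by feeding them back with use_past_bt=True.
import numpy as np

all_kept_h1 = np.ones((batch_x.shape[0], model.cfg.h1_dim), dtype=np.float32)
all_kept_h2 = np.ones((batch_x.shape[0], model.cfg.h2_dim), dtype=np.float32)
feed = {model.input_images: batch_x, model.labels: batch_y,
        model.keep_prob: 0.5, model.use_past_bt: False,
        model.h1_past_bt: all_kept_h1, model.h2_past_bt: all_kept_h2}
loss1, h1_mask, h2_mask = sess.run(
    [model.loss, model.h1_binary_tensor, model.h2_binary_tensor], feed)

feed.update({model.use_past_bt: True,
             model.h1_past_bt: h1_mask, model.h2_past_bt: h2_mask})
loss2 = sess.run(model.loss, feed)  # same minibatch evaluated under the same masks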
def create_convnet_classifier_model(self):
    """
    Creates:
        self.input_images
        self.labels
        self.lr
        self.preds - pre-softmax predictions
        self.loss
        self.accuracy
        self.grads
        self.train_op
        self.change_weights_op
    """
    ## input placeholders
    self.input_images = tf.placeholder(tf.float32,
                                       shape=(None, self.cfg.input_height, self.cfg.input_width, self.cfg.input_nchannels),
                                       name='input_images')
    self.labels = tf.placeholder(tf.int32, shape=(None,), name='labels')
    self.lr = tf.placeholder(tf.float32, shape=(), name='lr')
    self.max_lr = tf.placeholder(tf.float32, shape=(), name='max_lr')
    self.keep_prob = tf.placeholder(tf.float32, shape=(), name='keep_prob')
    self.use_past_bt = tf.placeholder(tf.bool, shape=(), name='use_past_bt')
    self.input_past_bt = tf.placeholder(tf.float32,
                                        shape=(None, self.cfg.input_height, self.cfg.input_width, self.cfg.input_nchannels),
                                        name='input_past_bt')  # past binary tensor
    self.fc4_past_bt = tf.placeholder(tf.float32, shape=(None, 1000), name='fc4_past_bt')  # past binary tensor

    ## forward pass, note how this is pre-softmax
    dropout_input_images, self.input_binary_tensor = tf.cond(self.use_past_bt,
        lambda: [math_ops.div(self.input_images, self.keep_prob) * self.input_past_bt, self.input_past_bt],
        lambda: dropout(self.input_images, keep_prob=self.keep_prob))
    conv1 = layers.convolution2d(dropout_input_images, num_outputs=64, kernel_size=(5, 5), stride=(1, 1), padding='SAME',
                                 biases_initializer=layers.initializers.xavier_initializer(),
                                 activation_fn=tf.nn.relu, scope='conv1')
    pool1 = tf.nn.max_pool(conv1, ksize=(1, 3, 3, 1), strides=(1, 2, 2, 1), padding='SAME')
    conv2 = layers.convolution2d(pool1, num_outputs=64, kernel_size=(5, 5), stride=(1, 1), padding='SAME',
                                 biases_initializer=layers.initializers.xavier_initializer(),
                                 activation_fn=tf.nn.relu, scope='conv2')
    pool2 = tf.nn.max_pool(conv2, ksize=(1, 3, 3, 1), strides=(1, 2, 2, 1), padding='SAME')
    conv3 = layers.convolution2d(pool2, num_outputs=128, kernel_size=(5, 5), stride=(1, 1), padding='SAME',
                                 biases_initializer=layers.initializers.xavier_initializer(),
                                 activation_fn=tf.nn.relu, scope='conv3')
    pool3 = tf.nn.max_pool(conv3, ksize=(1, 3, 3, 1), strides=(1, 2, 2, 1), padding='SAME')
    pool3_flat = layers.flatten(pool3)
    fc4 = layers.fully_connected(pool3_flat, num_outputs=1000, activation_fn=tf.nn.relu,
                                 biases_initializer=layers.initializers.xavier_initializer(), scope='fc4')
    fc4, self.fc4_binary_tensor = tf.cond(self.use_past_bt,
        lambda: [math_ops.div(fc4, self.keep_prob) * self.fc4_past_bt, self.fc4_past_bt],
        lambda: dropout(fc4, keep_prob=self.keep_prob))
    fc5 = layers.fully_connected(fc4, num_outputs=self.cfg.output_dim, activation_fn=None,
                                 biases_initializer=layers.initializers.xavier_initializer(), scope='fc5')
    self.preds = fc5

    ## loss and accuracy
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.preds, labels=self.labels)
    self.loss = tf.reduce_mean(loss, name='loss')
    self.accuracy = tf.contrib.metrics.accuracy(labels=self.labels,
                                                predictions=tf.to_int32(tf.argmax(self.preds, axis=1)))

    ## training op
    if self.cfg.optimizer == 'kalpit':
        optimizer = tf.train.MomentumOptimizer(learning_rate=1.0, momentum=self.cfg.momentum,
                                               use_nesterov=False)  # can set lr every minibatch
    elif self.cfg.optimizer == 'sgd':
        optimizer = tf.train.MomentumOptimizer(learning_rate=self.cfg.learning_rate, momentum=self.cfg.momentum,
                                               use_nesterov=self.cfg.nesterov)
    elif self.cfg.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=self.cfg.learning_rate, beta1=self.cfg.beta1,
                                           beta2=self.cfg.beta2, epsilon=self.cfg.epsilon)
    elif self.cfg.optimizer == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.cfg.learning_rate, rho=self.cfg.rho)
    gvs = optimizer.compute_gradients(self.loss, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))
    self.grads, vrbs = zip(*gvs)
    self.train_op = optimizer.apply_gradients(gvs)

    ## dixit training op
    if self.cfg.optimizer == 'kalpit':
        moms = [optimizer.get_slot(x, 'momentum') for x in tf.trainable_variables()]
        self.direction = [self.cfg.momentum * moms[i] + self.grads[i] for i in range(len(moms))]
        gT_g = tf.add_n([tf.reduce_sum(tf.square(g)) for g in self.grads])
        gT_d = tf.add_n([tf.reduce_sum(tf.multiply(self.grads[i], self.direction[i])) for i in range(len(self.grads))])
        self.gT_d = gT_d
        dT_d = tf.add_n([tf.reduce_sum(tf.square(d)) for d in self.direction])
        # self.lr = self.loss / tf.sqrt(gT_g) / tf.sqrt(dT_d)
        # self.lr = tf.minimum(self.lr, self.max_lr)
        # note: this overwrites the self.lr placeholder defined above with a computed tensor
        self.lr = (1 - self.max_lr) * self.loss / tf.sqrt(gT_g) / tf.sqrt(dT_d) * tf.sign(gT_d)  # we want loss --> gamma*loss
        # self.lr = (1 - self.max_lr) * self.loss / gT_d  # we want loss --> gamma*loss
        self.dixit_train_op = optimizer.apply_gradients(zip([self.lr * d for d in self.direction], vrbs))