def mgpu_train(*xs):
    """Build the data-parallel training graph across `n_gpu` devices.

    Each tensor in *xs is split along axis 0 into one shard per GPU; every
    shard gets its own model tower (weights shared via variable-scope
    reuse).  Per-tower gradients are averaged and handed to the optimizer.

    Returns:
        [train_op, clf_logits, clf_losses, lm_losses] with the three
        outputs concatenated over towers along the batch axis.
    """
    tower_outputs = []
    tower_grads = []
    shards = (tf.split(x, n_gpu, 0) for x in xs)
    for tower_idx, tower_inputs in enumerate(zip(*shards)):
        # The first tower creates the variables; later towers reuse them.
        reuse_flag = True if tower_idx > 0 else None
        with tf.device(assign_to_gpu(tower_idx, "/gpu:0")), \
                tf.variable_scope(tf.get_variable_scope(), reuse=reuse_flag):
            clf_logits, clf_losses, lm_losses = model(*tower_inputs,
                                                      train=True,
                                                      reuse=reuse_flag)
            if lm_coef > 0:
                # Joint objective: classification loss + weighted LM loss.
                train_loss = tf.reduce_mean(clf_losses) + lm_coef*tf.reduce_mean(lm_losses)
            else:
                train_loss = tf.reduce_mean(clf_losses)
            params = find_trainable_variables("model")
            grads = tf.gradients(train_loss, params)
            grads = list(zip(grads, params))
            tower_grads.append(grads)
            tower_outputs.append([clf_logits, clf_losses, lm_losses])
    # Stitch per-tower outputs back together along the batch axis.
    ops = [tf.concat(op, 0) for op in zip(*tower_outputs)]
    # Average (grad, var) pairs across towers, then keep the grads only.
    grads = [g for g, p in average_grads(tower_grads)]
    train = opt_fns[opt](params, grads, lr,
                         partial(lr_schedules[lr_schedule], warmup=lr_warmup),
                         n_updates_total,
                         l2=l2,
                         max_grad_norm=max_grad_norm,
                         vector_l2=vector_l2,
                         b1=b1,
                         b2=b2,
                         e=e)
    return [train] + ops
def mgpu_train(self, *xs):
    """Build the multi-GPU training graph.

    Splits every tensor in *xs along the batch axis into
    ``self.params.n_gpu`` shards, builds one model replica per GPU
    (weights shared through variable-scope reuse), averages the per-GPU
    gradients, and wires the optimizer plus optional gradient
    accumulation ops.

    Returns:
        [train_op, accum_ops, zero_ops] + ops, where ``ops`` are the
        per-head outputs concatenated over GPUs.  ``accum_ops`` and
        ``zero_ops`` are None unless gradient accumulation is enabled.
    """
    gpu_ops = []
    gpu_grads = []
    tvars = None
    # split input data into number of gpus (4 for Fab, 2 for me, 1 on the computer)
    xs = (tf.split(x, self.params.n_gpu, 0) for x in xs)
    for i, xs in enumerate(zip(*xs)):
        # reuse: the first tower (i == 0) creates the variables; later
        # towers share them via a reusing scope (else tf raises).
        do_reuse = True if i > 0 else None
        with tf.device(utils.assign_to_gpu(
                i, "/gpu:0")), tf.variable_scope(tf.get_variable_scope(),
                                                 reuse=do_reuse):
            # logits: last-layer outputs; losses: per-example costs.
            # For train=True these ops also feed the gradient computation
            # (unlike mgpu_predict, which only evaluates them).
            clf_logits, lm_logits, clf_losses, lm_losses = self.model(
                *xs, train=True, reuse=do_reuse)
            if self.params.head_type == "clf":
                if self.params.lm_coef > 0:
                    # joint loss if the clf task also trains the LM head
                    train_loss = tf.reduce_mean(
                        clf_losses
                    ) + self.params.lm_coef * tf.reduce_mean(lm_losses)
                    tf.summary.scalar('Multi-task Clf-Lm Loss average',
                                      train_loss)
                else:
                    train_loss = tf.reduce_mean(clf_losses)
                    tf.summary.scalar('Clf Loss average', train_loss)
            elif self.params.head_type == "lm":
                train_loss = tf.reduce_mean(lm_losses)
                tf.summary.scalar('Lm Loss average', train_loss)
            else:
                raise ValueError(
                    "{} is not a valid parameter for head_type!".format(
                        self.params.head_type))
            tvars = utils.find_trainable_variables("model")
            grads = tf.gradients(train_loss, tvars)
            # pair each gradient with the variable it updates
            grads = list(zip(grads, tvars))
            gpu_grads.append(grads)  # (grad, var) pairs from this gpu
            if self.params.head_type == "clf":
                gpu_ops.append([clf_logits, clf_losses, lm_losses])
            elif self.params.head_type == "lm":
                gpu_ops.append([lm_losses])  # just the loss outputs if lm
            else:
                raise ValueError(
                    "{} is not a valid parameter for head_type!".format(
                        self.params.head_type))
    # concatenate the per-gpu outputs back along the batch axis
    ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
    # average of the grads from each gpu, paired with their variables
    grads = utils.average_grads(gpu_grads)
    if self.params.gradient_accumulation:
        # Accumulate gradients and apply the update only after a threshold
        # (the threshold condition lives in the train loop in train.py).
        tvars = utils.find_trainable_variables("model")
        accum_tvars = [
            tf.Variable(tf.zeros_like(tv.initialized_value()),
                        trainable=False) for tv in tvars
        ]
        # zero_ops: reset accumulators; accum_ops: add current grads in
        zero_ops = [tv.assign(tf.zeros_like(tv)) for tv in accum_tvars]
        accum_ops = [
            accum_tvars[i].assign_add(grad[0])
            for i, grad in enumerate(grads)
        ]
        grads = accum_tvars
    else:
        zero_ops = None
        accum_ops = None
        grads = [g for g, p in grads]  # keep gradients only, drop vars
    # Perform optimization (rocstories: params.opt == 'adam'); partial()
    # binds warmup so the schedule can decay the lr over training.
    train = OPT_FNS[self.params.opt](
        tvars, grads, self.params.lr,
        partial(LR_SCHEDULES[self.params.lr_schedule],
                warmup=self.params.lr_warmup),
        self.params.n_updates_total,
        l2=self.params.l2,
        max_grad_norm=self.params.max_grad_norm,
        vector_l2=self.params.vector_l2,
        b1=self.params.b1,
        b2=self.params.b2,
        e=self.params.e)
    # Tensorboard.  FIX: the original passed tf.Session().graph, which
    # constructs (and leaks) a throwaway session only to read the default
    # graph; tf.get_default_graph() yields the same graph with no session.
    self.merged = tf.summary.merge_all()
    self.writer = tf.summary.FileWriter(self.logdir, tf.get_default_graph())
    return [train, accum_ops, zero_ops] + ops
def ccc_train(self):
    """Distributed (between-graph replication) LM training entry point.

    Resolves cluster membership via a bootstrap service, then runs this
    node either as a parameter server -- blocking until every worker
    enqueues a "done" token into its queue -- or as a worker that builds
    the input pipeline and multi-GPU training graph and drives a
    MonitoredTrainingSession with periodic validation.

    NOTE(review): relies on module-level configuration (bootstrap_host,
    bootstrap_port, num_ps, n_gpu, lr, b1, b2, e, max_grad_norm, n_ctx,
    n_iter, tfrecord_filename, save_dir, valid_dir, steps_to_validate)
    -- confirm these are defined at module scope.
    """
    # Resolve hostnames and ports of other nodes
    host, hosts = client(bootstrap_host, bootstrap_port)
    # Create a cluster and identify the job name and task of this node;
    # by convention the first num_ps hosts act as parameter servers.
    cluster = tf.train.ClusterSpec({
        'ps': hosts[:num_ps],
        'worker': hosts[num_ps:]
    })
    task = hosts.index(host)
    job_name = ('ps', 'worker')[task >= num_ps]
    # re-index the task number within its own job
    task = cluster.job_tasks(job_name).index(host)
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    server = tf.train.Server(cluster,
                             job_name=job_name,
                             task_index=task,
                             config=tf_config)
    if job_name == 'ps':
        # create a shared queue on the parameter server which is visible on /job:ps/task:%d
        with tf.device('/job:ps/task:%d' % task):
            queue = tf.FIFOQueue(cluster.num_tasks('worker'),
                                 tf.int32,
                                 shared_name='done_queue%d' % task)
        # wait for the queue to be filled: one dequeue per worker, then quit
        with tf.Session(server.target) as sess:
            for i in range(cluster.num_tasks('worker')):
                sess.run(queue.dequeue())
                print('ps:%d received "done" from worker:%d' % (task, i))
            print('ps:%d quitting' % task)
    elif job_name == 'worker':
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device='/job:worker/task:%d' % task,
                    cluster=cluster)):
            global_step = tf.train.get_or_create_global_step()
            # input pipeline: padded batches of token ids from TFRecords
            sentences = self.batched_data(
                tfrecord_filename,
                self.single_example_parser,
                self.n_batch_train,
                padded_shapes=tf.Dimension(n_ctx),
                num_epochs=n_iter)
            sentences = tf.cast(sentences, tf.int32)
            max_len = tf.shape(sentences)[1]  #sentences.get_shape()[1]
            xmb = tf.reshape(sentences,
                             [self.n_batch_train, 1, max_len, 1])
            # mask is 1.0 on real tokens, 0.0 on padding (sign of the ids)
            M_train = tf.cast(
                tf.reshape(tf.sign(xmb), [self.n_batch_train, 1, max_len]),
                tf.float32)
            # position ids start right after vocab + special tokens
            positions = tf.reshape(tf.range(
                self.n_vocab + self.n_special,
                self.n_vocab + self.n_special + max_len),
                                   shape=[1, 1, max_len, 1])  #tf.constant(np.arange(self.n_vocab + self.n_special, self.n_vocab + self.n_special + max_len),shape=[1, 1, max_len, 1])
            positions = tf.tile(positions, [self.n_batch_train, 1, 1, 1])
            # last axis carries [token_id, position_id]
            X_train = tf.concat([xmb, positions], axis=3)
            optimizer = tf.train.AdamOptimizer(learning_rate=lr,
                                               beta1=b1,
                                               beta2=b2,
                                               epsilon=e)
            gpu_grads = []
            gpu_loss = []
            gpu_ppl = []
            xs = [X_train, M_train]
            xs = (tf.split(x, n_gpu, 0) for x in xs)
            for i, xs in enumerate(zip(*xs)):
                # first tower creates the variables, later ones reuse them
                do_reuse = True if i > 0 else None
                with tf.device(assign_to_gpu(i)), tf.variable_scope(
                        tf.get_variable_scope(), reuse=do_reuse):
                    lm_losses = self.model(*xs, train=True, num_ps=num_ps)
                    # per-tower perplexity (e**loss) and mean loss
                    train_ppl_single = tf.reduce_mean(math.e**lm_losses)
                    train_loss_single = tf.reduce_mean(lm_losses)
                    gpu_loss.append(train_loss_single)
                    gpu_ppl.append(train_ppl_single)
                    # NOTE(review): this shadows the optimizer created
                    # above the loop; apply_gradients below ends up using
                    # the instance from the *last* tower -- confirm this
                    # re-creation per tower is intentional.
                    optimizer = tf.train.AdamOptimizer(learning_rate=lr,
                                                       beta1=b1,
                                                       beta2=b2,
                                                       epsilon=e)
                    raw_grads_and_vars = optimizer.compute_gradients(
                        train_loss_single)
                    # clip each gradient by global norm before averaging
                    grads_and_vars = [
                        (tf.clip_by_global_norm([gv[0]],
                                                max_grad_norm)[0][0],
                         gv[1]) for gv in raw_grads_and_vars
                    ]
                    gpu_grads.append(grads_and_vars)
            train_ppl = tf.reduce_mean(gpu_ppl)
            train_loss = tf.reduce_mean(gpu_loss)
            grads = average_grads(gpu_grads)
            train_op = optimizer.apply_gradients(grads,
                                                 global_step=global_step)
            saver = tf.train.Saver(max_to_keep=5)
            # validation graph shares weights with the training towers
            X = tf.placeholder(tf.int32, [None, 1, n_ctx, 2])
            M = tf.placeholder(tf.float32, [None, 1, n_ctx])
            valid_lm_losses = self.model(X, M, train=False, reuse=True)
            valid_ppl = tf.reduce_mean(math.e**valid_lm_losses)
            valid_loss = tf.reduce_mean(valid_lm_losses)
            self.params = find_trainable_variables('model_lm')
            tf.summary.scalar('train_loss', train_loss)
            #tf.summary.scalar('valid_loss', valid_loss)
            tf.summary.scalar('train_ppl', train_ppl)
            #tf.summary.scalar('valid_ppl', valid_ppl)
            summary_op = tf.summary.merge_all()
            done_ops = []
            # create a shared queue on the worker which is visible on /job:ps/task:%d
            for i in range(cluster.num_tasks('ps')):
                with tf.device('/job:ps/task:%d' % i):
                    with tf.name_scope('done_queue'):
                        done_queue = tf.FIFOQueue(
                            cluster.num_tasks('worker'),
                            tf.int32,
                            shared_name='done_queue' + str(i))
                        # enqueued at session end to tell each ps to exit
                        done_ops.append(done_queue.enqueue(task))
            scaffold = tf.train.Scaffold(saver=saver)
            summary_hook = tf.train.SummarySaverHook(save_steps=1000,
                                                     output_dir=save_dir,
                                                     summary_op=summary_op)
            hooks = [
                summary_hook,
                # tf.train.CheckpointSaverHook(save_secs=600, checkpoint_dir=save_dir, saver=saver),
                tf.train.StopAtStepHook(last_step=1000000),
                tf.train.LoggingTensorHook(
                    {
                        'step': global_step,
                        'train_loss': train_loss,
                        'ppl': train_ppl
                    }, every_n_iter=100),
                tf.train.FinalOpsHook([done_ops])
            ]
            valid_data = pre_train_valid(valid_dir)
            vaX1 = encode_dataset(self.text_encoder,
                                  pre_train(valid_data))[0]
            vaX, vaM = self.transform_roc(vaX1)
            with tf.train.MonitoredTrainingSession(
                    master=server.target,
                    is_chief=(task == 0),
                    hooks=hooks,
                    save_checkpoint_secs=600,
                    checkpoint_dir=save_dir,
                    scaffold=scaffold) as sess:
                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(sess=sess,
                                                       coord=coord)
                try:
                    while not coord.should_stop():
                        ppl, loss, _, step = sess.run([
                            train_ppl, train_loss, train_op, global_step
                        ])  #,options=run_options, run_metadata=run_metadata)
                        if step % steps_to_validate == 0:
                            va_cost = []
                            va_ppl = []
                            for xm, mm in iter_data(
                                (vaX, vaM),
                                    n_batch=self.n_batch_train,
                                    truncate=False,
                                    verbose=True):
                                # NOTE(review): params are dumped once per
                                # validation *batch*; once per validation
                                # pass would likely suffice -- confirm.
                                ps = sess.run(self.params)
                                joblib.dump(ps,
                                            save_dir + 'model_lm.params',
                                            protocol=2)
                                res, ppl = sess.run(
                                    [valid_loss, valid_ppl], {
                                        X: xm,
                                        M: mm
                                    })
                                va_cost.append(np.sum(res))
                                va_ppl.append(np.sum(ppl))
                            va_cost = np.average(va_cost)
                            va_ppl = np.average(va_ppl)
                            tf.logging.info(
                                '=========n_steps:\t%d valid_cost:\t%.3f valid ppl:\t%.3f=========='
                                % (step, va_cost, va_ppl))
                except tf.errors.OutOfRangeError:
                    print('Epochs Complete!')
                finally:
                    coord.request_stop()
                    coord.join(threads)
def train(self):
    """Build the classification fine-tuning graph and run training.

    Creates placeholders, a multi-GPU training graph with per-tower
    gradient clipping and averaged gradients, plus single/multi-GPU
    evaluation heads; loads and splits the ATEC data, optionally
    warm-starts from pretrained parameter shards, then runs the epoch
    loop, logging metrics and checkpointing on best validation F1.

    NOTE(review): depends on module-level config (n_ctx, n_gpu, lr, b1,
    b2, e, lm_coef, max_grad_norm, data_dir, save_dir, desc, n_iter,
    n_embd, pre_load, new_model) and on self.sess being created by
    self.build_graph() -- confirm against the class definition.
    """
    global_step = tf.train.get_or_create_global_step()
    # training placeholders are statically batched so they can be split
    # across GPUs; X/M/Y accept arbitrary batch sizes for evaluation
    X_train = tf.placeholder(tf.int32, [self.n_batch_train, 2, n_ctx, 2])
    M_train = tf.placeholder(tf.float32, [self.n_batch_train, 2, n_ctx])
    X = tf.placeholder(tf.int32, [None, 2, n_ctx, 2])
    M = tf.placeholder(tf.float32, [None, 2, n_ctx])
    Y_train = tf.placeholder(tf.int32, [self.n_batch_train])
    Y = tf.placeholder(tf.int32, [None])
    xs = [X_train, M_train, Y_train]
    gpu_ops = []
    gpu_grads = []
    xs = (tf.split(x, n_gpu, 0) for x in xs)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr,
                                       beta1=b1,
                                       beta2=b2,
                                       epsilon=e)
    for i, xs in enumerate(zip(*xs)):
        # first tower creates the variables, later towers reuse them
        do_reuse = True if i > 0 else None
        with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(
                tf.get_variable_scope(), reuse=do_reuse):
            logits, clf_losses, lm_losses = self.model(*xs,
                                                       train=True,
                                                       reuse=do_reuse)
            if lm_coef > 0:
                # joint loss: classification plus weighted LM auxiliary
                train_loss = tf.reduce_mean(
                    clf_losses) + lm_coef * tf.reduce_mean(lm_losses)
            else:
                train_loss = tf.reduce_mean(clf_losses)
            raw_grads_and_vars = optimizer.compute_gradients(train_loss)
            # clip each gradient by global norm before averaging
            grads_and_vars = [(tf.clip_by_global_norm(
                [gv[0]], max_grad_norm)[0][0], gv[1])
                              for gv in raw_grads_and_vars]
            gpu_grads.append(grads_and_vars)
            gpu_ops.append([logits, clf_losses, lm_losses])
    # concatenate per-gpu outputs back along the batch axis
    ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
    logits, clf_losses, lm_losses = ops
    grads = average_grads(gpu_grads)
    train_op = optimizer.apply_gradients(grads, global_step=global_step)
    clf_loss = tf.reduce_mean(clf_losses)
    saver = tf.train.Saver(max_to_keep=5)
    self.params = find_trainable_variables('model_lm')
    self.eval_mgpu_logits, self.eval_mgpu_clf_losses, self.eval_mgpu_lm_losses = self.mgpu_predict(
        X_train, M_train, Y_train)
    self.eval_logits, self.eval_clf_losses, self.eval_lm_losses = self.model(
        X, M, Y, train=False, reuse=True)
    self.eval_clf_loss = tf.reduce_mean(self.eval_clf_losses)
    self.eval_mgpu_clf_loss = tf.reduce_mean(self.eval_mgpu_clf_losses)
    summary_op = tf.get_collection(tf.GraphKeys.SUMMARIES)

    def trva_split(data, index):
        # select the rows of `data` at the given indices
        return [data[i] for i in index]

    x1, x2, y = encode_dataset(self.text_encoder, atec(data_dir))
    valid_index = np.load('data/valid_index.npy')
    if data_dir == 'data/para.tsv':
        # the para set is four stacked copies; replicate the split per copy
        valid_index = np.concatenate([
            valid_index, valid_index + len(y) // 4,
            valid_index + len(y) // 2, valid_index + 3 * len(y) // 4
        ])
    valid_index = valid_index.tolist()
    # training indices = complement of the validation indices
    train_index = list(set(valid_index) ^ set(range(len(y))))
    trX1, trX2, trY = trva_split(x1, train_index), trva_split(
        x2, train_index), trva_split(y, train_index)
    vaX1, vaX2, vaY = trva_split(x1, valid_index), trva_split(
        x2, valid_index), trva_split(y, valid_index)
    trX, trM = self.transform_roc(trX1, trX2)
    vaX, vaM = self.transform_roc(vaX1, vaX2)
    n_train = len(trY)
    n_valid = len(vaY)
    self.n_updates_total = (n_train // self.n_batch_train) * n_iter
    self.build_graph()
    if pre_load:
        # warm-start from the pretrained parameter shards on disk.
        # FIX: close the shapes file instead of leaking the handle.
        with open('model/params_shapes.json') as f:
            shapes = json.load(f)
        offsets = np.cumsum([np.prod(shape) for shape in shapes])
        init_params = [
            np.load('model/params_{}.npy'.format(n)) for n in range(10)
        ]
        init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
        init_params = [
            param.reshape(shape)
            for param, shape in zip(init_params, shapes)
        ]
        # truncate position embeddings to n_ctx, then rebuild the embedding
        # matrix as [token embeddings, fresh special embeddings, positions]
        init_params[0] = init_params[0][:n_ctx]
        init_params[0] = np.concatenate([
            init_params[1],
            (np.random.randn(self.n_special, n_embd) * 0.02).astype(
                np.float32), init_params[0]
        ], 0)
        del init_params[1]
        if self.n_transfer == -1:
            self.n_transfer = 0
        else:
            # 1 embedding matrix + 12 tensors per transformer block
            self.n_transfer = 1 + self.n_transfer * 12
        self.sess.run([
            p.assign(ip) for p, ip in zip(self.params[:self.n_transfer],
                                          init_params[:self.n_transfer])
        ])
    if not new_model:
        print('loading old model')
        self.load()
        print('load success')
    n_updates = 0
    n_epochs = 0
    self.save(os.path.join(save_dir, desc, 'best_params.jl'))
    self.best_score = 0

    def log():
        """Evaluate on train subsample + validation set; save on best F1."""

        def iter_apply(Xs, Ms, Ys):
            # returns [concatenated logits, summed cost] over all batches
            fns = [
                lambda x: np.concatenate(x, 0), lambda x: float(np.sum(x))
            ]
            results = []
            for xmb, mmb, ymb in iter_data((Xs, Ms, Ys),
                                           n_batch=self.n_batch_train,
                                           truncate=False,
                                           verbose=True):
                n = len(xmb)
                # full batches use the multi-GPU eval graph; the ragged
                # final batch goes through the single-GPU one.
                if n == self.n_batch_train:
                    # FIX: `sess` was undefined here -- the session lives
                    # on self.sess (see the pre-load block above).
                    res = self.sess.run(
                        [self.eval_mgpu_logits, self.eval_mgpu_clf_loss],
                        {
                            X_train: xmb,
                            M_train: mmb,
                            Y_train: ymb
                        })
                else:
                    res = self.sess.run(
                        [self.eval_logits, self.eval_clf_loss], {
                            X: xmb,
                            M: mmb,
                            Y: ymb
                        })
                # weight by batch size so the later division gives a mean
                res = [r * n for r in res]
                results.append(res)
            results = zip(*results)
            return [fn(res) for res, fn in zip(results, fns)]

        tr_logits, tr_cost = iter_apply(trX[:n_valid], trM[:n_valid],
                                        trY[:n_valid])
        va_logits, va_cost = iter_apply(vaX, vaM, vaY)
        tr_cost = tr_cost / len(trY[:n_valid])
        va_cost = va_cost / n_valid
        tr_f1 = f1_score(trY[:n_valid], np.argmax(tr_logits, 1)) * 100.
        va_f1 = f1_score(vaY, np.argmax(va_logits, 1)) * 100.
        self.logger.log(n_epochs=n_epochs,
                        n_updates=n_updates,
                        tr_cost=tr_cost,
                        va_cost=va_cost,
                        tr_f1=tr_f1,
                        va_f1=va_f1)
        print('%d %d %.3f %.3f %.2f %.2f' %
              (n_epochs, n_updates, tr_cost, va_cost, tr_f1, va_f1))
        score = va_f1
        if score > self.best_score:
            self.best_score = score
            self.save(os.path.join(save_dir, desc, 'best_params.jl'))

    for i in range(n_iter):
        for xmb, mmb, ymb in iter_data(
            (shuffle(trX, trM, trY, random_state=np.random)),
                n_batch=self.n_batch_train,
                truncate=True,
                verbose=True):
            # FIX: the original fetched self.clf_loss / self.train and fed
            # self.X_train etc., none of which exist on the instance -- the
            # ops and placeholders are locals of this method.
            cost, _ = self.sess.run([clf_loss, train_op], {
                X_train: xmb,
                M_train: mmb,
                Y_train: ymb
            })
            n_updates += 1
            if n_updates % 1000 == 0:
                log()
        n_epochs += 1
        log()
def train(self):
    """Classification fine-tuning driven by a MonitoredTrainingSession.

    Builds the multi-GPU training graph with gradient clipping and
    averaged gradients plus single/multi-GPU eval heads, loads and splits
    the ATEC data, optionally warm-starts from a pretrained LM parameter
    dump, then runs the epoch loop with summary/logging hooks and
    periodic validation.

    NOTE(review): depends on module-level config (n_ctx, n_gpu, lr, b1,
    b2, e, lm_coef, max_grad_norm, data_dir, lm_dir, save_dir, n_iter,
    pre_load) -- confirm these are defined at module scope.
    """
    global_step = tf.train.get_or_create_global_step()
    # fixed-batch placeholders feed the multi-GPU graph; X/M/Y accept
    # arbitrary batch sizes for single-GPU evaluation
    X_train = tf.placeholder(tf.int32, [self.n_batch_train, 2, n_ctx, 2])
    M_train = tf.placeholder(tf.float32, [self.n_batch_train, 2, n_ctx])
    X = tf.placeholder(tf.int32, [None, 2, n_ctx, 2])
    M = tf.placeholder(tf.float32, [None, 2, n_ctx])
    Y_train = tf.placeholder(tf.int32, [self.n_batch_train])
    Y = tf.placeholder(tf.int32, [None])
    #self.train, self.logits, self.clf_losses, self.lm_losses = self.mgpu_train(self.X_train, self.M_train, self.Y_train)
    xs = [X_train, M_train, Y_train]
    gpu_ops = []
    gpu_grads = []
    xs = (tf.split(x, n_gpu, 0) for x in xs)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr,
                                       beta1=b1,
                                       beta2=b2,
                                       epsilon=e)
    for i, xs in enumerate(zip(*xs)):
        # first tower creates the variables, later towers reuse them
        do_reuse = True if i > 0 else None
        with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(
                tf.get_variable_scope(), reuse=do_reuse):
            logits, clf_losses, lm_losses = self.model(*xs,
                                                       train=True,
                                                       reuse=do_reuse)
            if lm_coef > 0:
                # joint loss: classification plus weighted LM auxiliary
                train_loss = tf.reduce_mean(
                    clf_losses) + lm_coef * tf.reduce_mean(lm_losses)
            else:
                train_loss = tf.reduce_mean(clf_losses)
            raw_grads_and_vars = optimizer.compute_gradients(train_loss)
            # clip each gradient by global norm before averaging
            grads_and_vars = [(tf.clip_by_global_norm(
                [gv[0]], max_grad_norm)[0][0], gv[1])
                              for gv in raw_grads_and_vars]
            gpu_grads.append(grads_and_vars)
            gpu_ops.append([logits, clf_losses, lm_losses])
    # concatenate per-gpu outputs back along the batch axis
    ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
    logits, clf_losses, lm_losses = ops
    grads = average_grads(gpu_grads)
    train_op = optimizer.apply_gradients(grads, global_step=global_step)
    clf_loss = tf.reduce_mean(clf_losses)
    saver = tf.train.Saver(max_to_keep=5)
    self.params = find_trainable_variables('model_lm')
    if pre_load:
        # warm-start assign ops from the pretrained LM parameter dump;
        # run inside the monitored session below
        restore_op = [
            p.assign(ip) for p, ip in zip(
                self.params, joblib.load(lm_dir + '/model_lm.params'))
        ]
    self.eval_mgpu_logits, self.eval_mgpu_clf_losses, self.eval_mgpu_lm_losses = self.mgpu_predict(
        X_train, M_train, Y_train)
    self.eval_logits, self.eval_clf_losses, self.eval_lm_losses = self.model(
        X, M, Y, train=False, reuse=True)
    self.eval_clf_loss = tf.reduce_mean(self.eval_clf_losses)
    self.eval_mgpu_clf_loss = tf.reduce_mean(self.eval_mgpu_clf_losses)
    summary_op = tf.get_collection(tf.GraphKeys.SUMMARIES)

    def trva_split(data, index):
        # select the rows of `data` at the given indices
        return [data[i] for i in index]

    x1, x2, y = encode_dataset(self.text_encoder, atec(data_dir))
    valid_index = np.load('data/valid_index.npy')
    if data_dir == 'data/para.tsv':
        # the para set is four stacked copies; replicate the split per copy
        valid_index = np.concatenate([
            valid_index, valid_index + len(y) // 4,
            valid_index + len(y) // 2, valid_index + 3 * len(y) // 4
        ])
    valid_index = valid_index.tolist()
    # training indices = complement of the validation indices
    train_index = list(set(valid_index) ^ set(range(len(y))))
    trX1, trX2, trY = trva_split(x1, train_index), trva_split(
        x2, train_index), trva_split(y, train_index)
    vaX1, vaX2, vaY = trva_split(x1, valid_index), trva_split(
        x2, valid_index), trva_split(y, valid_index)
    trX, trM = self.transform_roc(trX1, trX2)
    vaX, vaM = self.transform_roc(vaX1, vaX2)
    n_train = len(trY)
    n_valid = len(vaY)
    self.n_updates_total = (n_train // self.n_batch_train) * n_iter

    def log():
        # Evaluate on a train subsample and the validation set.  Closes
        # over `sess`, `n_updates` and `n_epochs`, which are assigned
        # later in this method but before log() is first called.

        def iter_apply(Xs, Ms, Ys):
            # returns [concatenated logits, summed cost] over all batches
            fns = [
                lambda x: np.concatenate(x, 0), lambda x: float(np.sum(x))
            ]
            results = []
            for xmb, mmb, ymb in iter_data((Xs, Ms, Ys),
                                           n_batch=self.n_batch_train,
                                           truncate=False,
                                           verbose=True):
                n = len(xmb)
                # full batches use the multi-GPU eval graph; the ragged
                # final batch goes through the single-GPU one
                if n == self.n_batch_train:
                    res = sess.run(
                        [self.eval_mgpu_logits, self.eval_mgpu_clf_loss],
                        {
                            X_train: xmb,
                            M_train: mmb,
                            Y_train: ymb
                        })
                else:
                    res = sess.run(
                        [self.eval_logits, self.eval_clf_loss], {
                            X: xmb,
                            M: mmb,
                            Y: ymb
                        })
                # weight by batch size so the later division gives a mean
                res = [r * n for r in res]
                results.append(res)
            results = zip(*results)
            return [fn(res) for res, fn in zip(results, fns)]

        # global best_score
        tr_logits, tr_cost = iter_apply(trX[:n_valid], trM[:n_valid],
                                        trY[:n_valid])
        va_logits, va_cost = iter_apply(vaX, vaM, vaY)
        tr_cost = tr_cost / len(trY[:n_valid])
        va_cost = va_cost / n_valid
        tr_f1 = f1_score(trY[:n_valid], np.argmax(tr_logits, 1)) * 100.
        va_f1 = f1_score(vaY, np.argmax(va_logits, 1)) * 100.
        tf.logging.info(
            '%d %d %.3f %.3f %.2f %.2f' %
            (n_epochs, n_updates, tr_cost, va_cost, tr_f1, va_f1))

    scaffold = tf.train.Scaffold(saver=saver)
    log_hook = tf.train.LoggingTensorHook(
        {
            'step': global_step,
            'train_loss': clf_loss
        }, every_n_iter=100)
    summary_hook = tf.train.SummarySaverHook(save_steps=100,
                                             output_dir=save_dir,
                                             summary_op=summary_op)
    hooks = [summary_hook, log_hook]
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    n_epochs = 0
    with tf.train.MonitoredTrainingSession(hooks=hooks,
                                           save_checkpoint_secs=600,
                                           checkpoint_dir=save_dir,
                                           scaffold=scaffold,
                                           config=tf_config) as sess:
        if pre_load:
            sess.run(restore_op)
        for i in range(n_iter):
            for xmb, mmb, ymb in iter_data(
                (shuffle(trX, trM, trY, random_state=np.random)),
                    n_batch=self.n_batch_train,
                    truncate=True,
                    verbose=True):
                # global_step fetched as n_updates drives the log cadence
                cost, _, n_updates = sess.run(
                    [clf_loss, train_op, global_step], {
                        X_train: xmb,
                        M_train: mmb,
                        Y_train: ymb
                    })
                if n_updates % 100 == 0:
                    log()
            n_epochs += 1
            log()