def evaluate():
  """Evaluate embeddings."""
  CONFIG.LOGDIR = FLAGS.logdir
  logdir = CONFIG.LOGDIR
  setup_eval_dir(logdir)

  algo = get_algo(CONFIG.TRAINING_ALGO)

  if FLAGS.defun:
    algo.call = tf.function(algo.call)
    algo.compute_loss = tf.function(algo.compute_loss)

  iterator_tasks, embedding_tasks = get_tasks(CONFIG.EVAL.TASKS)

  # Setup summary writer.
  summary_writer = tf.summary.create_file_writer(
      os.path.join(logdir, 'eval_logs'), flush_millis=10000)

  iterators = {}
  if iterator_tasks:
    # Setup dataset iterators from train and val datasets.
    iterators['train_iterator'] = create_dataset('train', mode='eval')
    iterators['val_iterator'] = create_dataset('val', mode='eval')

  if FLAGS.continuous_eval:
    for _ in tf.train.checkpoints_iterator(
        logdir, timeout=1, timeout_fn=timeout_fn):
      evaluate_once(algo, iterator_tasks, embedding_tasks, iterators,
                    summary_writer)
  else:
    evaluate_once(algo, iterator_tasks, embedding_tasks, iterators,
                  summary_writer)
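
# A minimal sketch of the timeout_fn callback referenced above; the real
# helper in this codebase may differ. tf.train.checkpoints_iterator calls it
# whenever no new checkpoint appears within `timeout` seconds and stops
# iterating once it returns True, ending continuous eval. The termination
# condition below (reading the step suffix of the latest checkpoint and
# comparing it against MAX_ITERS) is an assumption, not the author's code.
def timeout_fn():
  """Returns True when continuous eval should stop waiting for checkpoints."""
  ckpt_path = tf.train.latest_checkpoint(CONFIG.LOGDIR)
  if ckpt_path is None:
    return False
  # Assumes checkpoints are named like 'ckpt-<step>' (the
  # tf.train.CheckpointManager default naming scheme).
  last_step = int(ckpt_path.split('-')[-1])
  return last_step >= CONFIG.TRAIN.MAX_ITERS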
def train():
  """Trains model and evaluates on relevant downstream tasks."""
  CONFIG.LOGDIR = FLAGS.logdir
  logdir = CONFIG.LOGDIR
  setup_train_dir(logdir)

  # Common code for multi-GPU and single GPU. Set devices here if you don't
  # want to use all the GPUs on the machine. Default is to use all GPUs.
  strategy = tf.distribute.MirroredStrategy()
  with strategy.scope():
    algo = get_algo(CONFIG.TRAINING_ALGO)

    # Setup summary writer.
    summary_writer = tf.summary.create_file_writer(
        os.path.join(logdir, 'train_logs'), flush_millis=10000)

    learning_rate, optimizer, global_step = get_lr_opt_global_step()
    ckpt_manager, _, _ = restore_ckpt(
        logdir=logdir, optimizer=optimizer, **algo.model)

    global_step_value = global_step.numpy()

    # Remember that in Eager mode the learning rate variable needs to be
    # updated manually. Call lr_fn each iteration to get the current rate.
    lr_fn = get_lr_fn(CONFIG.OPTIMIZER)

    # Setup dataset iterator from the train dataset.
    batch_size_per_replica = CONFIG.TRAIN.BATCH_SIZE
    total_batch_size = batch_size_per_replica * strategy.num_replicas_in_sync
    train_ds = create_dataset('train', mode='train',
                              batch_size=total_batch_size,
                              return_iterator=False)
    train_iterator = strategy.make_dataset_iterator(train_ds)

    def train_step(data):
      steps = data['chosen_steps']
      seq_lens = data['seq_lens']
      loss = algo.train_one_iter(data, steps, seq_lens, global_step,
                                 optimizer)
      return loss

    # This reduction only affects reporting, not the gradients.
    # pylint: disable=g-long-lambda
    dist_train = lambda it: strategy.reduce(
        tf.distribute.ReduceOp.SUM,
        strategy.experimental_run(train_step, it),
        axis=None)
    # pylint: enable=g-long-lambda

    if FLAGS.defun:
      dist_train = tf.function(dist_train)

    stopwatch = Stopwatch()

    try:
      while global_step_value < CONFIG.TRAIN.MAX_ITERS:
        with summary_writer.as_default():
          with tf.summary.record_if(
              global_step_value % CONFIG.LOGGING.REPORT_INTERVAL == 0):
            loss = dist_train(train_iterator)

            # Update learning rate based on lr_fn.
            learning_rate.assign(lr_fn(learning_rate, global_step))

            tf.summary.scalar('loss', loss, step=global_step)
            tf.summary.scalar('learning_rate', learning_rate,
                              step=global_step)

            # Save checkpoint.
            if global_step_value % CONFIG.CHECKPOINT.SAVE_INTERVAL == 0:
              ckpt_manager.save()
              logging.info('Checkpoint saved at iter %d.', global_step_value)

            # Update global step.
            global_step_value = global_step.numpy()

            time_per_iter = stopwatch.elapsed()
            tf.summary.scalar('timing/time_per_iter', time_per_iter,
                              step=global_step)

            logging.info('Iter[{}/{}], {:.1f}s/iter, Loss: {:.3f}'.format(
                global_step_value, CONFIG.TRAIN.MAX_ITERS, time_per_iter,
                loss.numpy()))

            # Reset stopwatch after iter is complete.
            stopwatch.reset()

    except KeyboardInterrupt:
      logging.info('Caught keyboard interrupt. Saving model before quitting.')

    finally:
      # Save the final checkpoint.
      ckpt_manager.save()
      logging.info('Checkpoint saved at iter %d', global_step_value)
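
# A minimal sketch of the Stopwatch helper used in train(); the actual
# utility in this codebase may differ. elapsed() reports wall-clock seconds
# since construction or the last reset(), which train() logs under the
# 'timing/time_per_iter' summary tag.
import time


class Stopwatch:
  """Tracks wall-clock time elapsed since the last reset."""

  def __init__(self):
    self.reset()

  def elapsed(self):
    # Seconds since the last reset (or since construction).
    return time.time() - self._start

  def reset(self):
    self._start = time.time()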