def main(argv):
  """Trains, evaluates and tests a classifier for `FLAGS.task_name`.

  Hyper-parameters are read from `FLAGS.config`. The datasets come from
  `task_registry.TASK_DATA_DICT[FLAGS.task_name]`, the model and optimizer are
  built with `get_model` / `create_optimizer` (optionally restored from
  `FLAGS.model_dir`), training is delegated to `train_loop` and the final
  evaluation to `test`.

  Args:
    argv: Positional command-line arguments; anything beyond the first entry
      is rejected.

  Raises:
    app.UsageError: If more than one positional argument is supplied.
    ValueError: If `config.batch_size` is not divisible by
      `jax.device_count()`, or if a non-convolutional model receives inputs
      whose last dimension is not 1.
  """
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  tf.enable_v2_behavior()

  config = FLAGS.config
  logging.info('===========Config Dict============')
  logging.info(config)

  batch_size = config.batch_size
  learning_rate = config.learning_rate
  num_train_steps = config.num_train_steps
  num_eval_steps = config.num_eval_steps
  eval_freq = config.eval_frequency
  random_seed = config.random_seed
  model_type = config.model_type

  # Only host 0 gets a summary writer; the other hosts carry None.
  if jax.host_id() == 0:
    summary_writer = tensorboard.SummaryWriter(
        os.path.join(FLAGS.model_dir, 'summary'))
  else:
    summary_writer = None

  if batch_size % jax.device_count() > 0:
    raise ValueError(
        'Batch size must be divisible by the number of devices')

  logging.info('Training on %s', FLAGS.task_name)

  # Convolutional models request `normalize=True` data and keep the input
  # layout; every other model type is handled as a transformer on flattened
  # inputs. Computed once so the two decisions cannot drift apart.
  is_conv_model = model_type in ('wideresnet', 'resnet', 'simple_cnn')
  normalize = is_conv_model
  flatten_input = not is_conv_model

  (train_ds, eval_ds, test_ds, num_classes, vocab_size,
   input_shape) = task_registry.TASK_DATA_DICT[FLAGS.task_name](
       n_devices=jax.local_device_count(),
       batch_size=batch_size,
       normalize=normalize)
  train_iter = iter(train_ds)

  model_kwargs = {}
  if is_conv_model:
    model_kwargs.update({
        'num_classes': num_classes,
    })
  else:  # transformer models
    # The input is flattened from (bs, h, w, c) to (bs, h * w * c).
    bs, h, w, c = input_shape
    # Explicit check instead of `assert`, which is removed under `python -O`.
    if c != 1:
      raise ValueError(
          'Expected the last input dimension to be 1, got %d' % c)
    input_shape = (bs, h * w * c)
    model_kwargs.update({
        'vocab_size': vocab_size,
        'max_len': input_shape[1],
        'classifier': True,
        'num_classes': num_classes,
    })

  # Entries of `config.model` override the defaults assembled above.
  model_kwargs.update(config.model)

  rng = random.PRNGKey(random_seed)
  # Fold in the host id so each host derives a different key from the seed.
  rng = jax.random.fold_in(rng, jax.host_id())
  rng, init_rng = random.split(rng)
  # We init the first set of dropout PRNG keys, but update it afterwards inside
  # the main pmap'd training update for performance.
  dropout_rngs = random.split(rng, jax.local_device_count())

  model, state = get_model(init_rng, input_shape, model_type, model_kwargs)

  optimizer = create_optimizer(model, learning_rate, config.weight_decay)
  del model  # Don't keep a copy of the initial model.

  start_step = 0
  if config.restore_checkpoints:
    # Restore unreplicated optimizer + model state from last checkpoint.
    optimizer, state = checkpoints.restore_checkpoint(
        FLAGS.model_dir, (optimizer, state))
    # Grab last step.
    start_step = int(optimizer.state.step)

  # Replicate optimizer and state
  optimizer = jax_utils.replicate(optimizer)
  state = jax_utils.replicate(state)

  learning_rate_fn = train_utils.create_learning_rate_scheduler(
      factors=config.factors,
      base_learning_rate=learning_rate,
      warmup_steps=config.warmup,
      steps_per_cycle=config.get('steps_per_cycle', None),
  )
  p_train_step = jax.pmap(
      functools.partial(
          train_step,
          learning_rate_fn=learning_rate_fn,
          num_classes=num_classes,
          grad_clip_norm=config.get('grad_clip_norm', None),
          flatten_input=flatten_input),
      axis_name='batch')

  p_eval_step = jax.pmap(
      functools.partial(
          eval_step, num_classes=num_classes, flatten_input=flatten_input),
      axis_name='batch',
  )

  optimizer, state, step = train_loop(config, dropout_rngs, eval_ds, eval_freq,
                                      num_eval_steps, num_train_steps,
                                      optimizer, state, p_eval_step,
                                      p_train_step, start_step, train_iter,
                                      summary_writer)

  logging.info('Starting testing')
  logging.info('====================')
  test(optimizer, state, p_eval_step, step, test_ds, summary_writer,
       FLAGS.model_dir)
def main(argv):
  """Trains a transformer text classifier with periodic evaluation.

  Hyper-parameters are read from `FLAGS.config`. Datasets and the encoder come
  from `input_pipeline.get_tc_datasets`; the model is a
  `transformer.TransformerEncoder` built through `create_model`. Every
  `config.eval_frequency` steps (except step 0) the accumulated training
  metrics are averaged and logged, and an evaluation pass over `eval_ds` is
  run. Host 0 additionally writes the summaries and, when
  `config.save_checkpoints` is set, the checkpoints.

  Args:
    argv: Positional command-line arguments; anything beyond the first entry
      is rejected.

  Raises:
    app.UsageError: If more than one positional argument is supplied.
    ValueError: If `config.batch_size` is not divisible by
      `jax.device_count()`, or if `config.model_type` is not 'transformer'.
  """
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  tf.enable_v2_behavior()

  config = FLAGS.config
  logging.info('===========Config Dict============')
  logging.info(config)

  batch_size = config.batch_size
  learning_rate = config.learning_rate
  num_train_steps = config.num_train_steps
  num_eval_steps = config.num_eval_steps
  eval_freq = config.eval_frequency
  random_seed = config.random_seed
  model_type = config.model_type
  max_length = config.max_length

  # Only host 0 gets a summary writer. Binding None elsewhere keeps the name
  # defined on every host; all uses below are guarded by the same host check.
  if jax.host_id() == 0:
    summary_writer = tensorboard.SummaryWriter(
        os.path.join(FLAGS.model_dir, 'summary'))
  else:
    summary_writer = None

  if batch_size % jax.device_count() > 0:
    raise ValueError(
        'Batch size must be divisible by the number of devices')

  # Reject unsupported models before building the input pipeline.
  if model_type != 'transformer':
    raise ValueError('Model type not supported')

  # The third returned dataset (test split) is not used in this function.
  train_ds, eval_ds, _, encoder = input_pipeline.get_tc_datasets(
      n_devices=jax.local_device_count(),
      task_name=FLAGS.task_name,
      data_dir=FLAGS.data_dir,
      batch_size=batch_size,
      fixed_vocab=None,
      max_length=max_length)

  vocab_size = encoder.vocab_size
  logging.info('Vocab Size: %d', vocab_size)

  train_ds = train_ds.repeat()
  train_iter = iter(train_ds)

  input_shape = (batch_size, max_length)

  model_kwargs = {
      'vocab_size': vocab_size,
      'emb_dim': config.emb_dim,
      'num_heads': config.num_heads,
      'num_layers': config.num_layers,
      'qkv_dim': config.qkv_dim,
      'mlp_dim': config.mlp_dim,
      'max_len': max_length,
      'classifier': True,
      'num_classes': CLASS_MAP[FLAGS.task_name],
      'classifier_pool': config.classifier_pool,
  }

  rng = random.PRNGKey(random_seed)
  # Fold in the host id so each host derives a different key from the seed.
  rng = jax.random.fold_in(rng, jax.host_id())
  rng, init_rng = random.split(rng)
  # We init the first set of dropout PRNG keys, but update it afterwards inside
  # the main pmap'd training update for performance.
  dropout_rngs = random.split(rng, jax.local_device_count())

  model = create_model(init_rng, transformer.TransformerEncoder, input_shape,
                       model_kwargs)

  optimizer = create_optimizer(
      model, learning_rate, weight_decay=config.weight_decay)
  del model  # Don't keep a copy of the initial model.

  start_step = 0
  if config.restore_checkpoints:
    # Restore unreplicated optimizer + model state from last checkpoint.
    optimizer = checkpoints.restore_checkpoint(FLAGS.model_dir, optimizer)
    # Grab last step.
    start_step = int(optimizer.state.step)

  # Replicate optimizer.
  optimizer = jax_utils.replicate(optimizer)

  learning_rate_fn = train_utils.create_learning_rate_scheduler(
      factors=config.factors,
      base_learning_rate=learning_rate,
      warmup_steps=config.warmup)
  p_train_step = jax.pmap(
      functools.partial(train_step, learning_rate_fn=learning_rate_fn),
      axis_name='batch')
  p_eval_step = jax.pmap(eval_step, axis_name='batch')
  # p_pred_step = jax.pmap(predict_step, axis_name='batch')

  metrics_all = []
  tick = time.time()
  logging.info('Starting training')
  logging.info('====================')

  for step, batch in zip(range(start_step, num_train_steps), train_iter):
    batch = common_utils.shard(jax.tree_map(lambda x: x._numpy(), batch))  # pylint: disable=protected-access
    optimizer, metrics, dropout_rngs = p_train_step(
        optimizer, batch, dropout_rng=dropout_rngs)
    metrics_all.append(metrics)
    logging.info('train in step: %d', step)

    # Save a Checkpoint
    if ((step % config.checkpoint_freq == 0 and step > 0) or
        step == num_train_steps - 1):
      if jax.host_id() == 0 and config.save_checkpoints:
        # Save unreplicated optimizer + model state.
        checkpoints.save_checkpoint(FLAGS.model_dir,
                                    jax_utils.unreplicate(optimizer), step)

    # Periodic metric handling.
    if step % eval_freq == 0 and step > 0:
      metrics_all = common_utils.get_metrics(metrics_all)
      # The learning rate is averaged on its own; every other metric is summed
      # and divided by the summed 'denominator' entry.
      lr = metrics_all.pop('learning_rate').mean()
      metrics_sums = jax.tree_map(jnp.sum, metrics_all)
      denominator = metrics_sums.pop('denominator')
      summary = jax.tree_map(lambda x: x / denominator, metrics_sums)  # pylint: disable=cell-var-from-loop
      summary['learning_rate'] = lr
      # Calculate (clipped) perplexity after averaging log-perplexities:
      summary['perplexity'] = jnp.clip(jnp.exp(summary['loss']), a_max=1.0e4)
      logging.info('train in step: %d, loss: %.4f, acc: %.4f', step,
                   summary['loss'], summary['accuracy'])
      if jax.host_id() == 0:
        tock = time.time()
        steps_per_sec = eval_freq / (tock - tick)
        tick = tock
        summary_writer.scalar('steps per second', steps_per_sec, step)
        for key, val in summary.items():
          summary_writer.scalar(f'train_{key}', val, step)
        summary_writer.flush()
      # Reset metric accumulation for next evaluation cycle.
      metrics_all = []

      # Eval Metrics
      eval_metrics = []
      eval_iter = iter(eval_ds)
      # num_eval_steps == -1 means "no step limit": the zip below then stops
      # only when eval_iter is exhausted.
      if num_eval_steps == -1:
        num_iter = itertools.repeat(1)
      else:
        num_iter = range(num_eval_steps)
      for _, eval_batch in zip(num_iter, eval_iter):
        # pylint: disable=protected-access
        eval_batch = common_utils.shard(
            jax.tree_map(lambda x: x._numpy(), eval_batch))
        # pylint: enable=protected-access
        metrics = p_eval_step(optimizer.target, eval_batch)
        eval_metrics.append(metrics)
      eval_metrics = common_utils.get_metrics(eval_metrics)
      eval_metrics_sums = jax.tree_map(jnp.sum, eval_metrics)
      eval_denominator = eval_metrics_sums.pop('denominator')
      eval_summary = jax.tree_map(
          lambda x: x / eval_denominator,  # pylint: disable=cell-var-from-loop
          eval_metrics_sums)
      # Calculate (clipped) perplexity after averaging log-perplexities:
      eval_summary['perplexity'] = jnp.clip(
          jnp.exp(eval_summary['loss']), a_max=1.0e4)
      logging.info('eval in step: %d, loss: %.4f, acc: %.4f', step,
                   eval_summary['loss'], eval_summary['accuracy'])
      if jax.host_id() == 0:
        for key, val in eval_summary.items():
          summary_writer.scalar(f'eval_{key}', val, step)
        summary_writer.flush()