def train(loss, init_fn, variables_to_train):
    """Wraps slim.learning.train to run a training loop.

    Args:
      loss: a loss tensor.
      init_fn: a callable to be executed after all other initialization is done.
      variables_to_train: an optional list of variables to train. If None, it
        will default to all tf.trainable_variables().
    """
    hparams = common_flags.create_TrainingHparams(
        learning_rate=FLAGS.learning_rate,
        optimizer=FLAGS.optimizer,
        momentum=FLAGS.momentum)

    optimizer = utils.create_optimizer(hparams)

    lr = hparams.learning_rate
    tf.summary.scalar('learning_rate', lr)

    train_op = slim.learning.create_train_op(
        loss,
        optimizer,
        summarize_gradients=True,
        variables_to_train=variables_to_train,
        clip_gradient_norm=FLAGS.clip_gradient_norm)

    slim.learning.train(
        train_op,
        logdir=FLAGS.train_dir,
        graph=loss.graph,
        number_of_steps=FLAGS.max_number_of_steps,
        log_every_n_steps=FLAGS.log_every_n_steps,
        save_summaries_secs=FLAGS.save_summaries_secs,
        save_interval_secs=FLAGS.save_interval_secs,
        init_fn=init_fn)
def __init__(self, op, device):
    self.G = create_model(Generator, op)
    self.D = create_model(Discriminator, op)
    self.D_optimizer = create_optimizer(self.D, op)
    self.G_optimizer = create_optimizer(self.G, op)
    self.loss_fn = nn.BCELoss()

    # Create a batch of latent vectors that we will use to visualize
    # the progression of the generator.
    self.latent_size = op.model.nz
    self.fixed_noise = torch.randn(64, op.model.nz, 1, 1, device=device)

    self.op = op
    self.device = device

    # Default to no weight clipping so self.clip is always defined; enable it
    # only when the option is set.
    self.clip = False
    if op.model.get('wgan_weight_clip'):
        print('Using weight clipping on D:', op.model.wgan_weight_clip)
        self.clip = True
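The constructor above only records the flag; the clipping itself is not shown. A minimal sketch (not from the original source; the method name clip_discriminator_weights is hypothetical) of how the standard WGAN weight-clipping recipe would consume self.clip after each discriminator update:

def clip_discriminator_weights(self):
    # Clamp every discriminator weight into [-c, c] after the D step,
    # as in the original WGAN training procedure.
    if self.clip:
        c = self.op.model.wgan_weight_clip
        for p in self.D.parameters():
            p.data.clamp_(-c, c)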
def get_train_op(FLAGS, grads_and_vars):
    global_step = tf.train.get_or_create_global_step()
    optimizer = create_optimizer(FLAGS)
    # grads_and_vars = optimizer.compute_gradients(loss, colocate_gradients_with_ops=True)
    gradients, variables = zip(*grads_and_vars)

    if FLAGS.global_clip:
        clipped, gnorm = tf.clip_by_global_norm(gradients, FLAGS.clip)
    else:
        # Keep gradients unclipped; defining `clipped` here ensures the
        # layer-wise decay below is well-defined in both branches.
        clipped = list(gradients)

    if getattr(FLAGS, "lr_layer_decay_rate", 1.0) != 1.0:
        for i in range(len(clipped)):
            for l in range(FLAGS.num_layer):
                if "Attention-{}/".format(l + 1) in variables[i].name or \
                        "Attention-Normal-{}/".format(l + 1) in variables[i].name or \
                        "FeedForward-{}/".format(l + 1) in variables[i].name or \
                        "FeedForward-Normal-{}/".format(l + 1) in variables[i].name:
                    abs_rate = FLAGS.lr_layer_decay_rate ** (FLAGS.num_layer - 1 - l)
                    clipped[i] *= abs_rate
                    # tf.logging.info("Apply mult {:.4f} to layer-{} grad of {}".format(
                    #     abs_rate, l, variables[i].name))
                    break

    if FLAGS.cross_pipeline and hvd.size() > 1:
        # Average gradients across the pipeline devices.
        devices = cluster_utils.get_pipeline_devices(FLAGS.pipeline_device_num)
        gradients_list = [[] for _ in range(len(devices))]
        for grad, var in grads_and_vars:
            for i in range(len(devices)):
                if var.device == devices[i]:
                    gradients_list[i].append((grad, var))
                    break
        avg_grads_and_vars = []
        for i in range(len(devices)):
            with tf.device(devices[i]):
                for grad, var in gradients_list[i]:
                    if isinstance(grad, tf.IndexedSlices):
                        grad = tf.convert_to_tensor(grad)
                    avg_grad = hvd.allreduce(grad)
                    avg_grads_and_vars.append((avg_grad, var))
        grads_and_vars = avg_grads_and_vars

    # Note: the clipped/decayed gradients are applied only via the
    # commented-out variant below; the active call uses grads_and_vars.
    train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
    # train_op = optimizer.apply_gradients(
    #     zip(clipped, variables), global_step=global_step)
    return train_op
def get_train_op(FLAGS, loss):
    global_step = tf.train.get_or_create_global_step()
    optimizer = create_optimizer(FLAGS)

    grads_and_vars = optimizer.compute_gradients(loss)
    gradients, variables = zip(*grads_and_vars)
    clipped, gnorm = tf.clip_by_global_norm(gradients, FLAGS.clip)

    if getattr(FLAGS, "lr_layer_decay_rate", 1.0) != 1.0:
        # Scale the gradient of layer l by lr_layer_decay_rate ** (num_layer - 1 - l),
        # so lower layers receive a smaller effective learning rate.
        for i in range(len(clipped)):
            for l in range(FLAGS.num_layer):
                if "Attention-{}/".format(l + 1) in variables[i].name or \
                        "Attention-Normal-{}/".format(l + 1) in variables[i].name or \
                        "FeedForward-{}/".format(l + 1) in variables[i].name or \
                        "FeedForward-Normal-{}/".format(l + 1) in variables[i].name:
                    abs_rate = FLAGS.lr_layer_decay_rate ** (FLAGS.num_layer - 1 - l)
                    clipped[i] *= abs_rate
                    # tf.logging.info("Apply mult {:.4f} to layer-{} grad of {}".format(
                    #     abs_rate, l, variables[i].name))
                    break

    train_op = optimizer.apply_gradients(
        zip(clipped, variables), global_step=global_step)
    return train_op
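Both helpers above assume a create_optimizer(FLAGS) factory that is never shown. A minimal sketch of what such a factory could look like in the same TF1 style, assuming flags named FLAGS.optimizer, FLAGS.learning_rate, and FLAGS.momentum (all assumptions; the real helper may also build warmup or decay schedules):

def create_optimizer(FLAGS):
    # Hypothetical factory matching the call sites above.
    lr = FLAGS.learning_rate
    if FLAGS.optimizer == 'adam':
        return tf.train.AdamOptimizer(learning_rate=lr)
    elif FLAGS.optimizer == 'momentum':
        return tf.train.MomentumOptimizer(learning_rate=lr,
                                          momentum=FLAGS.momentum)
    return tf.train.GradientDescentOptimizer(learning_rate=lr)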
def main():
    # Parse command line arguments and generate the config dictionary.
    config = parse_args()
    logger.info(json.dumps(config, indent=2))

    run_config = config['run_config']
    optim_config = config['optim_config']

    # TensorBoard SummaryWriter
    if run_config['tensorboard']:
        writer = SummaryWriter()
    else:
        writer = None

    # Set random seed.
    seed = run_config['seed']
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # Create output directory.
    outdir = run_config['outdir']
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # Save config as a JSON file in the output directory.
    outpath = os.path.join(outdir, 'config.json')
    with open(outpath, 'w') as fout:
        json.dump(config, fout, indent=2)

    # Load data loaders.
    train_loader, test_loader = get_loader(config['data_config'])

    # Load model.
    logger.info('Loading model...')
    model = load_model(config['model_config'])
    n_params = sum([param.view(-1).size()[0] for param in model.parameters()])
    logger.info('n_params: {}'.format(n_params))
    if run_config['use_gpu']:
        model = nn.DataParallel(model)
        model.cuda()
    logger.info('Done')

    if config['data_config']['use_mixup']:
        train_criterion = CrossEntropyLoss(size_average=True)
    else:
        train_criterion = nn.CrossEntropyLoss(size_average=True)
    test_criterion = nn.CrossEntropyLoss(size_average=True)

    # Create optimizer.
    optim_config['steps_per_epoch'] = len(train_loader)
    optimizer, scheduler = create_optimizer(model.parameters(), optim_config)

    # Run a test pass before training starts.
    if run_config['test_first']:
        test(0, model, test_criterion, test_loader, run_config, writer)

    state = {
        'config': config,
        'state_dict': None,
        'optimizer': None,
        'epoch': 0,
        'accuracy': 0,
        'best_accuracy': 0,
        'best_epoch': 0,
    }
    for epoch in range(1, optim_config['epochs'] + 1):
        # Train.
        train(epoch, model, optimizer, scheduler, train_criterion,
              train_loader, config, writer)
        # Test.
        accuracy = test(epoch, model, test_criterion, test_loader, run_config,
                        writer)

        # Update the state dictionary and save a checkpoint.
        state = update_state(state, epoch, accuracy, model, optimizer)
        save_checkpoint(state, outdir)

    if run_config['tensorboard']:
        outpath = os.path.join(outdir, 'all_scalars.json')
        writer.export_scalars_to_json(outpath)
def main(argv):
    del argv  # unused arg
    tf.io.gfile.makedirs(FLAGS.output_dir)
    logging.info('Model checkpoint will be saved at %s', FLAGS.output_dir)
    tf.random.set_seed(FLAGS.seed)

    if FLAGS.use_gpu:
        logging.info('Use GPU')
        strategy = tf.distribute.MirroredStrategy()
    else:
        logging.info('Use TPU at %s',
                     FLAGS.tpu if FLAGS.tpu is not None else 'local')
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.TPUStrategy(resolver)

    batch_size = FLAGS.per_core_batch_size * FLAGS.num_cores
    test_batch_size = batch_size
    data_buffer_size = batch_size * 10

    train_dataset_builder = ds.WikipediaToxicityDataset(
        split='train',
        data_dir=FLAGS.in_dataset_dir,
        shuffle_buffer_size=data_buffer_size)
    ind_dataset_builder = ds.WikipediaToxicityDataset(
        split='test',
        data_dir=FLAGS.in_dataset_dir,
        shuffle_buffer_size=data_buffer_size)
    ood_dataset_builder = ds.CivilCommentsDataset(
        split='test',
        data_dir=FLAGS.ood_dataset_dir,
        shuffle_buffer_size=data_buffer_size)
    ood_identity_dataset_builder = ds.CivilCommentsIdentitiesDataset(
        split='test',
        data_dir=FLAGS.identity_dataset_dir,
        shuffle_buffer_size=data_buffer_size)

    train_dataset_builders = {
        'wikipedia_toxicity_subtypes': train_dataset_builder
    }
    test_dataset_builders = {
        'ind': ind_dataset_builder,
        'ood': ood_dataset_builder,
        'ood_identity': ood_identity_dataset_builder,
    }

    class_weight = utils.create_class_weight(train_dataset_builders,
                                             test_dataset_builders)
    logging.info('class_weight: %s', str(class_weight))

    ds_info = train_dataset_builder.tfds_info
    # Positive and negative classes.
    num_classes = ds_info.metadata['num_classes']

    train_datasets = {}
    dataset_steps_per_epoch = {}
    total_steps_per_epoch = 0
    for dataset_name, dataset_builder in train_dataset_builders.items():
        train_datasets[dataset_name] = dataset_builder.load(batch_size=batch_size)
        dataset_steps_per_epoch[dataset_name] = (
            dataset_builder.num_examples // batch_size)
        total_steps_per_epoch += dataset_steps_per_epoch[dataset_name]

    test_datasets = {}
    steps_per_eval = {}
    for dataset_name, dataset_builder in test_dataset_builders.items():
        test_datasets[dataset_name] = dataset_builder.load(
            batch_size=test_batch_size)
        steps_per_eval[dataset_name] = (
            dataset_builder.num_examples // test_batch_size)

    if FLAGS.use_bfloat16:
        policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
        tf.keras.mixed_precision.experimental.set_policy(policy)

    summary_writer = tf.summary.create_file_writer(
        os.path.join(FLAGS.output_dir, 'summaries'))

    with strategy.scope():
        logging.info('Building %s model', FLAGS.model_family)
        bert_config_dir, bert_ckpt_dir = utils.resolve_bert_ckpt_and_config_dir(
            FLAGS.bert_model_type, FLAGS.bert_dir, FLAGS.bert_config_dir,
            FLAGS.bert_ckpt_dir)
        bert_config = utils.create_config(bert_config_dir)
        model, bert_encoder = ub.models.DropoutBertBuilder(
            num_classes=num_classes,
            bert_config=bert_config,
            use_mc_dropout_mha=FLAGS.use_mc_dropout_mha,
            use_mc_dropout_att=FLAGS.use_mc_dropout_att,
            use_mc_dropout_ffn=FLAGS.use_mc_dropout_ffn,
            use_mc_dropout_output=FLAGS.use_mc_dropout_output,
            channel_wise_dropout_mha=FLAGS.channel_wise_dropout_mha,
            channel_wise_dropout_att=FLAGS.channel_wise_dropout_att,
            channel_wise_dropout_ffn=FLAGS.channel_wise_dropout_ffn)

        optimizer = utils.create_optimizer(
            FLAGS.base_learning_rate,
            steps_per_epoch=total_steps_per_epoch,
            epochs=FLAGS.train_epochs,
            warmup_proportion=FLAGS.warmup_proportion)

        logging.info('Model input shape: %s', model.input_shape)
        logging.info('Model output shape: %s', model.output_shape)
        logging.info('Model number of weights: %s', model.count_params())

        metrics = {
            'train/negative_log_likelihood': tf.keras.metrics.Mean(),
            'train/accuracy': tf.keras.metrics.Accuracy(),
            'train/accuracy_weighted': tf.keras.metrics.Accuracy(),
            'train/auroc': tf.keras.metrics.AUC(),
            'train/loss': tf.keras.metrics.Mean(),
            'train/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
            'train/precision': tf.keras.metrics.Precision(),
            'train/recall': tf.keras.metrics.Recall(),
            'train/f1': tfa_metrics.F1Score(
                num_classes=num_classes, average='micro',
                threshold=FLAGS.ece_label_threshold),
        }

        checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
        if FLAGS.prediction_mode:
            latest_checkpoint = tf.train.latest_checkpoint(
                FLAGS.eval_checkpoint_dir)
        else:
            latest_checkpoint = tf.train.latest_checkpoint(FLAGS.output_dir)
        initial_epoch = 0
        if latest_checkpoint:
            # checkpoint.restore must be within a strategy.scope() so that
            # optimizer slot variables are mirrored.
            checkpoint.restore(latest_checkpoint)
            logging.info('Loaded checkpoint %s', latest_checkpoint)
            initial_epoch = optimizer.iterations.numpy() // total_steps_per_epoch
        elif FLAGS.model_family.lower() == 'bert':
            # Load BERT from the initial checkpoint.
            bert_checkpoint = tf.train.Checkpoint(model=bert_encoder)
            bert_checkpoint.restore(bert_ckpt_dir).assert_existing_objects_matched()
            logging.info('Loaded BERT checkpoint %s', bert_ckpt_dir)

        metrics.update({
            'test/negative_log_likelihood': tf.keras.metrics.Mean(),
            'test/auroc': tf.keras.metrics.AUC(curve='ROC'),
            'test/aupr': tf.keras.metrics.AUC(curve='PR'),
            'test/brier': tf.keras.metrics.MeanSquaredError(),
            'test/brier_weighted': tf.keras.metrics.MeanSquaredError(),
            'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
            'test/acc': tf.keras.metrics.Accuracy(),
            'test/acc_weighted': tf.keras.metrics.Accuracy(),
            'test/eval_time': tf.keras.metrics.Mean(),
            'test/precision': tf.keras.metrics.Precision(),
            'test/recall': tf.keras.metrics.Recall(),
            'test/f1': tfa_metrics.F1Score(
                num_classes=num_classes, average='micro',
                threshold=FLAGS.ece_label_threshold),
        })
        for fraction in FLAGS.fractions:
            metrics.update({
                'test_collab_acc/collab_acc_{}'.format(fraction):
                    um.OracleCollaborativeAccuracy(
                        fraction=float(fraction), num_bins=FLAGS.num_bins)
            })
        for dataset_name, test_dataset in test_datasets.items():
            if dataset_name != 'ind':
                metrics.update({
                    'test/nll_{}'.format(dataset_name):
                        tf.keras.metrics.Mean(),
                    'test/auroc_{}'.format(dataset_name):
                        tf.keras.metrics.AUC(curve='ROC'),
                    'test/aupr_{}'.format(dataset_name):
                        tf.keras.metrics.AUC(curve='PR'),
                    'test/brier_{}'.format(dataset_name):
                        tf.keras.metrics.MeanSquaredError(),
                    'test/brier_weighted_{}'.format(dataset_name):
                        tf.keras.metrics.MeanSquaredError(),
                    'test/ece_{}'.format(dataset_name):
                        um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
                    'test/acc_{}'.format(dataset_name):
                        tf.keras.metrics.Accuracy(),
                    'test/acc_weighted_{}'.format(dataset_name):
                        tf.keras.metrics.Accuracy(),
                    'test/eval_time_{}'.format(dataset_name):
                        tf.keras.metrics.Mean(),
                    'test/precision_{}'.format(dataset_name):
                        tf.keras.metrics.Precision(),
                    'test/recall_{}'.format(dataset_name):
                        tf.keras.metrics.Recall(),
                    'test/f1_{}'.format(dataset_name):
                        tfa_metrics.F1Score(
                            num_classes=num_classes, average='micro',
                            threshold=FLAGS.ece_label_threshold),
                })
                for fraction in FLAGS.fractions:
                    metrics.update({
                        'test_collab_acc/collab_acc_{}_{}'.format(
                            fraction, dataset_name):
                            um.OracleCollaborativeAccuracy(
                                fraction=float(fraction),
                                num_bins=FLAGS.num_bins)
                    })

    @tf.function
    def generate_sample_weight(labels, class_weight, label_threshold=0.7):
        """Generate sample weight for weighted accuracy calculation."""
        if label_threshold != 0.7:
            logging.warning('The class weight was based on `label_threshold` = 0.7, '
                            'and weighted accuracy/brier will be meaningless if '
                            '`label_threshold` is not equal to this value, which is '
                            'recommended by Jigsaw Conversation AI team.')
        labels_int = tf.cast(labels > label_threshold, tf.int32)
        sample_weight = tf.gather(class_weight, labels_int)
        return sample_weight

    @tf.function
    def train_step(iterator, dataset_name):
        """Training StepFn."""

        def step_fn(inputs):
            """Per-Replica StepFn."""
            features, labels, _ = utils.create_feature_and_label(inputs)
            with tf.GradientTape() as tape:
                logits = model(features, training=True)
                if FLAGS.use_bfloat16:
                    logits = tf.cast(logits, tf.float32)
                loss_logits = tf.squeeze(logits, axis=1)
                if FLAGS.loss_type == 'cross_entropy':
                    logging.info('Using cross entropy loss')
                    negative_log_likelihood = tf.nn.sigmoid_cross_entropy_with_logits(
                        labels, loss_logits)
                elif FLAGS.loss_type == 'focal_cross_entropy':
                    logging.info('Using focal cross entropy loss')
                    negative_log_likelihood = tfa_losses.sigmoid_focal_crossentropy(
                        labels, loss_logits,
                        alpha=FLAGS.focal_loss_alpha,
                        gamma=FLAGS.focal_loss_gamma,
                        from_logits=True)
                elif FLAGS.loss_type == 'mse':
                    logging.info('Using mean squared error loss')
                    loss_probs = tf.nn.sigmoid(loss_logits)
                    negative_log_likelihood = tf.keras.losses.mean_squared_error(
                        labels, loss_probs)
                elif FLAGS.loss_type == 'mae':
                    logging.info('Using mean absolute error loss')
                    loss_probs = tf.nn.sigmoid(loss_logits)
                    negative_log_likelihood = tf.keras.losses.mean_absolute_error(
                        labels, loss_probs)
                negative_log_likelihood = tf.reduce_mean(negative_log_likelihood)
                l2_loss = sum(model.losses)
                loss = negative_log_likelihood + l2_loss
                # Scale the loss given the TPUStrategy will reduce sum all gradients.
                scaled_loss = loss / strategy.num_replicas_in_sync

            grads = tape.gradient(scaled_loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            probs = tf.nn.sigmoid(logits)
            # Cast labels to discrete for ECE computation.
            ece_labels = tf.cast(labels > FLAGS.ece_label_threshold, tf.float32)
            one_hot_labels = tf.one_hot(tf.cast(ece_labels, tf.int32),
                                        depth=num_classes)
            ece_probs = tf.concat([1. - probs, probs], axis=1)
            auc_probs = tf.squeeze(probs, axis=1)
            pred_labels = tf.math.argmax(ece_probs, axis=-1)

            sample_weight = generate_sample_weight(
                labels, class_weight['train/{}'.format(dataset_name)],
                FLAGS.ece_label_threshold)
            metrics['train/negative_log_likelihood'].update_state(
                negative_log_likelihood)
            metrics['train/accuracy'].update_state(labels, pred_labels)
            metrics['train/accuracy_weighted'].update_state(
                ece_labels, pred_labels, sample_weight=sample_weight)
            metrics['train/auroc'].update_state(labels, auc_probs)
            metrics['train/loss'].update_state(loss)
            metrics['train/ece'].update_state(ece_labels, ece_probs)
            metrics['train/precision'].update_state(ece_labels, pred_labels)
            metrics['train/recall'].update_state(ece_labels, pred_labels)
            metrics['train/f1'].update_state(one_hot_labels, ece_probs)

        strategy.run(step_fn, args=(next(iterator),))

    @tf.function
    def test_step(iterator, dataset_name):
        """Evaluation StepFn to log metrics."""

        def step_fn(inputs):
            """Per-Replica StepFn."""
            features, labels, _ = utils.create_feature_and_label(inputs)

            eval_start_time = time.time()
            logits = model(features, training=False)
            eval_time = (time.time() - eval_start_time) / FLAGS.per_core_batch_size

            if FLAGS.use_bfloat16:
                logits = tf.cast(logits, tf.float32)
            probs = tf.nn.sigmoid(logits)
            # Cast labels to discrete for ECE computation.
            ece_labels = tf.cast(labels > FLAGS.ece_label_threshold, tf.float32)
            one_hot_labels = tf.one_hot(tf.cast(ece_labels, tf.int32),
                                        depth=num_classes)
            ece_probs = tf.concat([1. - probs, probs], axis=1)
            pred_labels = tf.math.argmax(ece_probs, axis=-1)
            auc_probs = tf.squeeze(probs, axis=1)

            loss_logits = tf.squeeze(logits, axis=1)
            negative_log_likelihood = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(labels, loss_logits))

            sample_weight = generate_sample_weight(
                labels, class_weight['test/{}'.format(dataset_name)],
                FLAGS.ece_label_threshold)
            if dataset_name == 'ind':
                metrics['test/negative_log_likelihood'].update_state(
                    negative_log_likelihood)
                metrics['test/auroc'].update_state(labels, auc_probs)
                metrics['test/aupr'].update_state(labels, auc_probs)
                metrics['test/brier'].update_state(labels, auc_probs)
                metrics['test/brier_weighted'].update_state(
                    tf.expand_dims(labels, -1), probs,
                    sample_weight=sample_weight)
                metrics['test/ece'].update_state(ece_labels, ece_probs)
                metrics['test/acc'].update_state(ece_labels, pred_labels)
                metrics['test/acc_weighted'].update_state(
                    ece_labels, pred_labels, sample_weight=sample_weight)
                metrics['test/eval_time'].update_state(eval_time)
                metrics['test/precision'].update_state(ece_labels, pred_labels)
                metrics['test/recall'].update_state(ece_labels, pred_labels)
                metrics['test/f1'].update_state(one_hot_labels, ece_probs)
                for fraction in FLAGS.fractions:
                    metrics['test_collab_acc/collab_acc_{}'.format(
                        fraction)].update_state(ece_labels, ece_probs)
            else:
                metrics['test/nll_{}'.format(dataset_name)].update_state(
                    negative_log_likelihood)
                metrics['test/auroc_{}'.format(dataset_name)].update_state(
                    labels, auc_probs)
                metrics['test/aupr_{}'.format(dataset_name)].update_state(
                    labels, auc_probs)
                metrics['test/brier_{}'.format(dataset_name)].update_state(
                    labels, auc_probs)
                metrics['test/brier_weighted_{}'.format(dataset_name)].update_state(
                    tf.expand_dims(labels, -1), probs,
                    sample_weight=sample_weight)
                metrics['test/ece_{}'.format(dataset_name)].update_state(
                    ece_labels, ece_probs)
                metrics['test/acc_{}'.format(dataset_name)].update_state(
                    ece_labels, pred_labels)
                metrics['test/acc_weighted_{}'.format(dataset_name)].update_state(
                    ece_labels, pred_labels, sample_weight=sample_weight)
                metrics['test/eval_time_{}'.format(dataset_name)].update_state(
                    eval_time)
                metrics['test/precision_{}'.format(dataset_name)].update_state(
                    ece_labels, pred_labels)
                metrics['test/recall_{}'.format(dataset_name)].update_state(
                    ece_labels, pred_labels)
                metrics['test/f1_{}'.format(dataset_name)].update_state(
                    one_hot_labels, ece_probs)
                for fraction in FLAGS.fractions:
                    metrics['test_collab_acc/collab_acc_{}_{}'.format(
                        fraction, dataset_name)].update_state(
                            ece_labels, ece_probs)

        strategy.run(step_fn, args=(next(iterator),))

    @tf.function
    def final_eval_step(iterator):
        """Final Evaluation StepFn to save prediction to directory."""

        def step_fn(inputs):
            bert_features, labels, additional_labels = (
                utils.create_feature_and_label(inputs))
            logits = model(bert_features, training=False)
            features = inputs['input_ids']
            return features, logits, labels, additional_labels

        (per_replica_texts, per_replica_logits, per_replica_labels,
         per_replica_additional_labels) = (
             strategy.run(step_fn, args=(next(iterator),)))

        if strategy.num_replicas_in_sync > 1:
            texts_list = tf.concat(per_replica_texts.values, axis=0)
            logits_list = tf.concat(per_replica_logits.values, axis=0)
            labels_list = tf.concat(per_replica_labels.values, axis=0)
            additional_labels_dict = {}
            for additional_label in utils.IDENTITY_LABELS:
                if additional_label in per_replica_additional_labels:
                    additional_labels_dict[additional_label] = tf.concat(
                        per_replica_additional_labels[additional_label], axis=0)
        else:
            texts_list = per_replica_texts
            logits_list = per_replica_logits
            labels_list = per_replica_labels
            additional_labels_dict = {}
            for additional_label in utils.IDENTITY_LABELS:
                if additional_label in per_replica_additional_labels:
                    additional_labels_dict[additional_label] = (
                        per_replica_additional_labels[additional_label])

        return texts_list, logits_list, labels_list, additional_labels_dict

    if FLAGS.prediction_mode:
        # Prediction and exit.
        for dataset_name, test_dataset in test_datasets.items():
            test_iterator = iter(test_dataset)  # pytype: disable=wrong-arg-types
            message = 'Final eval on dataset {}'.format(dataset_name)
            logging.info(message)

            texts_all = []
            logits_all = []
            labels_all = []
            additional_labels_all_dict = {}
            if 'identity' in dataset_name:
                for identity_label_name in utils.IDENTITY_LABELS:
                    additional_labels_all_dict[identity_label_name] = []

            try:
                with tf.experimental.async_scope():
                    for step in range(steps_per_eval[dataset_name]):
                        if step % 20 == 0:
                            message = 'Starting to run eval step {}/{} of dataset: {}'.format(
                                step, steps_per_eval[dataset_name], dataset_name)
                            logging.info(message)
                        (text_step, logits_step, labels_step,
                         additional_labels_dict_step) = final_eval_step(test_iterator)

                        texts_all.append(text_step)
                        logits_all.append(logits_step)
                        labels_all.append(labels_step)
                        if 'identity' in dataset_name:
                            for identity_label_name in utils.IDENTITY_LABELS:
                                additional_labels_all_dict[identity_label_name].append(
                                    additional_labels_dict_step[identity_label_name])
            except (StopIteration, tf.errors.OutOfRangeError):
                tf.experimental.async_clear_error()
                logging.info('Done with eval on %s', dataset_name)

            texts_all = tf.concat(texts_all, axis=0)
            logits_all = tf.concat(logits_all, axis=0)
            labels_all = tf.concat(labels_all, axis=0)
            additional_labels_all = []
            if additional_labels_all_dict:
                for identity_label_name in utils.IDENTITY_LABELS:
                    additional_labels_all.append(
                        tf.concat(additional_labels_all_dict[identity_label_name],
                                  axis=0))
            additional_labels_all = tf.convert_to_tensor(additional_labels_all)

            utils.save_prediction(
                texts_all.numpy(),
                path=os.path.join(FLAGS.output_dir,
                                  'texts_{}'.format(dataset_name)))
            utils.save_prediction(
                labels_all.numpy(),
                path=os.path.join(FLAGS.output_dir,
                                  'labels_{}'.format(dataset_name)))
            utils.save_prediction(
                logits_all.numpy(),
                path=os.path.join(FLAGS.output_dir,
                                  'logits_{}'.format(dataset_name)))
            if 'identity' in dataset_name:
                utils.save_prediction(
                    additional_labels_all.numpy(),
                    path=os.path.join(FLAGS.output_dir,
                                      'additional_labels_{}'.format(dataset_name)))
            logging.info('Done with testing on %s', dataset_name)
    else:
        # Execute the train / eval loop.
        start_time = time.time()
        train_iterators = {}
        for dataset_name, train_dataset in train_datasets.items():
            train_iterators[dataset_name] = iter(train_dataset)

        for epoch in range(initial_epoch, FLAGS.train_epochs):
            logging.info('Starting to run epoch: %s', epoch)
            current_step = epoch * total_steps_per_epoch
            for dataset_name, train_iterator in train_iterators.items():
                for step in range(dataset_steps_per_epoch[dataset_name]):
                    train_step(train_iterator, dataset_name)

                    current_step += 1
                    max_steps = total_steps_per_epoch * FLAGS.train_epochs
                    time_elapsed = time.time() - start_time
                    steps_per_sec = float(current_step) / time_elapsed
                    eta_seconds = (max_steps - current_step) / steps_per_sec
                    message = (
                        '{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
                        'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                            current_step / max_steps, epoch + 1,
                            FLAGS.train_epochs, steps_per_sec,
                            eta_seconds / 60, time_elapsed / 60))
                    if step % 20 == 0:
                        logging.info(message)

            if epoch % FLAGS.evaluation_interval == 0:
                for dataset_name, test_dataset in test_datasets.items():
                    test_iterator = iter(test_dataset)  # pytype: disable=wrong-arg-types
                    logging.info('Testing on dataset %s', dataset_name)

                    try:
                        with tf.experimental.async_scope():
                            for step in range(steps_per_eval[dataset_name]):
                                if step % 20 == 0:
                                    logging.info(
                                        'Starting to run eval step %s/%s of epoch: %s',
                                        step, steps_per_eval[dataset_name], epoch)
                                test_step(test_iterator, dataset_name)
                    except (StopIteration, tf.errors.OutOfRangeError):
                        tf.experimental.async_clear_error()
                        logging.info('Done with testing on %s', dataset_name)

                logging.info('Train Loss: %.4f, AUROC: %.4f',
                             metrics['train/loss'].result(),
                             metrics['train/auroc'].result())
                logging.info('Test NLL: %.4f, AUROC: %.4f',
                             metrics['test/negative_log_likelihood'].result(),
                             metrics['test/auroc'].result())

                # Record results.
                total_results = {}
                for name, metric in metrics.items():
                    total_results[name] = metric.result()

                with summary_writer.as_default():
                    for name, result in total_results.items():
                        tf.summary.scalar(name, result, step=epoch + 1)

            for name, metric in metrics.items():
                metric.reset_states()

            checkpoint_interval = min(FLAGS.checkpoint_interval,
                                      FLAGS.train_epochs)
            if checkpoint_interval > 0 and (epoch + 1) % checkpoint_interval == 0:
                checkpoint_name = checkpoint.save(
                    os.path.join(FLAGS.output_dir, 'checkpoint'))
                logging.info('Saved checkpoint to %s', checkpoint_name)

        # Save model in SavedModel format on exit.
        final_save_name = os.path.join(FLAGS.output_dir, 'model')
        model.save(final_save_name)
        logging.info('Saved model to %s', final_save_name)
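In this script (and the SNGP variant below), utils.create_optimizer builds the optimizer from a base learning rate, the schedule length, and a warmup fraction; the helper itself is not shown. A minimal sketch of that shape, assuming a linear warmup-then-decay schedule and using plain Adam as a stand-in (the SNGP variant's comment mentions an AdamW with beta_2=0.999 and epsilon=1e-6, so the real helper likely differs in the weight-decay handling; the class name here is hypothetical):

import tensorflow as tf

class WarmUpThenLinearDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Linear warmup to base_lr, then linear decay toward zero."""

    def __init__(self, base_lr, total_steps, warmup_steps):
        self.base_lr = base_lr
        self.total_steps = total_steps
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        warmup = tf.cast(self.warmup_steps, tf.float32)
        total = tf.cast(self.total_steps, tf.float32)
        warmup_lr = self.base_lr * step / tf.maximum(warmup, 1.0)
        decay_lr = self.base_lr * (total - step) / tf.maximum(total - warmup, 1.0)
        return tf.where(step < warmup, warmup_lr, tf.maximum(decay_lr, 0.0))

def create_optimizer(base_learning_rate, steps_per_epoch, epochs,
                     warmup_proportion, beta_1=0.9):
    # Warmup length is a fixed fraction of the whole training run.
    total_steps = steps_per_epoch * epochs
    schedule = WarmUpThenLinearDecay(
        base_learning_rate, total_steps,
        warmup_steps=int(total_steps * warmup_proportion))
    return tf.keras.optimizers.Adam(learning_rate=schedule, beta_1=beta_1,
                                    beta_2=0.999, epsilon=1e-6)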
parser.add_argument("-o", "--output-shape", nargs="+", type=int, required=True) parser.add_argument("-s", "--hidden-sizes", nargs="+", type=int, default=[]) parser.add_argument("-a", "--activation", type=str, default="tanh") parser.add_argument("-A", "--output-activation", type=str, default="tanh") parser.add_argument("-r", "--rnn", type=str, default="") parser.add_argument("-f", "--filepath", type=str, required="true") parser.add_argument("-p", "--optimizer", type=str, default=None) parser.add_argument("-l", "--learning-rate", type=float, default=1e-4) parser.add_argument("-c", "--clipnorm", type=float, default=1.0) parser.add_argument("--loss", type=str, default="mse") args = parser.parse_args() optimizer = create_optimizer(args) rnn = None if args.rnn is None \ else tf.keras.layers.LSTM if args.rnn.lower() == "lstm" \ else tf.keras.layers.GRU if args.rnn.lower() == "gru" \ else tf.keras.layers.SimpleRNN if args.rnn.lower() == "simple" else None model = dense_stack(args.input_shape[0], args.output_shape[0], args.hidden_sizes, rnn, args.activation, args.output_activation) model.compile(loss=args.loss, optimizer=optimizer) print(model.summary()) model.save(args.filepath)
def main():
    # Parse command line arguments and generate the config dictionary.
    config = parse_args()
    logger.info(json.dumps(config, indent=2))

    run_config = config['run_config']
    optim_config = config['optim_config']

    # TensorBoard SummaryWriter
    if run_config['tensorboard']:
        writer = SummaryWriter(run_config['outdir'])
    else:
        writer = None

    # Set random seed.
    seed = run_config['seed']
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    epoch_seeds = np.random.randint(
        np.iinfo(np.int32).max // 2, size=optim_config['epochs'])

    # Create output directory.
    outdir = pathlib.Path(run_config['outdir'])
    outdir.mkdir(exist_ok=True, parents=True)

    # Save config as a JSON file in the output directory.
    outpath = outdir / 'config.json'
    with open(outpath, 'w') as fout:
        json.dump(config, fout, indent=2)

    # Load data loaders.
    train_loader, test_loader = get_loader(config['data_config'])

    # Load model.
    logger.info('Loading model...')
    model = utils.load_model(config['model_config'])
    n_params = sum([param.view(-1).size()[0] for param in model.parameters()])
    logger.info('n_params: {}'.format(n_params))

    if run_config['fp16']:
        model.half()
        # Keep BatchNorm layers in fp32 for numerical stability.
        for layer in model.modules():
            if isinstance(layer, nn.BatchNorm2d):
                layer.float()

    device = run_config['device']
    # `is not` compares identity, not string equality; use != here.
    if device != 'cpu' and torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)
    logger.info('Done')

    train_criterion, test_criterion = utils.get_criterion(
        config['data_config'])

    # Create optimizer.
    optim_config['steps_per_epoch'] = len(train_loader)
    optimizer, scheduler = utils.create_optimizer(model.parameters(),
                                                  optim_config)

    # Run a test pass before training starts.
    if run_config['test_first']:
        test(0, model, test_criterion, test_loader, run_config, writer)

    state = {
        'config': config,
        'state_dict': None,
        'optimizer': None,
        'epoch': 0,
        'accuracy': 0,
        'best_accuracy': 0,
        'best_epoch': 0,
    }
    epoch_logs = []
    for epoch, seed in zip(range(1, optim_config['epochs'] + 1), epoch_seeds):
        np.random.seed(seed)
        # Train.
        train_log = train(epoch, model, optimizer, scheduler, train_criterion,
                          train_loader, config, writer)
        # Test.
        test_log = test(epoch, model, test_criterion, test_loader, run_config,
                        writer)

        epoch_log = train_log.copy()
        epoch_log.update(test_log)
        epoch_logs.append(epoch_log)
        utils.save_epoch_logs(epoch_logs, outdir)

        # Update the state dictionary and save a checkpoint.
        state = update_state(state, epoch, epoch_log['test']['accuracy'],
                             model, optimizer)
        utils.save_checkpoint(state, outdir)
def main():
    # Parse command line arguments and generate the config dictionary.
    config = parse_args()
    logger.info(json.dumps(config, indent=2))

    run_config = config['run_config']
    optim_config = config['optim_config']

    # TensorBoard SummaryWriter
    if run_config['tensorboard']:
        writer = SummaryWriter()
    else:
        writer = None

    # Set random seed.
    seed = run_config['seed']
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # Create output directory.
    outdir = run_config['outdir']
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # Save config as a JSON file in the output directory.
    outpath = os.path.join(outdir, 'config.json')
    with open(outpath, 'w') as fout:
        json.dump(config, fout, indent=2)

    # Load data loaders.
    train_loader, test_loader = get_loader(config['data_config'])

    # Compute covariances over the full training set.
    full_train_loader, _ = get_loader(config['data_config'], return_full=True)
    for batch in full_train_loader:
        full_images, full_labels = batch
    num_classes = config['data_config']['n_classes']
    full_targets = onehot(full_labels, num_classes)

    xbar, ybar, xxcov, xycov, T = taylor.compute_moments(full_images,
                                                         full_targets)
    # torch.save(xbar, 'xbar.pt')
    # torch.save(ybar, 'ybar.pt')
    # torch.save(xxcov, 'xxcov.pt')
    # torch.save(xycov, 'xycov.pt')

    num_components = config['data_config']['cov_components']
    Uxx, Sxx, Vxx = taylor.decomposition(xxcov, num_components)
    Uxy, Sxy, Vxy = taylor.decomposition(xycov, 10)

    xdim = T.shape[1]
    # SVDs of the T[i, :, :] slices.
    T_U = torch.zeros((num_classes, xdim, num_components))
    T_S = torch.zeros((num_classes, num_components))
    T_V = torch.zeros((num_classes, xdim, num_components))
    for i in range(num_classes):
        T_U[i, :, :], T_S[i, :], T_V[i, :, :] = taylor.decomposition(
            T[i, :, :], num_components)

    # torch.save(Uxx, 'Uxx.pt')
    # torch.save(Sxx, 'Sxx.pt')
    # torch.save(Vxx, 'Vxx.pt')
    # torch.save(Uxy, 'Uxy.pt')
    # torch.save(Sxy, 'Sxy.pt')
    # torch.save(Vxy, 'Vxy.pt')

    if run_config['use_gpu']:
        moment_dict = {
            'Uxx': Uxx.cuda(),
            'Uxy': Uxy.cuda(),
            'Sxx': Sxx.cuda(),
            'Sxy': Sxy.cuda(),
            'Vxx': Vxx.cuda(),
            'Vxy': Vxy.cuda(),
            'xbar': xbar.reshape(full_images.shape[1:]).cuda(),
            'ybar': ybar.cuda(),
            'T_U': T_U.cuda(),
            'T_S': T_S.cuda(),
            'T_V': T_V.cuda()
        }
    else:
        moment_dict = {
            'Uxx': Uxx,
            'Uxy': Uxy,
            'Sxx': Sxx,
            'Sxy': Sxy,
            'Vxx': Vxx,
            'Vxy': Vxy,
            'xbar': xbar.reshape(full_images.shape[1:]),
            'ybar': ybar,
            'T_U': T_U,
            'T_S': T_S,
            'T_V': T_V
        }

    # Set up dataframe for recording results.
    dfcols = []
    dfcols.append('epoch')
    dfcols.append('train_loss')
    dfcols.append('train_acc')
    if config['data_config']['use_mixup']:
        dfcols.append('doublesum_train')
        dfcols.append('doublesum_eval')
        dfcols.append('doublesum_eval2')
    if config['data_config']['cov_components'] > 0:
        dfcols.append('taylor_base')
        dfcols.append('taylor_de')
        for k in [1, 2, 5, 20, 50, 200]:
            dfcols.append('taylor_d2_' + str(k))
        for k in [1, 2, 5, 20, 50, 200]:
            dfcols.append('taylor_d2e_' + str(k))
    dfcols.append('test_loss')
    dfcols.append('test_acc')
    resultsdf = pd.DataFrame(columns=dfcols)

    # Load model.
    logger.info('Loading model...')
    model = load_model(config['model_config'])
    n_params = sum([param.view(-1).size()[0] for param in model.parameters()])
    logger.info('n_params: {}'.format(n_params))
    if run_config['use_gpu']:
        model = nn.DataParallel(model)
        model.cuda()
    logger.info('Done')

    if config['data_config']['use_mixup']:
        train_criterion = CrossEntropyLoss(size_average=True)
    else:
        train_criterion = nn.CrossEntropyLoss(size_average=True)
    test_criterion = nn.CrossEntropyLoss(size_average=True)

    # Create optimizer.
    optim_config['steps_per_epoch'] = len(train_loader)
    optimizer, scheduler = create_optimizer(model.parameters(), optim_config)

    # Run a test pass before training starts.
    if run_config['test_first']:
        test(0, model, test_criterion, test_loader, run_config, writer)

    state = {
        'config': config,
        'state_dict': None,
        'optimizer': None,
        'epoch': 0,
        'accuracy': 0,
        'best_accuracy': 0,
        'best_epoch': 0,
    }
    for epoch in range(1, optim_config['epochs'] + 1):
        # Train.
        dfrow = train(epoch, model, optimizer, scheduler, train_criterion,
                      train_loader, config, writer, moment_dict)
        # Test.
        test_loss, accuracy = test(epoch, model, test_criterion, test_loader,
                                   run_config, writer)
        dfrow.append(test_loss)
        dfrow.append(accuracy)
        if epoch <= 4 or epoch % 5 == 0:
            resultsdf.loc[resultsdf.shape[0]] = list(dfrow)
            resultsdf.to_csv(os.path.join(outdir, 'results.csv'))

        # Update the state dictionary and save a checkpoint.
        state = update_state(state, epoch, accuracy, model, optimizer)
        save_checkpoint(state, outdir)

    if run_config['tensorboard']:
        outpath = os.path.join(outdir, 'all_scalars.json')
        writer.export_scalars_to_json(outpath)
valid_dataset = ConcatDataset([valid_dataset, flipped_valid_dataset])

train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=args.batch_size,
                              num_workers=args.dataload_workers_nums,
                              drop_last=True)
valid_dataloader = DataLoader(valid_dataset,
                              shuffle=False,
                              batch_size=args.batch_size,
                              num_workers=args.dataload_workers_nums)

# A name used to save checkpoints etc.
full_name = '%s_%s_%s_%s_bs%d_lr%.1e_wd%.1e' % (
    args.model, args.data_fold, args.optim, args.lr_scheduler,
    args.batch_size, args.learning_rate, args.weight_decay)
if args.comment:
    full_name = '%s_%s' % (full_name, args.comment)

model = models.create(args.model, basenet=args.basenet,
                      pretrained=args.pretrained)
model, optimizer = create_optimizer(model, args.optim, args.learning_rate,
                                    args.weight_decay, momentum=0.9,
                                    fp16_loss_scale=args.fp16_loss_scale,
                                    device=device)
lr_scheduler = create_lr_scheduler(optimizer, **vars(args))

start_timestamp = int(time.time() * 1000)
start_epoch = 0
best_loss = 1e10
best_metric = 0
best_accuracy = 0
global_step = 0

if args.resume:
    print("resuming a checkpoint '%s'" % args.resume)
    if os.path.exists(args.resume):
        saved_checkpoint = torch.load(args.resume)
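create_lr_scheduler(optimizer, **vars(args)) receives the entire argparse namespace and presumably dispatches on args.lr_scheduler. A minimal sketch under that assumption (the keyword names other than lr_scheduler are hypothetical; the real factory is not shown):

import torch

def create_lr_scheduler(optimizer, lr_scheduler='plateau',
                        lr_scheduler_gamma=0.1, lr_scheduler_patience=5,
                        lr_scheduler_step_size=30, **unused_args):
    # Swallow the rest of the namespace so the factory can be called as
    # create_lr_scheduler(optimizer, **vars(args)).
    if lr_scheduler == 'plateau':
        return torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, factor=lr_scheduler_gamma,
            patience=lr_scheduler_patience)
    if lr_scheduler == 'step':
        return torch.optim.lr_scheduler.StepLR(
            optimizer, step_size=lr_scheduler_step_size,
            gamma=lr_scheduler_gamma)
    raise ValueError('unknown lr_scheduler: {}'.format(lr_scheduler))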
                if 'bn' not in name
            ]
        },
        {
            'params': [
                param for name, param in model.named_parameters()
                if 'bn' in name
            ],
            'weight_decay': 0
        },
    ]
else:
    params = model.parameters()

optim_config['steps_per_epoch'] = len(train_loader)
optimizer, scheduler = utils.create_optimizer(params, optim_config)

# For mixed-precision training.
amp_handle = apex.amp.init(
    enabled=run_config['use_amp']) if is_apex_available else None

# Run a test pass before training starts.
if run_config['test_first']:
    test(0, model, test_criterion, test_loader, run_config, writer)

state = {
    'config': config,
    'state_dict': None,
    'optimizer': None,
    'epoch': 0,
    'accuracy': 0,
def main():
    args.exp_dir = os.path.join(
        args.exp_dir, datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
    setup_logger(args.exp_dir)
    if args.local_rank == 0:
        logging.info(args)

    use_gpu = False
    if args.gpu_devices is not None and torch.cuda.is_available():
        use_gpu = True
    if use_gpu and args.local_rank == 0:
        logging.info('Currently using GPU: {}'.format(args.gpu_devices))
    elif not use_gpu and args.local_rank == 0:
        logging.info('Currently using CPU')
    set_seeds(args.seed, use_gpu)

    model = SPOS(args.in_channel_list, args.num_layer_list, args.best_arch,
                 args.num_classes)
    if args.local_rank == 0:
        logging.info('Model size: {:.2f}M'.format(calc_params(model) / 1e6))

    if use_gpu:
        if args.distributed:
            torch.cuda.set_device(args.local_rank)
            torch.distributed.init_process_group(backend='nccl',
                                                 init_method='env://')
            args.world_size = torch.distributed.get_world_size()
            logging.info('Training in distributed mode (process {}/{})'.format(
                args.local_rank + 1, args.world_size))
            model = DDP(model.cuda(), delay_allreduce=True)
        else:
            model = nn.DataParallel(model).cuda()

    train_dataset, train_loader = create_dataset(args.train_dir,
                                                 args.batch_size, use_gpu,
                                                 args.distributed,
                                                 is_training=True)
    val_dataset, val_loader = create_dataset(args.val_dir,
                                             args.batch_size * 4, use_gpu,
                                             args.distributed)

    args.num_sched_steps = len(train_loader) * args.total_epochs
    args.num_warmup_steps = int(args.num_sched_steps * args.warmup_proportion)
    optimizer = create_optimizer(model, args.optim_type, args.lr,
                                 args.weight_decay, args.momentum)
    criterion = create_criterion(args.num_classes, args.label_smooth)
    scheduler = create_scheduler(optimizer, args.sched_type,
                                 args.num_sched_steps, args.num_warmup_steps)
    criterion = criterion.cuda() if use_gpu else criterion
    optim_tools = [optimizer, criterion, scheduler]

    if args.resume_path:
        if os.path.exists(args.resume_path):
            checkpoint = torch.load(args.resume_path, map_location='cpu')
            try:
                model.load_state_dict(checkpoint['state_dict'])
            except RuntimeError:
                # The checkpoint was saved from a DataParallel/DDP-wrapped
                # model; strip the 'module.' prefix before loading.
                new_state_dict = OrderedDict()
                for k, v in checkpoint['state_dict'].items():
                    name = k[7:]
                    new_state_dict[name] = v
                model.load_state_dict(new_state_dict)

            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

            if args.local_rank == 0:
                logging.info('Loaded checkpoint from \'{}\''.format(
                    args.resume_path))
                logging.info(
                    'Start epoch: {}\tPrec@1: {:.2f}%\tPrec@5: {:.2f}%'.format(
                        args.start_epoch, checkpoint['prec1'],
                        checkpoint['prec5']))
        else:
            if args.local_rank == 0:
                logging.info('No checkpoint found in \'{}\''.format(
                    args.resume_path))

    try:
        train(model, optim_tools, train_loader, val_loader, use_gpu)
    except KeyboardInterrupt:
        print('Keyboard interrupt (process {}/{})'.format(
            args.local_rank + 1, args.world_size))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str)
    args = parser.parse_args()

    with open(args.config, 'r') as f_yaml:
        # yaml.load without an explicit Loader is deprecated; safe_load is
        # sufficient for a plain config file.
        config = yaml.safe_load(f_yaml)

    outdir = pathlib.Path(config['outdir'])
    outdir.mkdir(exist_ok=True, parents=True)

    shutil.copyfile(args.config, str(outdir) + '/config.yaml')
    zipf = zipfile.ZipFile(
        str(outdir) + '/codes.zip', 'w', zipfile.ZIP_DEFLATED)
    zipdir('./', zipf)
    zipf.close()

    f_results = open(str(outdir) + '/results.txt', 'w', buffering=1)

    seed = config['seed']
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    epoch_seeds = np.random.randint(np.iinfo(np.int32).max // 2,
                                    size=config['epochs'])

    train_loader, test_loader = get_loader(config)

    module = importlib.import_module('models.{}'.format(config['arch']))
    Network = getattr(module, 'Network')
    model = Network(config)
    n_params = sum([param.view(-1).size()[0] for param in model.parameters()])
    print('n_params: {}'.format(n_params))

    device = torch.device(config['device'])
    if config['load_weights']:
        state_dict = torch.load(config['model_path'])['state_dict']
        model.load_state_dict(state_dict, strict=True)
    if device.type == 'cuda' and torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)

    train_criterion = nn.CrossEntropyLoss(reduction='mean')
    test_criterion = nn.CrossEntropyLoss(reduction='mean')

    # params = model.parameters()
    # Exclude batch-norm parameters from weight decay.
    params = [
        {
            'params': [
                param for name, param in model.named_parameters()
                if 'bn' not in name
            ]
        },
        {
            'params': [
                param for name, param in model.named_parameters()
                if 'bn' in name
            ],
            'weight_decay': 0
        },
    ]

    counter = 0
    config['steps_per_epoch'] = len(train_loader)
    optimizer, scheduler = utils.create_optimizer(params, config)

    voxceleb2_val = open('scp/voxceleb2_val.txt', 'r').readlines()

    # Run a test pass before training starts.
    if config['test_first']:
        test(0, model, voxceleb2_val, test_loader, config, f_results)

    state = {
        'config': config,
        'state_dict': None,
        'optimizer': None,
        'epoch': 0,
        'val_eer': 50.,
        'best_val_eer': 50.,
        'best_epoch': 0,
    }
    epoch_logs = []
    for epoch, seed in zip(range(1, config['epochs'] + 1), epoch_seeds):
        np.random.seed(seed)
        train_log = train(epoch, model, optimizer, scheduler, train_criterion,
                          train_loader, config, f_results)
        epoch_log = train_log.copy()
        val_eer = test(epoch, model, voxceleb2_val, test_loader, config,
                       f_results)
        epoch_logs.append(epoch_log)

        # Update the state dictionary and save a checkpoint.
        state = update_state(state, epoch, val_eer, model, optimizer)
        utils.save_checkpoint(state, outdir)
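Several of the PyTorch scripts above call a utils.create_optimizer(params, config) that returns an (optimizer, scheduler) pair after recording config['steps_per_epoch']. That bookkeeping suggests a per-batch schedule. A minimal sketch under that assumption (the config key names are guesses, not the original project's):

import torch

def create_optimizer(params, config):
    # Hypothetical version of the (optimizer, scheduler) factory used above.
    optimizer = torch.optim.SGD(
        params,
        lr=config['base_lr'],
        momentum=config.get('momentum', 0.9),
        weight_decay=config.get('weight_decay', 0.0),
        nesterov=config.get('nesterov', False))
    # Cosine annealing over the whole run, stepped once per batch, which is
    # why the callers set config['steps_per_epoch'] before this call.
    total_steps = config['steps_per_epoch'] * config['epochs']
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=total_steps)
    return optimizer, scheduler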
def init_network(self):
    input_shape = self.feedback_size + (self.num_frames,)
    worker_device = "/job:worker/task:{}/cpu:0".format(self.agent_index)

    with tf.device(
            tf.train.replica_device_setter(1, worker_device=worker_device)):
        with tf.variable_scope("global"):
            if self.use_lstm is False:
                self.shared_network = FFPolicy(input_shape, len(self.actions),
                                               self.network_type)
            else:
                self.shared_network = LSTMPolicy(input_shape,
                                                 len(self.actions),
                                                 self.network_type)
            self.global_step = tf.get_variable(
                "global_step",
                shape=[],
                initializer=tf.constant_initializer(0, dtype=tf.int32),
                trainable=False,
                dtype=tf.int32)
            self.best_score = tf.get_variable(
                "best_score",
                shape=[],
                initializer=tf.constant_initializer(-1e2, dtype=tf.float32),
                trainable=False,
                dtype=tf.float32)

    with tf.device(worker_device):
        with tf.variable_scope('local'):
            if self.use_lstm is False:
                self.network = FFPolicy(input_shape, len(self.actions),
                                        self.network_type)
            else:
                self.network = LSTMPolicy(input_shape, len(self.actions),
                                          self.network_type)

            # Sync params.
            self.update_local_ops = update_target_graph(
                self.shared_network.vars, self.network.vars)

            # Learning rate.
            self.lr = tf.get_variable(
                name='lr',
                shape=[],
                initializer=tf.constant_initializer(self.learning_rate),
                trainable=False,
                dtype=tf.float32)
            self.t_lr = tf.placeholder(dtype=tf.float32, shape=[],
                                       name='new_lr')
            self.assign_lr_op = tf.assign(self.lr, self.t_lr)

            # Best score.
            self.t_score = tf.placeholder(dtype=tf.float32, shape=[],
                                          name='new_score')
            self.assign_best_score_op = tf.assign(self.best_score,
                                                  self.t_score)

            # Build gradient op.
            self.increase_step = self.global_step.assign_add(1)
            gradients = self.network.build_gradient_op(clip_grad=40.0)

    # Additional summaries.
    tf.summary.scalar("learning_rate", self.lr, collections=['a3c'])
    tf.summary.scalar("score", self.t_score, collections=['a3c'])
    tf.summary.scalar("best_score", self.best_score, collections=['a3c'])
    self.summary_op = tf.summary.merge_all('a3c')

    if self.shared_optimizer:
        with tf.device(
                tf.train.replica_device_setter(1,
                                               worker_device=worker_device)):
            with tf.variable_scope("global"):
                optimizer = create_optimizer(self.update_method, self.lr,
                                             self.rho, self.rmsprop_epsilon)
                self.train_op = optimizer.apply_gradients(
                    zip(gradients, self.shared_network.vars))
    else:
        with tf.device(worker_device):
            with tf.variable_scope('local'):
                optimizer = create_optimizer(self.update_method, self.lr,
                                             self.rho, self.rmsprop_epsilon)
                self.train_op = optimizer.apply_gradients(
                    zip(gradients, self.shared_network.vars))
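Here create_optimizer takes an update method plus RMSProp hyperparameters (self.rho, self.rmsprop_epsilon). A minimal TF1-style sketch consistent with those argument names (the real helper is not shown, and the accepted method names are assumptions):

def create_optimizer(update_method, lr, rho, epsilon):
    # Hypothetical factory consistent with the A3C call sites above.
    if update_method == 'rmsprop':
        return tf.train.RMSPropOptimizer(learning_rate=lr, decay=rho,
                                         epsilon=epsilon)
    if update_method == 'adam':
        return tf.train.AdamOptimizer(learning_rate=lr, epsilon=epsilon)
    return tf.train.GradientDescentOptimizer(learning_rate=lr)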
x = base_model.output
x = GlobalAveragePooling2D()(x)
# Add a fully-connected layer.
x = Dense(1024, activation='relu')(x)
# And a logistic layer -- let's say we have 200 classes.
predictions = Dense(num_classes, activation='softmax')(x)

# This is the model we will train.
model = Model(inputs=base_model.input, outputs=predictions)

# To train only the top layers, uncomment the following two lines:
# for layer in base_model.layers:
#     layer.trainable = False

# Configure the optimizer.
optimizer = utils.create_optimizer(opt, learning_rate, momentum)

# Compile the model (should be done after setting layers to non-trainable).
model.compile(
    optimizer=optimizer,
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)

# Train the model: define the callback functions first.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
)
reduce_lr = ReduceLROnPlateau(
def main(argv): del argv # unused arg tf.io.gfile.makedirs(FLAGS.output_dir) logging.info('Saving checkpoints at %s', FLAGS.output_dir) tf.random.set_seed(FLAGS.seed) if FLAGS.use_gpu: logging.info('Use GPU') strategy = tf.distribute.MirroredStrategy() else: logging.info('Use TPU at %s', FLAGS.tpu if FLAGS.tpu is not None else 'local') resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu) tf.config.experimental_connect_to_cluster(resolver) tf.tpu.experimental.initialize_tpu_system(resolver) strategy = tf.distribute.TPUStrategy(resolver) batch_size = FLAGS.per_core_batch_size * FLAGS.num_cores test_batch_size = batch_size data_buffer_size = batch_size * 10 train_dataset_builder = ds.WikipediaToxicityDataset( split='train', data_dir=FLAGS.in_dataset_dir, shuffle_buffer_size=data_buffer_size) ind_dataset_builder = ds.WikipediaToxicityDataset( split='test', data_dir=FLAGS.in_dataset_dir, shuffle_buffer_size=data_buffer_size) ood_dataset_builder = ds.CivilCommentsDataset( split='test', data_dir=FLAGS.ood_dataset_dir, shuffle_buffer_size=data_buffer_size) ood_identity_dataset_builder = ds.CivilCommentsIdentitiesDataset( split='test', data_dir=FLAGS.identity_dataset_dir, shuffle_buffer_size=data_buffer_size) train_dataset_builders = { 'wikipedia_toxicity_subtypes': train_dataset_builder } test_dataset_builders = { 'ind': ind_dataset_builder, 'ood': ood_dataset_builder, 'ood_identity': ood_identity_dataset_builder, } if FLAGS.prediction_mode and FLAGS.identity_prediction: for dataset_name in utils.IDENTITY_LABELS: if utils.NUM_EXAMPLES[dataset_name]['test'] > 100: test_dataset_builders[dataset_name] = ds.CivilCommentsIdentitiesDataset( split='test', data_dir=os.path.join( FLAGS.identity_specific_dataset_dir, dataset_name), shuffle_buffer_size=data_buffer_size) for dataset_name in utils.IDENTITY_TYPES: if utils.NUM_EXAMPLES[dataset_name]['test'] > 100: test_dataset_builders[dataset_name] = ds.CivilCommentsIdentitiesDataset( split='test', data_dir=os.path.join( FLAGS.identity_type_dataset_dir, dataset_name), shuffle_buffer_size=data_buffer_size) class_weight = utils.create_class_weight( train_dataset_builders, test_dataset_builders) logging.info('class_weight: %s', str(class_weight)) ds_info = train_dataset_builder.tfds_info # Positive and negative classes. num_classes = ds_info.metadata['num_classes'] train_datasets = {} dataset_steps_per_epoch = {} total_steps_per_epoch = 0 # TODO(jereliu): Apply strategy.experimental_distribute_dataset to the # dataset_builders. 
for dataset_name, dataset_builder in train_dataset_builders.items(): train_datasets[dataset_name] = dataset_builder.load( batch_size=FLAGS.per_core_batch_size) dataset_steps_per_epoch[dataset_name] = ( dataset_builder.num_examples // batch_size) total_steps_per_epoch += dataset_steps_per_epoch[dataset_name] test_datasets = {} steps_per_eval = {} for dataset_name, dataset_builder in test_dataset_builders.items(): test_datasets[dataset_name] = dataset_builder.load( batch_size=test_batch_size) if dataset_name in ['ind', 'ood', 'ood_identity']: steps_per_eval[dataset_name] = ( dataset_builder.num_examples // test_batch_size) else: steps_per_eval[dataset_name] = ( utils.NUM_EXAMPLES[dataset_name]['test'] // test_batch_size) if FLAGS.use_bfloat16: policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16') tf.keras.mixed_precision.experimental.set_policy(policy) summary_writer = tf.summary.create_file_writer( os.path.join(FLAGS.output_dir, 'summaries')) with strategy.scope(): logging.info('Building BERT %s model', FLAGS.bert_model_type) logging.info('use_gp_layer=%s', FLAGS.use_gp_layer) logging.info('use_spec_norm_att=%s', FLAGS.use_spec_norm_att) logging.info('use_spec_norm_ffn=%s', FLAGS.use_spec_norm_ffn) logging.info('use_layer_norm_att=%s', FLAGS.use_layer_norm_att) logging.info('use_layer_norm_ffn=%s', FLAGS.use_layer_norm_ffn) bert_config_dir, bert_ckpt_dir = utils.resolve_bert_ckpt_and_config_dir( FLAGS.bert_model_type, FLAGS.bert_dir, FLAGS.bert_config_dir, FLAGS.bert_ckpt_dir) bert_config = utils.create_config(bert_config_dir) gp_layer_kwargs = dict( num_inducing=FLAGS.gp_hidden_dim, gp_kernel_scale=FLAGS.gp_scale, gp_output_bias=FLAGS.gp_bias, normalize_input=FLAGS.gp_input_normalization, gp_cov_momentum=FLAGS.gp_cov_discount_factor, gp_cov_ridge_penalty=FLAGS.gp_cov_ridge_penalty) spec_norm_kwargs = dict( iteration=FLAGS.spec_norm_iteration, norm_multiplier=FLAGS.spec_norm_bound) model, bert_encoder = ub.models.SngpBertBuilder( num_classes=num_classes, bert_config=bert_config, gp_layer_kwargs=gp_layer_kwargs, spec_norm_kwargs=spec_norm_kwargs, use_gp_layer=FLAGS.use_gp_layer, use_spec_norm_att=FLAGS.use_spec_norm_att, use_spec_norm_ffn=FLAGS.use_spec_norm_ffn, use_layer_norm_att=FLAGS.use_layer_norm_att, use_layer_norm_ffn=FLAGS.use_layer_norm_ffn, use_spec_norm_plr=FLAGS.use_spec_norm_plr) # Create an AdamW optimizer with beta_2=0.999, epsilon=1e-6. 
optimizer = utils.create_optimizer( FLAGS.base_learning_rate, steps_per_epoch=total_steps_per_epoch, epochs=FLAGS.train_epochs, warmup_proportion=FLAGS.warmup_proportion, beta_1=1.0 - FLAGS.one_minus_momentum) logging.info('Model input shape: %s', model.input_shape) logging.info('Model output shape: %s', model.output_shape) logging.info('Model number of weights: %s', model.count_params()) metrics = { 'train/negative_log_likelihood': tf.keras.metrics.Mean(), 'train/accuracy': tf.keras.metrics.Accuracy(), 'train/accuracy_weighted': tf.keras.metrics.Accuracy(), 'train/auroc': tf.keras.metrics.AUC(), 'train/loss': tf.keras.metrics.Mean(), 'train/ece': rm.metrics.ExpectedCalibrationError( num_bins=FLAGS.num_bins), 'train/precision': tf.keras.metrics.Precision(), 'train/recall': tf.keras.metrics.Recall(), 'train/f1': tfa_metrics.F1Score( num_classes=num_classes, average='micro', threshold=FLAGS.ece_label_threshold), } checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer) if FLAGS.prediction_mode: latest_checkpoint = tf.train.latest_checkpoint(FLAGS.eval_checkpoint_dir) else: latest_checkpoint = tf.train.latest_checkpoint(FLAGS.output_dir) initial_epoch = 0 if latest_checkpoint: # checkpoint.restore must be within a strategy.scope() so that optimizer # slot variables are mirrored. checkpoint.restore(latest_checkpoint) logging.info('Loaded checkpoint %s', latest_checkpoint) initial_epoch = optimizer.iterations.numpy() // total_steps_per_epoch else: # load BERT from initial checkpoint bert_encoder, _, _ = utils.load_bert_weight_from_ckpt( bert_model=bert_encoder, bert_ckpt_dir=bert_ckpt_dir, repl_patterns=ub.models.bert_sngp.CHECKPOINT_REPL_PATTERNS) logging.info('Loaded BERT checkpoint %s', bert_ckpt_dir) metrics.update({ 'test/negative_log_likelihood': tf.keras.metrics.Mean(), 'test/auroc': tf.keras.metrics.AUC(curve='ROC'), 'test/aupr': tf.keras.metrics.AUC(curve='PR'), 'test/brier': tf.keras.metrics.MeanSquaredError(), 'test/brier_weighted': tf.keras.metrics.MeanSquaredError(), 'test/ece': rm.metrics.ExpectedCalibrationError(num_bins=FLAGS.num_bins), 'test/acc': tf.keras.metrics.Accuracy(), 'test/acc_weighted': tf.keras.metrics.Accuracy(), 'test/eval_time': tf.keras.metrics.Mean(), 'test/stddev': tf.keras.metrics.Mean(), 'test/precision': tf.keras.metrics.Precision(), 'test/recall': tf.keras.metrics.Recall(), 'test/f1': tfa_metrics.F1Score( num_classes=num_classes, average='micro', threshold=FLAGS.ece_label_threshold), 'test/calibration_auroc': tc_metrics.CalibrationAUC(curve='ROC'), 'test/calibration_auprc': tc_metrics.CalibrationAUC(curve='PR') }) for fraction in FLAGS.fractions: metrics.update({ 'test_collab_acc/collab_acc_{}'.format(fraction): rm.metrics.OracleCollaborativeAccuracy( fraction=float(fraction), num_bins=FLAGS.num_bins) }) metrics.update({ 'test_abstain_prec/abstain_prec_{}'.format(fraction): tc_metrics.AbstainPrecision(abstain_fraction=float(fraction)) }) metrics.update({ 'test_abstain_recall/abstain_recall_{}'.format(fraction): tc_metrics.AbstainRecall(abstain_fraction=float(fraction)) }) for dataset_name, test_dataset in test_datasets.items(): if dataset_name != 'ind': metrics.update({ 'test/nll_{}'.format(dataset_name): tf.keras.metrics.Mean(), 'test/auroc_{}'.format(dataset_name): tf.keras.metrics.AUC(curve='ROC'), 'test/aupr_{}'.format(dataset_name): tf.keras.metrics.AUC(curve='PR'), 'test/brier_{}'.format(dataset_name): tf.keras.metrics.MeanSquaredError(), 'test/brier_weighted_{}'.format(dataset_name): tf.keras.metrics.MeanSquaredError(), 
'test/ece_{}'.format(dataset_name): rm.metrics.ExpectedCalibrationError(num_bins=FLAGS.num_bins), 'test/acc_{}'.format(dataset_name): tf.keras.metrics.Accuracy(), 'test/acc_weighted_{}'.format(dataset_name): tf.keras.metrics.Accuracy(), 'test/eval_time_{}'.format(dataset_name): tf.keras.metrics.Mean(), 'test/stddev_{}'.format(dataset_name): tf.keras.metrics.Mean(), 'test/precision_{}'.format(dataset_name): tf.keras.metrics.Precision(), 'test/recall_{}'.format(dataset_name): tf.keras.metrics.Recall(), 'test/f1_{}'.format(dataset_name): tfa_metrics.F1Score( num_classes=num_classes, average='micro', threshold=FLAGS.ece_label_threshold), 'test/calibration_auroc_{}'.format(dataset_name): tc_metrics.CalibrationAUC(curve='ROC'), 'test/calibration_auprc_{}'.format(dataset_name): tc_metrics.CalibrationAUC(curve='PR'), }) for fraction in FLAGS.fractions: metrics.update({ 'test_collab_acc/collab_acc_{}_{}'.format(fraction, dataset_name): rm.metrics.OracleCollaborativeAccuracy( fraction=float(fraction), num_bins=FLAGS.num_bins) }) metrics.update({ 'test_abstain_prec/abstain_prec_{}_{}'.format( fraction, dataset_name): tc_metrics.AbstainPrecision(abstain_fraction=float(fraction)) }) metrics.update({ 'test_abstain_recall/abstain_recall_{}_{}'.format( fraction, dataset_name): tc_metrics.AbstainRecall(abstain_fraction=float(fraction)) }) @tf.function def generate_sample_weight(labels, class_weight, label_threshold=0.7): """Generate sample weight for weighted accuracy calculation.""" if label_threshold != 0.7: logging.warning('The class weight was based on `label_threshold` = 0.7, ' 'and weighted accuracy/brier will be meaningless if ' '`label_threshold` is not equal to this value, which is ' 'recommended by Jigsaw Conversation AI team.') labels_int = tf.cast(labels > label_threshold, tf.int32) sample_weight = tf.gather(class_weight, labels_int) return sample_weight @tf.function def train_step(iterator, dataset_name, num_steps): """Training StepFn.""" def step_fn(inputs): """Per-Replica StepFn.""" features, labels, _ = utils.create_feature_and_label(inputs) with tf.GradientTape() as tape: logits = model(features, training=True) if isinstance(logits, (list, tuple)): # If model returns a tuple of (logits, covmat), extract logits logits, _ = logits if FLAGS.use_bfloat16: logits = tf.cast(logits, tf.float32) loss_logits = tf.squeeze(logits, axis=1) if FLAGS.loss_type == 'cross_entropy': logging.info('Using cross entropy loss') negative_log_likelihood = tf.nn.sigmoid_cross_entropy_with_logits( labels, loss_logits) elif FLAGS.loss_type == 'focal_cross_entropy': logging.info('Using focal cross entropy loss') negative_log_likelihood = tfa_losses.sigmoid_focal_crossentropy( labels, loss_logits, alpha=FLAGS.focal_loss_alpha, gamma=FLAGS.focal_loss_gamma, from_logits=True) elif FLAGS.loss_type == 'mse': logging.info('Using mean squared error loss') loss_probs = tf.nn.sigmoid(loss_logits) negative_log_likelihood = tf.keras.losses.mean_squared_error( labels, loss_probs) elif FLAGS.loss_type == 'mae': logging.info('Using mean absolute error loss') loss_probs = tf.nn.sigmoid(loss_logits) negative_log_likelihood = tf.keras.losses.mean_absolute_error( labels, loss_probs) negative_log_likelihood = tf.reduce_mean(negative_log_likelihood) l2_loss = sum(model.losses) loss = negative_log_likelihood + l2_loss # Scale the loss given the TPUStrategy will reduce sum all gradients. 
      scaled_loss = loss / strategy.num_replicas_in_sync

    grads = tape.gradient(scaled_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    probs = tf.nn.sigmoid(logits)
    # Cast labels to discrete for ECE computation.
    ece_labels = tf.cast(labels > FLAGS.ece_label_threshold, tf.float32)
    one_hot_labels = tf.one_hot(tf.cast(ece_labels, tf.int32),
                                depth=num_classes)
    ece_probs = tf.concat([1. - probs, probs], axis=1)
    auc_probs = tf.squeeze(probs, axis=1)
    pred_labels = tf.math.argmax(ece_probs, axis=-1)

    sample_weight = generate_sample_weight(
        labels, class_weight['train/{}'.format(dataset_name)],
        FLAGS.ece_label_threshold)
    metrics['train/negative_log_likelihood'].update_state(
        negative_log_likelihood)
    metrics['train/accuracy'].update_state(labels, pred_labels)
    metrics['train/accuracy_weighted'].update_state(
        ece_labels, pred_labels, sample_weight=sample_weight)
    metrics['train/auroc'].update_state(labels, auc_probs)
    metrics['train/loss'].update_state(loss)
    metrics['train/ece'].add_batch(ece_probs, label=ece_labels)
    metrics['train/precision'].update_state(ece_labels, pred_labels)
    metrics['train/recall'].update_state(ece_labels, pred_labels)
    metrics['train/f1'].update_state(one_hot_labels, ece_probs)

  for _ in tf.range(tf.cast(num_steps, tf.int32)):
    strategy.run(step_fn, args=(next(iterator),))

@tf.function
def test_step(iterator, dataset_name):
  """Evaluation StepFn."""

  def step_fn(inputs):
    """Per-Replica StepFn."""
    features, labels, _ = utils.create_feature_and_label(inputs)

    eval_start_time = time.time()
    # Compute ensemble prediction over Monte Carlo forward-pass samples.
    logits_list = []
    stddev_list = []
    for _ in range(FLAGS.num_mc_samples):
      logits = model(features, training=False)
      if isinstance(logits, (list, tuple)):
        # If model returns a tuple of (logits, covmat), extract both.
        logits, covmat = logits
      else:
        covmat = tf.eye(test_batch_size)

      if FLAGS.use_bfloat16:
        logits = tf.cast(logits, tf.float32)
        covmat = tf.cast(covmat, tf.float32)

      logits = ed.layers.utils.mean_field_logits(
          logits, covmat, mean_field_factor=FLAGS.gp_mean_field_factor)
      stddev = tf.sqrt(tf.linalg.diag_part(covmat))

      logits_list.append(logits)
      stddev_list.append(stddev)

    eval_time = (time.time() - eval_start_time) / FLAGS.per_core_batch_size
    # Logits dimension is (num_samples, batch_size, num_classes).
    logits_list = tf.stack(logits_list, axis=0)
    stddev_list = tf.stack(stddev_list, axis=0)

    stddev = tf.reduce_mean(stddev_list, axis=0)
    probs_list = tf.nn.sigmoid(logits_list)
    probs = tf.reduce_mean(probs_list, axis=0)
    # Cast labels to discrete for ECE computation.
    ece_labels = tf.cast(labels > FLAGS.ece_label_threshold, tf.float32)
    one_hot_labels = tf.one_hot(tf.cast(ece_labels, tf.int32),
                                depth=num_classes)
    ece_probs = tf.concat([1. - probs, probs], axis=1)
    pred_labels = tf.math.argmax(ece_probs, axis=-1)
    auc_probs = tf.squeeze(probs, axis=1)

    # Use normalized binary predictive variance as the confidence score.
    # Since the prediction variance p*(1-p) is within range (0, 0.25),
    # normalize it by maximum value so the confidence is between (0, 1).
    calib_confidence = 1. - probs * (1. - probs) / .25
    ce = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.broadcast_to(labels,
                               [FLAGS.num_mc_samples, labels.shape[0]]),
        logits=tf.squeeze(logits_list, axis=-1))
    negative_log_likelihood = -tf.reduce_logsumexp(
        -ce, axis=0) + tf.math.log(float(FLAGS.num_mc_samples))
    negative_log_likelihood = tf.reduce_mean(negative_log_likelihood)

    sample_weight = generate_sample_weight(
        labels, class_weight['test/{}'.format(dataset_name)],
        FLAGS.ece_label_threshold)
    if dataset_name == 'ind':
      metrics['test/negative_log_likelihood'].update_state(
          negative_log_likelihood)
      metrics['test/auroc'].update_state(labels, auc_probs)
      metrics['test/aupr'].update_state(labels, auc_probs)
      metrics['test/brier'].update_state(labels, auc_probs)
      metrics['test/brier_weighted'].update_state(
          tf.expand_dims(labels, -1), probs, sample_weight=sample_weight)
      metrics['test/ece'].add_batch(ece_probs, label=ece_labels)
      metrics['test/acc'].update_state(ece_labels, pred_labels)
      metrics['test/acc_weighted'].update_state(
          ece_labels, pred_labels, sample_weight=sample_weight)
      metrics['test/eval_time'].update_state(eval_time)
      metrics['test/stddev'].update_state(stddev)
      metrics['test/precision'].update_state(ece_labels, pred_labels)
      metrics['test/recall'].update_state(ece_labels, pred_labels)
      metrics['test/f1'].update_state(one_hot_labels, ece_probs)
      metrics['test/calibration_auroc'].update_state(
          ece_labels, pred_labels, calib_confidence)
      metrics['test/calibration_auprc'].update_state(
          ece_labels, pred_labels, calib_confidence)

      for fraction in FLAGS.fractions:
        metrics['test_collab_acc/collab_acc_{}'.format(fraction)].add_batch(
            ece_probs, label=ece_labels)
        metrics['test_abstain_prec/abstain_prec_{}'.format(
            fraction)].update_state(ece_labels, pred_labels, calib_confidence)
        metrics['test_abstain_recall/abstain_recall_{}'.format(
            fraction)].update_state(ece_labels, pred_labels, calib_confidence)
    else:
      metrics['test/nll_{}'.format(dataset_name)].update_state(
          negative_log_likelihood)
      metrics['test/auroc_{}'.format(dataset_name)].update_state(
          labels, auc_probs)
      metrics['test/aupr_{}'.format(dataset_name)].update_state(
          labels, auc_probs)
      metrics['test/brier_{}'.format(dataset_name)].update_state(
          labels, auc_probs)
      metrics['test/brier_weighted_{}'.format(dataset_name)].update_state(
          tf.expand_dims(labels, -1), probs, sample_weight=sample_weight)
      metrics['test/ece_{}'.format(dataset_name)].add_batch(
          ece_probs, label=ece_labels)
      metrics['test/acc_{}'.format(dataset_name)].update_state(
          ece_labels, pred_labels)
      metrics['test/acc_weighted_{}'.format(dataset_name)].update_state(
          ece_labels, pred_labels, sample_weight=sample_weight)
      metrics['test/eval_time_{}'.format(dataset_name)].update_state(
          eval_time)
      metrics['test/stddev_{}'.format(dataset_name)].update_state(stddev)
      metrics['test/precision_{}'.format(dataset_name)].update_state(
          ece_labels, pred_labels)
      metrics['test/recall_{}'.format(dataset_name)].update_state(
          ece_labels, pred_labels)
      metrics['test/f1_{}'.format(dataset_name)].update_state(
          one_hot_labels, ece_probs)
      metrics['test/calibration_auroc_{}'.format(dataset_name)].update_state(
          ece_labels, pred_labels, calib_confidence)
      metrics['test/calibration_auprc_{}'.format(dataset_name)].update_state(
          ece_labels, pred_labels, calib_confidence)

      for fraction in FLAGS.fractions:
        metrics['test_collab_acc/collab_acc_{}_{}'.format(
            fraction, dataset_name)].add_batch(ece_probs, label=ece_labels)
        metrics['test_abstain_prec/abstain_prec_{}_{}'.format(
            fraction, dataset_name)].update_state(ece_labels, pred_labels,
                                                  calib_confidence)
        metrics['test_abstain_recall/abstain_recall_{}_{}'.format(
            fraction, dataset_name)].update_state(ece_labels, pred_labels,
                                                  calib_confidence)

  strategy.run(step_fn, args=(next(iterator),))

@tf.function
def final_eval_step(iterator):
  """Final Evaluation StepFn to save prediction to directory."""

  def step_fn(inputs):
    bert_features, labels, additional_labels = utils.create_feature_and_label(
        inputs)
    logits = model(bert_features, training=False)
    if isinstance(logits, (list, tuple)):
      # If model returns a tuple of (logits, covmat), extract both.
      logits, covmat = logits
    else:
      covmat = tf.eye(test_batch_size)

    if FLAGS.use_bfloat16:
      logits = tf.cast(logits, tf.float32)
      covmat = tf.cast(covmat, tf.float32)

    logits = ed.layers.utils.mean_field_logits(
        logits, covmat, mean_field_factor=FLAGS.gp_mean_field_factor)

    features = inputs['input_ids']
    return features, logits, labels, additional_labels

  (per_replica_texts, per_replica_logits, per_replica_labels,
   per_replica_additional_labels) = (
       strategy.run(step_fn, args=(next(iterator),)))

  if strategy.num_replicas_in_sync > 1:
    texts_list = tf.concat(per_replica_texts.values, axis=0)
    logits_list = tf.concat(per_replica_logits.values, axis=0)
    labels_list = tf.concat(per_replica_labels.values, axis=0)
    additional_labels_dict = {}
    for additional_label in utils.IDENTITY_LABELS:
      if additional_label in per_replica_additional_labels:
        additional_labels_dict[additional_label] = tf.concat(
            per_replica_additional_labels[additional_label], axis=0)
  else:
    texts_list = per_replica_texts
    logits_list = per_replica_logits
    labels_list = per_replica_labels
    additional_labels_dict = {}
    for additional_label in utils.IDENTITY_LABELS:
      if additional_label in per_replica_additional_labels:
        additional_labels_dict[
            additional_label] = per_replica_additional_labels[
                additional_label]

  return texts_list, logits_list, labels_list, additional_labels_dict

if FLAGS.prediction_mode:
  # Prediction and exit.
  for dataset_name, test_dataset in test_datasets.items():
    test_iterator = iter(test_dataset)  # pytype: disable=wrong-arg-types
    message = 'Final eval on dataset {}'.format(dataset_name)
    logging.info(message)
    texts_all = []
    logits_all = []
    labels_all = []
    additional_labels_all_dict = {}
    if 'identity' in dataset_name:
      for identity_label_name in utils.IDENTITY_LABELS:
        additional_labels_all_dict[identity_label_name] = []
    try:
      with tf.experimental.async_scope():
        for step in range(steps_per_eval[dataset_name]):
          if step % 20 == 0:
            message = 'Starting to run eval step {}/{} of dataset: {}'.format(
                step, steps_per_eval[dataset_name], dataset_name)
            logging.info(message)
          (text_step, logits_step, labels_step,
           additional_labels_dict_step) = final_eval_step(test_iterator)

          texts_all.append(text_step)
          logits_all.append(logits_step)
          labels_all.append(labels_step)
          if 'identity' in dataset_name:
            for identity_label_name in utils.IDENTITY_LABELS:
              additional_labels_all_dict[identity_label_name].append(
                  additional_labels_dict_step[identity_label_name])
    except (StopIteration, tf.errors.OutOfRangeError):
      tf.experimental.async_clear_error()
      logging.info('Done with eval on %s', dataset_name)

    texts_all = tf.concat(texts_all, axis=0)
    logits_all = tf.concat(logits_all, axis=0)
    labels_all = tf.concat(labels_all, axis=0)
    additional_labels_all = []
    if additional_labels_all_dict:
      for identity_label_name in utils.IDENTITY_LABELS:
        additional_labels_all.append(
            tf.concat(additional_labels_all_dict[identity_label_name],
                      axis=0))
    additional_labels_all = tf.convert_to_tensor(additional_labels_all)

    utils.save_prediction(
        texts_all.numpy(),
        path=os.path.join(FLAGS.output_dir, 'texts_{}'.format(dataset_name)))
    utils.save_prediction(
        labels_all.numpy(),
        path=os.path.join(FLAGS.output_dir, 'labels_{}'.format(dataset_name)))
    utils.save_prediction(
        logits_all.numpy(),
        path=os.path.join(FLAGS.output_dir, 'logits_{}'.format(dataset_name)))
    if 'identity' in dataset_name:
      utils.save_prediction(
          additional_labels_all.numpy(),
          path=os.path.join(FLAGS.output_dir,
                            'additional_labels_{}'.format(dataset_name)))
    logging.info('Done with testing on %s', dataset_name)
else:
  # Execute train / eval loop.
  start_time = time.time()
  train_iterators = {}
  for dataset_name, train_dataset in train_datasets.items():
    train_iterators[dataset_name] = iter(train_dataset)

  for epoch in range(initial_epoch, FLAGS.train_epochs):
    logging.info('Starting to run epoch: %s', epoch)

    for dataset_name, train_iterator in train_iterators.items():
      try:
        with tf.experimental.async_scope():
          train_step(train_iterator, dataset_name,
                     dataset_steps_per_epoch[dataset_name])

          current_step = (
              epoch * total_steps_per_epoch +
              dataset_steps_per_epoch[dataset_name])
          max_steps = total_steps_per_epoch * FLAGS.train_epochs
          time_elapsed = time.time() - start_time
          steps_per_sec = float(current_step) / time_elapsed
          eta_seconds = (max_steps - current_step) / steps_per_sec
          message = ('{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
                     'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                         current_step / max_steps, epoch + 1,
                         FLAGS.train_epochs, steps_per_sec, eta_seconds / 60,
                         time_elapsed / 60))
          logging.info(message)
      except (StopIteration, tf.errors.OutOfRangeError):
        tf.experimental.async_clear_error()
        logging.info('Done with training on %s', dataset_name)

    if epoch % FLAGS.evaluation_interval == 0:
      for dataset_name, test_dataset in test_datasets.items():
        test_iterator = iter(test_dataset)
        logging.info('Testing on dataset %s', dataset_name)

        try:
          with tf.experimental.async_scope():
            for step in range(steps_per_eval[dataset_name]):
              if step % 20 == 0:
                logging.info('Starting to run eval step %s/%s of epoch: %s',
                             step, steps_per_eval[dataset_name], epoch)
              test_step(test_iterator, dataset_name)
        except (StopIteration, tf.errors.OutOfRangeError):
          tf.experimental.async_clear_error()
          logging.info('Done with testing on %s', dataset_name)

    logging.info('Train Loss: %.4f, ECE: %.2f, Accuracy: %.2f',
                 metrics['train/loss'].result(),
                 metrics['train/ece'].result(),
                 metrics['train/accuracy'].result())

    total_results = {
        name: metric.result() for name, metric in metrics.items()
    }
    # Metrics from Robustness Metrics (like ECE) will return a dict with a
    # single key/value, instead of a scalar.
    total_results = {
        k: (list(v.values())[0] if isinstance(v, dict) else v)
        for k, v in total_results.items()
    }
    with summary_writer.as_default():
      for name, result in total_results.items():
        tf.summary.scalar(name, result, step=epoch + 1)

    for metric in metrics.values():
      metric.reset_states()

    checkpoint_interval = min(FLAGS.checkpoint_interval, FLAGS.train_epochs)
    if checkpoint_interval > 0 and (epoch + 1) % checkpoint_interval == 0:
      checkpoint_name = checkpoint.save(
          os.path.join(FLAGS.output_dir, 'checkpoint'))
      logging.info('Saved checkpoint to %s', checkpoint_name)

  # Save model in SavedModel format on exit.
  final_save_name = os.path.join(FLAGS.output_dir, 'model')
  model.save(final_save_name)
  logging.info('Saved model to %s', final_save_name)

with summary_writer.as_default():
  hp.hparams({
      'base_learning_rate': FLAGS.base_learning_rate,
      'one_minus_momentum': FLAGS.one_minus_momentum,
      'gp_mean_field_factor': FLAGS.gp_mean_field_factor,
  })
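
# The snippet above calls utils.create_optimizer(base_learning_rate,
# steps_per_epoch=..., epochs=..., warmup_proportion=..., beta_1=...) without
# showing its body. Below is a minimal sketch of what such a factory could
# look like, inferred only from the call site; it is an assumption, not the
# actual utils implementation: a linear-warmup schedule feeding a Keras Adam
# optimizer.
import tensorflow as tf


class LinearWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Hypothetical schedule: ramp the LR up linearly, then hold it constant."""

  def __init__(self, base_learning_rate, warmup_steps):
    super().__init__()
    self.base_learning_rate = base_learning_rate
    self.warmup_steps = max(int(warmup_steps), 1)

  def __call__(self, step):
    step = tf.cast(step, tf.float32)
    warmup_fraction = tf.minimum(step / float(self.warmup_steps), 1.0)
    return self.base_learning_rate * warmup_fraction

  def get_config(self):
    return {'base_learning_rate': self.base_learning_rate,
            'warmup_steps': self.warmup_steps}


def create_optimizer(base_learning_rate, steps_per_epoch, epochs,
                     warmup_proportion, beta_1=0.9):
  """Sketch of an optimizer factory matching the call site above."""
  total_steps = steps_per_epoch * epochs
  warmup_steps = int(warmup_proportion * total_steps)
  return tf.keras.optimizers.Adam(
      learning_rate=LinearWarmup(base_learning_rate, warmup_steps),
      beta_1=beta_1)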
def train(model,
          train_fn,
          validate_fn,
          bucketing_dl_xy,
          dev_data,
          vocab_src,
          vocab_tgt,
          config,
          cycle_iterate_dl_back=None):
    print("Training...")

    optimizer, scheduler = create_optimizer(model.parameters(), config)

    saved_epoch = 0
    patience_counter = 0
    max_bleu = 0.0

    num_batches = sum(1 for _ in iter(bucketing_dl_xy))

    checkpoints_path = "{}/{}/checkpoints".format(config["out_dir"],
                                                  config["session"])
    if os.path.exists(checkpoints_path):
        checkpoints = [
            cp for cp in sorted(os.listdir(checkpoints_path))
            if cp == config["session"]
        ]
        if checkpoints:
            state = torch.load('{}/{}'.format(checkpoints_path,
                                              checkpoints[-1]))
            saved_epoch = state['epoch']
            patience_counter = state['patience_counter']
            max_bleu = state['max_bleu']
            model.load_state_dict(state['state_dict'])
            optimizer.load_state_dict(state['optimizer'])
            scheduler.load_state_dict(state['scheduler'])
        else:
            init_model(model, vocab_src[PAD_TOKEN], vocab_tgt[PAD_TOKEN],
                       config)
    else:
        init_model(model, vocab_src[PAD_TOKEN], vocab_tgt[PAD_TOKEN], config)

    cycle_iterate_dl_xy = cycle(bucketing_dl_xy)

    device = (torch.device("cpu") if config["device"] == "cpu"
              else torch.device("cuda:0"))

    for epoch in range(saved_epoch, config["num_epochs"]):
        step = 0
        while step < num_batches:
            model.train()

            # Back-translation data
            if cycle_iterate_dl_back is not None:
                sentences_x, sentences_y = next(cycle_iterate_dl_back)
                loss = bilingual_step(model, sentences_x, sentences_y,
                                      train_fn, optimizer, vocab_src,
                                      vocab_tgt, config, step, device)
                print(
                    "Epoch: {:03d}/{:03d}, Batch {:05d}/{:05d}, Back-Loss: {:.2f}"
                    .format(epoch + 1, config["num_epochs"], step + 1,
                            num_batches, loss))
                # step += 1

            # Bilingual data
            sentences_x, sentences_y = next(cycle_iterate_dl_xy)
            loss = bilingual_step(model, sentences_x, sentences_y, train_fn,
                                  optimizer, vocab_src, vocab_tgt, config,
                                  step, device)
            print("Epoch: {:03d}/{:03d}, Batch {:05d}/{:05d}, xy-Loss: {:.2f}".
                  format(epoch + 1, config["num_epochs"], step + 1,
                         num_batches, loss))
            step += 1

        val_bleu = evaluate(model, validate_fn, dev_data, vocab_src,
                            vocab_tgt, epoch, config)
        scheduler.step(float(val_bleu))
        print("BLEU score: {}".format(val_bleu))

        if float(val_bleu) > max_bleu:
            max_bleu = float(val_bleu)
            patience_counter = 0

            # Save checkpoint
            if not os.path.exists(checkpoints_path):
                os.makedirs(checkpoints_path)
            state = {
                'epoch': epoch + 1,
                'patience_counter': patience_counter,
                'max_bleu': max_bleu,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()
            }
            torch.save(state,
                       '{}/{}'.format(checkpoints_path, config["session"]))
        else:
            patience_counter += 1
            if patience_counter >= config["patience"]:
                break
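
# create_optimizer here returns an (optimizer, scheduler) pair, and the loop
# above drives the scheduler with a BLEU score via scheduler.step(val_bleu).
# A minimal sketch consistent with that usage follows; the concrete optimizer
# type and the config keys "learning_rate", "lr_reduce_factor", and
# "lr_reduce_patience" are assumptions, not the project's actual code.
import torch


def create_optimizer(parameters, config):
    optimizer = torch.optim.Adam(parameters, lr=config["learning_rate"])
    # BLEU is a score to maximize, so the plateau scheduler runs in "max"
    # mode and lowers the LR when validation BLEU stops improving.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode="max",
        factor=config.get("lr_reduce_factor", 0.5),
        patience=config.get("lr_reduce_patience", 2))
    return optimizer, scheduler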
def setup_optimizer(self):
    # Note: the optimizer is created in models/RNN/utils.py.
    with tf.name_scope("train"):
        self.optimizer = create_optimizer()
        # minimize() returns the train op, which overwrites the optimizer
        # attribute here.
        self.optimizer = self.optimizer.minimize(self.avg_loss)
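
# The create_optimizer() referenced above lives in models/RNN/utils.py and is
# not shown. Since minimize() is called on its return value, it must produce a
# TF1-style optimizer; a hypothetical stand-in, with Adam and the learning
# rate chosen only for illustration:
import tensorflow.compat.v1 as tf


def create_optimizer(learning_rate=0.001):
    # Any tf.train optimizer would fit this call site; Adam is an assumption.
    return tf.train.AdamOptimizer(learning_rate=learning_rate)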
import argparse
import os

import utils

parser = argparse.ArgumentParser()
parser.add_argument('data_dir', action='store')
parser.add_argument('--save-dir', action='store', default=os.getcwd())
parser.add_argument('--arch', action='store', default='vgg13')
parser.add_argument('--learning_rate', action='store', default=0.002,
                    type=float)
parser.add_argument('--gpu', action='store', default=True)
parser.add_argument('--hidden_units', action='store', default=512, type=int)
parser.add_argument('--epochs', action='store', default=5, type=int)
parser.add_argument('--checkpoint', action='store')

args = parser.parse_args()
print(args)

train_dataset, train_dataloader = utils.load_train_data(args.data_dir +
                                                        "/train/")
valid_dataset, valid_dataloader = utils.load_valid_data(args.data_dir +
                                                        "/valid/")
test_dataset, test_dataloader = utils.load_test_data(args.data_dir + "/test/")

model = utils.create_network(args.arch, args.hidden_units, True)
criterion = utils.create_criterion()
optimizer = utils.create_optimizer(model, args.learning_rate)

model = utils.train_model(model, criterion, optimizer, train_dataloader,
                          valid_dataloader, args.epochs, args.gpu)

utils.save_checkpoint(model, args.arch, args.hidden_units, train_dataset,
                      args.save_dir)
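
# utils.create_optimizer(model, learning_rate) is not shown in this snippet.
# A plausible sketch for this transfer-learning setup follows; it assumes the
# network built by utils.create_network keeps torchvision's `classifier`
# attribute and that only that head should be trained.
import torch


def create_optimizer(model, learning_rate):
    # Optimize only the replacement classifier head; the pretrained
    # convolutional features stay frozen.
    return torch.optim.Adam(model.classifier.parameters(), lr=learning_rate)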