def main(argv):
  del argv  # unused arg
  tf.enable_v2_behavior()
  tf.io.gfile.makedirs(FLAGS.output_dir)
  logging.info('Saving checkpoints at %s', FLAGS.output_dir)
  tf.random.set_seed(FLAGS.seed)

  if FLAGS.use_gpu:
    logging.info('Use GPU')
    strategy = tf.distribute.MirroredStrategy()
  else:
    logging.info('Use TPU at %s',
                 FLAGS.tpu if FLAGS.tpu is not None else 'local')
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)

  ind_dataset_builder = ub.datasets.ClincIntentDetectionDataset(
      batch_size=FLAGS.per_core_batch_size,
      eval_batch_size=FLAGS.per_core_batch_size,
      data_mode='ind')
  ood_dataset_builder = ub.datasets.ClincIntentDetectionDataset(
      batch_size=FLAGS.per_core_batch_size,
      eval_batch_size=FLAGS.per_core_batch_size,
      data_mode='ood')

  dataset_builders = {
      'clean': ind_dataset_builder,
      'out_of_scope_requests': ood_dataset_builder
  }

  train_dataset = ind_dataset_builder.build(split=ub.datasets.base.Split.TRAIN)

  ds_info = ind_dataset_builder.info
  feature_size = ds_info['feature_size']
  # num_classes is the number of valid intents plus the out-of-scope intent.
  num_classes = ds_info['num_classes'] + 1
  # vocab_size is the total number of valid tokens plus the out-of-vocabulary
  # token.
  vocab_size = ind_dataset_builder.tokenizer.num_words + 1

  batch_size = FLAGS.per_core_batch_size * FLAGS.num_cores
  steps_per_epoch = ds_info['num_train_examples'] // batch_size

  test_datasets = {}
  steps_per_eval = {}
  for dataset_name, dataset_builder in dataset_builders.items():
    test_datasets[dataset_name] = dataset_builder.build(
        split=ub.datasets.base.Split.TEST)
    steps_per_eval[dataset_name] = (
        dataset_builder.info['num_test_examples'] // batch_size)

  if FLAGS.use_bfloat16:
    policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
    tf.keras.mixed_precision.experimental.set_policy(policy)

  summary_writer = tf.summary.create_file_writer(
      os.path.join(FLAGS.output_dir, 'summaries'))

  premade_embedding_array = None
  if FLAGS.word_embedding_dir:
    with tf.io.gfile.GFile(FLAGS.word_embedding_dir, 'rb') as embedding_file:
      premade_embedding_array = np.load(embedding_file)

  with strategy.scope():
    logging.info('Building %s model', FLAGS.model_family)

    if FLAGS.model_family.lower() == 'textcnn':
      model = cnn_model.textcnn(
          filter_sizes=FLAGS.filter_sizes,
          num_filters=FLAGS.num_filters,
          num_classes=num_classes,
          feature_size=feature_size,
          vocab_size=vocab_size,
          embed_size=FLAGS.embedding_size,
          dropout_rate=FLAGS.dropout_rate,
          l2=FLAGS.l2,
          premade_embedding_arr=premade_embedding_array)
      optimizer = tf.keras.optimizers.Adam(FLAGS.base_learning_rate)
    elif FLAGS.model_family.lower() == 'bert':
      bert_config_dir, bert_ckpt_dir = resolve_bert_ckpt_and_config_dir(
          FLAGS.bert_dir, FLAGS.bert_config_dir, FLAGS.bert_ckpt_dir)
      bert_config = bert_model.create_config(bert_config_dir)
      model, bert_encoder = bert_model.create_model(
          num_classes=num_classes,
          feature_size=feature_size,
          bert_config=bert_config)
      optimizer = bert_model.create_optimizer(
          FLAGS.base_learning_rate,
          steps_per_epoch=steps_per_epoch,
          epochs=FLAGS.train_epochs,
          warmup_proportion=FLAGS.warmup_proportion)
    else:
      raise ValueError('model_family ({}) can only be TextCNN or BERT.'.format(
          FLAGS.model_family))

    logging.info('Model input shape: %s', model.input_shape)
    logging.info('Model output shape: %s', model.output_shape)
    logging.info('Model number of weights: %s', model.count_params())

    metrics = {
        'train/negative_log_likelihood': tf.keras.metrics.Mean(),
        'train/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(),
        'train/loss': tf.keras.metrics.Mean(),
        'train/ece':
            ed.metrics.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
        'test/negative_log_likelihood': tf.keras.metrics.Mean(),
        'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(),
        'test/ece':
            ed.metrics.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
    }

    for dataset_name, test_dataset in test_datasets.items():
      if dataset_name != 'clean':
        metrics.update({
            'test/nll_{}'.format(dataset_name):
                tf.keras.metrics.Mean(),
            'test/accuracy_{}'.format(dataset_name):
                tf.keras.metrics.SparseCategoricalAccuracy(),
            'test/ece_{}'.format(dataset_name):
                ed.metrics.ExpectedCalibrationError(num_bins=FLAGS.num_bins)
        })

    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
    latest_checkpoint = tf.train.latest_checkpoint(FLAGS.output_dir)
    initial_epoch = 0
    if latest_checkpoint:
      # checkpoint.restore must be within a strategy.scope() so that optimizer
      # slot variables are mirrored.
      checkpoint.restore(latest_checkpoint)
      logging.info('Loaded checkpoint %s', latest_checkpoint)
      initial_epoch = optimizer.iterations.numpy() // steps_per_epoch
    elif FLAGS.model_family.lower() == 'bert':
      # Load BERT from its initial checkpoint.
      bert_checkpoint = tf.train.Checkpoint(model=bert_encoder)
      bert_checkpoint.restore(bert_ckpt_dir).assert_existing_objects_matched()
      logging.info('Loaded BERT checkpoint %s', bert_ckpt_dir)

  @tf.function
  def train_step(iterator):
    """Training StepFn."""

    def step_fn(inputs):
      """Per-Replica StepFn."""
      features, labels = create_feature_and_label(
          inputs, feature_size, model_family=FLAGS.model_family)

      with tf.GradientTape() as tape:
        # Set learning phase to enable dropout etc. during training.
        logits = model(features, training=True)
        if FLAGS.use_bfloat16:
          logits = tf.cast(logits, tf.float32)
        negative_log_likelihood = tf.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(
                labels, logits, from_logits=True))
        l2_loss = sum(model.losses)
        loss = negative_log_likelihood + l2_loss
        # Scale the loss given the TPUStrategy will reduce sum all gradients.
        scaled_loss = loss / strategy.num_replicas_in_sync

      grads = tape.gradient(scaled_loss, model.trainable_variables)
      optimizer.apply_gradients(zip(grads, model.trainable_variables))

      probs = tf.nn.softmax(logits)
      metrics['train/ece'].update_state(labels, probs)
      metrics['train/loss'].update_state(loss)
      metrics['train/negative_log_likelihood'].update_state(
          negative_log_likelihood)
      metrics['train/accuracy'].update_state(labels, logits)

    strategy.run(step_fn, args=(next(iterator),))

  @tf.function
  def test_step(iterator, dataset_name):
    """Evaluation StepFn."""

    def step_fn(inputs):
      """Per-Replica StepFn."""
      features, labels = create_feature_and_label(
          inputs, feature_size, model_family=FLAGS.model_family)

      # Set learning phase to disable dropout etc. during eval.
      logits = model(features, training=False)
      if FLAGS.use_bfloat16:
        logits = tf.cast(logits, tf.float32)
      probs = tf.nn.softmax(logits)
      negative_log_likelihood = tf.reduce_mean(
          tf.keras.losses.sparse_categorical_crossentropy(labels, probs))

      if dataset_name == 'clean':
        metrics['test/negative_log_likelihood'].update_state(
            negative_log_likelihood)
        metrics['test/accuracy'].update_state(labels, probs)
        metrics['test/ece'].update_state(labels, probs)
      else:
        metrics['test/nll_{}'.format(dataset_name)].update_state(
            negative_log_likelihood)
        metrics['test/accuracy_{}'.format(dataset_name)].update_state(
            labels, probs)
        metrics['test/ece_{}'.format(dataset_name)].update_state(labels, probs)

    strategy.run(step_fn, args=(next(iterator),))

  train_iterator = iter(train_dataset)
  start_time = time.time()
  for epoch in range(initial_epoch, FLAGS.train_epochs):
    logging.info('Starting to run epoch: %s', epoch)
    for step in range(steps_per_epoch):
      train_step(train_iterator)

      current_step = epoch * steps_per_epoch + (step + 1)
      max_steps = steps_per_epoch * FLAGS.train_epochs
      time_elapsed = time.time() - start_time
      steps_per_sec = float(current_step) / time_elapsed
      eta_seconds = (max_steps - current_step) / steps_per_sec
      message = ('{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
                 'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                     current_step / max_steps, epoch + 1, FLAGS.train_epochs,
                     steps_per_sec, eta_seconds / 60, time_elapsed / 60))
      if step % 20 == 0:
        logging.info(message)

    if epoch % FLAGS.evaluation_interval == 0:
      for dataset_name, test_dataset in test_datasets.items():
        test_iterator = iter(test_dataset)
        logging.info('Testing on dataset %s', dataset_name)
        for step in range(steps_per_eval[dataset_name]):
          if step % 20 == 0:
            logging.info('Starting to run eval step %s of epoch: %s', step,
                         epoch)
          test_step(test_iterator, dataset_name)
        logging.info('Done with testing on %s', dataset_name)

    logging.info('Train Loss: %.4f, Accuracy: %.2f%%',
                 metrics['train/loss'].result(),
                 metrics['train/accuracy'].result() * 100)
    logging.info('Test NLL: %.4f, Accuracy: %.2f%%',
                 metrics['test/negative_log_likelihood'].result(),
                 metrics['test/accuracy'].result() * 100)

    total_results = {name: metric.result() for name, metric in metrics.items()}
    with summary_writer.as_default():
      for name, result in total_results.items():
        tf.summary.scalar(name, result, step=epoch + 1)

    for metric in metrics.values():
      metric.reset_states()

    if (FLAGS.checkpoint_interval > 0 and
        (epoch + 1) % FLAGS.checkpoint_interval == 0):
      checkpoint_name = checkpoint.save(
          os.path.join(FLAGS.output_dir, 'checkpoint'))
      logging.info('Saved checkpoint to %s', checkpoint_name)
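
# `train_step` and `test_step` above call `create_feature_and_label`, which is
# defined elsewhere in the training script. The sketch below is only a rough
# illustration of what such a helper might do; the input field names
# ('input_word_ids', 'input_mask', 'segment_ids', 'features', 'labels') are
# assumptions for this sketch, not the actual implementation.
def create_feature_and_label(inputs, feature_size, model_family):
  """Illustrative sketch: splits a dataset batch into model inputs and labels."""
  if model_family.lower() == 'bert':
    # BERT classifiers typically consume token ids, an input mask and segment
    # ids, each padded to length `feature_size`.
    features = [
        inputs['input_word_ids'], inputs['input_mask'], inputs['segment_ids']
    ]
  else:
    # TextCNN consumes the padded token-id features directly.
    features = inputs['features']
  labels = inputs['labels']
  return features, labels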
def main(argv):
  del argv  # unused arg
  if not FLAGS.use_gpu:
    raise ValueError('Only GPU is currently supported.')
  if FLAGS.num_cores > 1:
    raise ValueError('Only a single accelerator is currently supported.')
  tf.enable_v2_behavior()
  tf.random.set_seed(FLAGS.seed)
  tf.io.gfile.makedirs(FLAGS.output_dir)

  ind_dataset_builder = ub.datasets.ClincIntentDetectionDataset(
      batch_size=FLAGS.per_core_batch_size,
      eval_batch_size=FLAGS.per_core_batch_size,
      dataset_dir=FLAGS.dataset_dir,
      data_mode='ind')
  ood_dataset_builder = ub.datasets.ClincIntentDetectionDataset(
      batch_size=FLAGS.per_core_batch_size,
      eval_batch_size=FLAGS.per_core_batch_size,
      dataset_dir=FLAGS.dataset_dir,
      data_mode='ood')

  dataset_builders = {'clean': ind_dataset_builder, 'ood': ood_dataset_builder}

  ds_info = ind_dataset_builder.info
  feature_size = ds_info['feature_size']
  # num_classes is the number of valid intents plus the out-of-scope intent.
  num_classes = ds_info['num_classes'] + 1
  # vocab_size is the total number of valid tokens plus the out-of-vocabulary
  # token.
  vocab_size = ind_dataset_builder.tokenizer.num_words + 1
  batch_size = FLAGS.per_core_batch_size * FLAGS.num_cores

  test_datasets = {}
  steps_per_eval = {}
  for dataset_name, dataset_builder in dataset_builders.items():
    test_datasets[dataset_name] = dataset_builder.build(
        split=ub.datasets.base.Split.TEST)
    steps_per_eval[dataset_name] = (
        dataset_builder.info['num_test_examples'] // batch_size)

  if FLAGS.model_family.lower() == 'textcnn':
    model = cnn_model.textcnn(
        filter_sizes=FLAGS.filter_sizes,
        num_filters=FLAGS.num_filters,
        num_classes=num_classes,
        feature_size=feature_size,
        vocab_size=vocab_size,
        embed_size=FLAGS.embedding_size,
        dropout_rate=FLAGS.dropout_rate,
        l2=FLAGS.l2)
  elif FLAGS.model_family.lower() == 'bert':
    bert_config_dir, _ = deterministic.resolve_bert_ckpt_and_config_dir(
        FLAGS.bert_dir, FLAGS.bert_config_dir, FLAGS.bert_ckpt_dir)
    bert_config = bert_model.create_config(bert_config_dir)
    model, _ = bert_model.create_model(
        num_classes=num_classes,
        feature_size=feature_size,
        bert_config=bert_config)
  else:
    raise ValueError('model_family ({}) can only be TextCNN or BERT.'.format(
        FLAGS.model_family))

  logging.info('Model input shape: %s', model.input_shape)
  logging.info('Model output shape: %s', model.output_shape)
  logging.info('Model number of weights: %s', model.count_params())

  # Search for checkpoints from their index file; then remove the index suffix.
  ensemble_filenames = tf.io.gfile.glob(
      os.path.join(FLAGS.checkpoint_dir, '**/*.index'))
  ensemble_filenames = [filename[:-6] for filename in ensemble_filenames]
  ensemble_size = len(ensemble_filenames)
  logging.info('Ensemble size: %s', ensemble_size)
  logging.info('Ensemble number of weights: %s',
               ensemble_size * model.count_params())
  logging.info('Ensemble filenames: %s', str(ensemble_filenames))
  checkpoint = tf.train.Checkpoint(model=model)

  # Write model predictions to files.
  num_datasets = len(test_datasets)
  for m, ensemble_filename in enumerate(ensemble_filenames):
    checkpoint.restore(ensemble_filename)
    for n, (name, test_dataset) in enumerate(test_datasets.items()):
      filename = '{dataset}_{member}.npy'.format(dataset=name, member=m)
      filename = os.path.join(FLAGS.output_dir, filename)
      if not tf.io.gfile.exists(filename):
        logits = []
        test_iterator = iter(test_dataset)
        for _ in range(steps_per_eval[name]):
          inputs = next(test_iterator)
          features, _ = deterministic.create_feature_and_label(
              inputs, feature_size, model_family=FLAGS.model_family)
          logits.append(model(features, training=False))

        logits = tf.concat(logits, axis=0)
        with tf.io.gfile.GFile(filename, 'w') as f:
          np.save(f, logits.numpy())
      percent = (m * num_datasets + (n + 1)) / (ensemble_size * num_datasets)
      message = (
          '{:.1%} completion for prediction: ensemble member {:d}/{:d}. '
          'Dataset {:d}/{:d}'.format(percent, m + 1, ensemble_size, n + 1,
                                     num_datasets))
      logging.info(message)

  metrics = {
      'test/negative_log_likelihood': tf.keras.metrics.Mean(),
      'test/gibbs_cross_entropy': tf.keras.metrics.Mean(),
      'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(),
      'test/ece': ed.metrics.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
  }

  for dataset_name, test_dataset in test_datasets.items():
    if dataset_name != 'clean':
      metrics.update({
          'test/nll_{}'.format(dataset_name):
              tf.keras.metrics.Mean(),
          'test/accuracy_{}'.format(dataset_name):
              tf.keras.metrics.SparseCategoricalAccuracy(),
          'test/ece_{}'.format(dataset_name):
              ed.metrics.ExpectedCalibrationError(num_bins=FLAGS.num_bins)
      })

  # Evaluate model predictions.
  for n, (name, test_dataset) in enumerate(test_datasets.items()):
    logits_dataset = []
    for m in range(ensemble_size):
      filename = '{dataset}_{member}.npy'.format(dataset=name, member=m)
      filename = os.path.join(FLAGS.output_dir, filename)
      with tf.io.gfile.GFile(filename, 'rb') as f:
        logits_dataset.append(np.load(f))

    logits_dataset = tf.convert_to_tensor(logits_dataset)
    test_iterator = iter(test_dataset)
    for step in range(steps_per_eval[name]):
      inputs = next(test_iterator)
      _, labels = deterministic.create_feature_and_label(
          inputs, feature_size, model_family=FLAGS.model_family)
      logits = logits_dataset[:, (step * batch_size):((step + 1) * batch_size)]
      labels = tf.cast(labels, tf.int32)
      negative_log_likelihood = tf.reduce_mean(
          ensemble_negative_log_likelihood(labels, logits))
      per_probs = tf.nn.softmax(logits)
      probs = tf.reduce_mean(per_probs, axis=0)
      if name == 'clean':
        gibbs_ce = tf.reduce_mean(gibbs_cross_entropy(labels, logits))
        metrics['test/negative_log_likelihood'].update_state(
            negative_log_likelihood)
        metrics['test/gibbs_cross_entropy'].update_state(gibbs_ce)
        metrics['test/accuracy'].update_state(labels, probs)
        metrics['test/ece'].update_state(labels, probs)
      else:
        metrics['test/nll_{}'.format(name)].update_state(
            negative_log_likelihood)
        metrics['test/accuracy_{}'.format(name)].update_state(labels, probs)
        metrics['test/ece_{}'.format(name)].update_state(labels, probs)

    message = ('{:.1%} completion for evaluation: dataset {:d}/{:d}'.format(
        (n + 1) / num_datasets, n + 1, num_datasets))
    logging.info(message)

  total_results = {name: metric.result() for name, metric in metrics.items()}
  logging.info('Metrics: %s', total_results)
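
# The ensemble evaluation above relies on `ensemble_negative_log_likelihood`
# and `gibbs_cross_entropy`, which are defined elsewhere in the script. As a
# reference for what they compute, here is a minimal sketch, assuming `logits`
# has shape [ensemble_size, batch_size, num_classes]: the ensemble NLL is
# -log((1/M) * sum_m p_m(y|x)), i.e. the NLL of the mean predictive
# distribution over the M members, and the Gibbs cross entropy is the average
# per-member NLL.
def ensemble_negative_log_likelihood(labels, logits):
  """Sketch: NLL of the ensemble's averaged predictive distribution."""
  labels = tf.cast(labels, tf.int32)
  logits = tf.convert_to_tensor(logits)
  ensemble_size = float(logits.shape[0])
  # Per-member NLL with shape [ensemble_size, batch_size].
  nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
      labels=tf.broadcast_to(labels[tf.newaxis, ...], tf.shape(logits)[:-1]),
      logits=logits)
  # -log(mean_m exp(-nll_m)) = -logsumexp_m(-nll_m) + log(M).
  return -tf.reduce_logsumexp(-nll, axis=0) + tf.math.log(ensemble_size)


def gibbs_cross_entropy(labels, logits):
  """Sketch: average of the per-member cross entropies."""
  labels = tf.cast(labels, tf.int32)
  logits = tf.convert_to_tensor(logits)
  nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
      labels=tf.broadcast_to(labels[tf.newaxis, ...], tf.shape(logits)[:-1]),
      logits=logits)
  return tf.reduce_mean(nll, axis=0)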