def test_horovod_allreduce_cpu_gpu_error(self):
    """Test that the allreduce raises an error if different ranks try
    to perform reduction on CPU and GPU."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
        return

    hvd.init()
    local_rank = hvd.local_rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    device = "/gpu:0" if local_rank % 2 == 0 else "/cpu:0"
    one_gpu = tf.GPUOptions(visible_device_list=str(local_rank))
    gpu_config = tf.ConfigProto(gpu_options=one_gpu)
    with self.test_session(config=gpu_config) as session:
        with tf.device(device):
            # Same rank, different dimension
            dims = [17] * 3
            tensor = tf.ones(dims, dtype=tf.int32)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allreduce(tensor))
def main(unused_argv):
    # Horovod: initialize Horovod.
    hvd.init()

    # Load training and eval data
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())
    train_data = mnist.train.images  # Returns np.array
    train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
    eval_data = mnist.test.images  # Returns np.array
    eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    model_dir = './mnist_convnet_model' if hvd.rank() == 0 else None

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir=model_dir,
        config=tf.estimator.RunConfig(session_config=config))

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(
        tensors=tensors_to_log, every_n_iter=500)

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
    # from rank 0 to all other processes. This is necessary to ensure consistent
    # initialization of all workers when training is started with random weights
    # or restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=100,
        num_epochs=None,
        shuffle=True)

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(
        input_fn=train_input_fn,
        steps=20000 // hvd.size(),
        hooks=[logging_hook, bcast_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data}, y=eval_labels, num_epochs=1, shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)
def tensorflow_session():
    # Init session and params
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # Pin GPU to local rank (one GPU per process)
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    sess = tf.Session(config=config)
    return sess
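# A minimal usage sketch for the helper above (the imports and the Keras
# wiring are assumptions, not part of the original snippet). hvd.init() must
# run before tensorflow_session(), since hvd.local_rank() is only defined
# after initialization; the returned session is then typically installed as
# the Keras backend session before any model is built.
import horovod.tensorflow as hvd
import tensorflow as tf
from tensorflow.keras import backend as K

hvd.init()
K.set_session(tensorflow_session())  # one pinned GPU per process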
def _setup_graph(self):
    num_gpu = cfg.TRAIN.NUM_GPUS
    if cfg.TRAINER == 'replicated':
        # Use two predictor threads per GPU to get better throughput
        self.num_predictor = num_gpu * 2
        self.predictors = [self._build_coco_predictor(k % num_gpu)
                           for k in range(self.num_predictor)]
        self.dataflows = [get_eval_dataflow(shard=k,
                                            num_shards=self.num_predictor)
                          for k in range(self.num_predictor)]
    else:
        # Only eval on the first machine.
        # Alternatively, can eval on all ranks and use allgather,
        # but allgather sometimes hangs
        self._horovod_run_eval = hvd.rank() == hvd.local_rank()
        if self._horovod_run_eval:
            self.predictor = self._build_coco_predictor(0)
            self.dataflow = get_eval_dataflow(shard=hvd.local_rank(),
                                              num_shards=hvd.local_size())
        self.barrier = hvd.allreduce(tf.random_normal(shape=[1]))
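# Note on the `self.barrier` op above: hvd.allreduce is a collective, so every
# rank must execute it before any rank's call can return. Evaluating this op
# (as the _eval() snippet below does with self.barrier.eval()) therefore acts
# as a cross-rank synchronization barrier; the reduced random value itself is
# discarded.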
def init_backend_engine():
    """
    Initializes ``engine``, which is either :class:`TFEngine.Engine` or Theano :class:`Engine.Engine`.
    """
    BackendEngine.select_engine(config=config)
    if BackendEngine.is_theano_selected():
        print("Theano:", describe_theano_version(), file=log.v3)
        import TheanoUtil
        TheanoUtil.monkey_patches()
    elif BackendEngine.is_tensorflow_selected():
        print("TensorFlow:", describe_tensorflow_version(), file=log.v3)
        if get_tensorflow_version_tuple()[0] == 0:
            print("Warning: TF <1.0 is not supported and likely broken.", file=log.v2)
        if os.environ.get("TF_DEVICE"):
            print("Devices: Use %s via TF_DEVICE instead of %s." % (
                os.environ.get("TF_DEVICE"), config.opt_typed_value("device")), file=log.v4)
            config.set("device", os.environ.get("TF_DEVICE"))
        if config.is_true("use_horovod"):
            import socket
            # noinspection PyPackageRequirements,PyUnresolvedReferences
            import horovod.tensorflow as hvd
            from TFUtil import init_horovod
            init_horovod()  # make sure it is initialized
            if "gpu" in config.value("device", "") or os.environ.get("CUDA_VISIBLE_DEVICES", ""):
                # We assume that we want to use a GPU.
                gpu_opts = config.typed_dict.setdefault("tf_session_opts", {}).setdefault("gpu_options", {})
                assert "visible_device_list" not in gpu_opts
                gpu_opts["visible_device_list"] = str(hvd.local_rank())
                print("Horovod: Hostname %s, pid %i, using GPU %s." % (
                    socket.gethostname(), os.getpid(), gpu_opts["visible_device_list"]), file=log.v3)
            else:
                if hvd.rank() == 0:  # Don't spam in all ranks.
                    print("Horovod: Not using GPU.", file=log.v3)
            horovod_reduce_type = config.value("horovod_reduce_type", "")
            if horovod_reduce_type == "":
                horovod_reduce_type = "grad"
                config.set("horovod_reduce_type", horovod_reduce_type)
            else:
                assert horovod_reduce_type in ["grad", "param"], "config option 'horovod_reduce_type' invalid"
            if hvd.rank() == 0:  # Don't spam in all ranks.
                print("Horovod: Reduce type:", horovod_reduce_type, file=log.v3)
        from TFUtil import debug_register_better_repr, setup_tf_thread_pools, print_available_devices
        tf_session_opts = config.typed_value("tf_session_opts", {})
        assert isinstance(tf_session_opts, dict)
        # This must be done after the Horovod logic, such that we only touch
        # the devices we are supposed to touch.
        setup_tf_thread_pools(log_file=log.v3, tf_session_opts=tf_session_opts)
        # Print available devices. Also make sure that get_tf_list_local_devices
        # uses the correct TF session opts.
        print_available_devices(tf_session_opts=tf_session_opts, file=log.v2)
        debug_register_better_repr()
    else:
        raise NotImplementedError
def main(_):
    # Initialize Horovod.
    hvd.init()

    # Download and load MNIST dataset.
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)

    opt = tf.train.RMSPropOptimizer(0.01)

    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.contrib.framework.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    # BroadcastGlobalVariablesHook broadcasts initial variable states from
    # rank 0 to all other processes. This is necessary to ensure consistent
    # initialization of all workers when training is started with random
    # weights or restored from a checkpoint.
    hooks = [hvd.BroadcastGlobalVariablesHook(0),
             tf.train.StopAtStepHook(last_step=100),
             tf.train.LoggingTensorHook(tensors={'step': global_step,
                                                 'loss': loss},
                                        every_n_iter=10),
             ]

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = mnist.train.next_batch(100)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
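# Note (an observation, not part of the original snippet): this example keeps
# a fixed learning rate of 0.01, whereas the MonitoredTrainingSession example
# further below uses tf.train.AdamOptimizer(0.001 * hvd.size()). Scaling the
# learning rate by hvd.size() is a common Horovod convention, since the
# effective global batch grows with the number of workers.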
def test_horovod_allreduce_multi_gpu(self):
    """Test that the allreduce works on multiple GPUs.

    This test will crash badly if used with an MPI implementation that
    does not support GPU memory transfers directly, as it will call
    MPI_Send on a GPU data pointer."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
        return

    hvd.init()
    local_rank = hvd.local_rank()
    size = hvd.size()

    iter = 0
    two_gpus = tf.GPUOptions(visible_device_list=(
        '%d,%d' % (local_rank * 2, local_rank * 2 + 1)))
    gpu_config = tf.ConfigProto(gpu_options=two_gpus)
    with self.test_session(config=gpu_config) as session:
        dtypes = [tf.int32, tf.int64, tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            iter += 1
            with tf.device("/gpu:%d" % ((iter + local_rank) % 2)):
                tf.set_random_seed(1234)
                tensor = tf.random_uniform(
                    [17] * dim, -100, 100, dtype=dtype)
                summed = hvd.allreduce(tensor, average=False)
            multiplied = tensor * size
            max_difference = tf.reduce_max(tf.abs(summed - multiplied))

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                return

            diff = session.run(max_difference)
            self.assertTrue(diff <= threshold,
                            "hvd.allreduce on GPU produces incorrect results")
def test_horovod_allreduce_gpu_fused(self):
    """Test that the allreduce works on GPUs with Tensor Fusion.

    This test will crash badly if used with an MPI implementation that
    does not support GPU memory transfers directly, as it will call
    MPI_Send on a GPU data pointer."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
        return

    hvd.init()
    local_rank = hvd.local_rank()
    size = hvd.size()

    with self.test_session(config=self.config) as session:
        dtypes = [tf.int32, tf.int64, tf.float32, tf.float64]
        dims = [1, 2, 3]
        tests = []
        for dtype, dim in itertools.product(dtypes, dims):
            with tf.device("/gpu:%d" % local_rank):
                tf.set_random_seed(1234)
                tensor = tf.random_uniform(
                    [17] * dim, -100, 100, dtype=dtype)
                summed = hvd.allreduce(tensor, average=False)
            multiplied = tensor * size
            max_difference = tf.reduce_max(tf.abs(summed - multiplied))

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in [tf.int32, tf.int64]:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                return

            test = max_difference <= threshold
            tests.append(test)
        self.assertTrue(session.run(tf.reduce_all(tests)),
                        "hvd.allreduce produces incorrect results")
def _eval(self):
    logdir = args.logdir
    if cfg.TRAINER == 'replicated':
        with ThreadPoolExecutor(max_workers=self.num_predictor,
                                thread_name_prefix='EvalWorker') as executor, \
                tqdm.tqdm(total=sum([df.size() for df in self.dataflows])) as pbar:
            futures = []
            for dataflow, pred in zip(self.dataflows, self.predictors):
                futures.append(executor.submit(eval_coco, dataflow, pred, pbar))
            all_results = list(itertools.chain(*[fut.result() for fut in futures]))
    else:
        if self._horovod_run_eval:
            local_results = eval_coco(self.dataflow, self.predictor)
            output_partial = os.path.join(
                logdir, 'outputs{}-part{}.json'.format(
                    self.global_step, hvd.local_rank()))
            with open(output_partial, 'w') as f:
                json.dump(local_results, f)
        self.barrier.eval()
        if hvd.rank() > 0:
            return
        all_results = []
        for k in range(hvd.local_size()):
            output_partial = os.path.join(
                logdir, 'outputs{}-part{}.json'.format(self.global_step, k))
            with open(output_partial, 'r') as f:
                obj = json.load(f)
            all_results.extend(obj)
            os.unlink(output_partial)

    output_file = os.path.join(
        logdir, 'outputs{}.json'.format(self.global_step))
    with open(output_file, 'w') as f:
        json.dump(all_results, f)
    try:
        scores = print_evaluation_scores(output_file)
        for k, v in scores.items():
            self.trainer.monitors.put_scalar(k, v)
    except Exception:
        logger.exception("Exception in COCO evaluation.")
def main(_): """ Builds the model and runs. """ if FLAGS.distributed: import horovod.tensorflow as hvd hvd.init() tf.logging.set_verbosity(tf.logging.INFO) tx.utils.maybe_create_dir(FLAGS.output_dir) bert_pretrain_dir = ('bert_pretrained_models' '/%s') % FLAGS.config_bert_pretrain # Loads BERT model configuration if FLAGS.config_format_bert == "json": bert_config = model_utils.transform_bert_to_texar_config( os.path.join(bert_pretrain_dir, 'bert_config.json')) elif FLAGS.config_format_bert == 'texar': bert_config = importlib.import_module( ('bert_config_lib.' 'config_model_%s') % FLAGS.config_bert_pretrain) else: raise ValueError('Unknown config_format_bert.') # Loads data num_classes = config_data.num_classes num_train_data = config_data.num_train_data # Configures distribued mode if FLAGS.distributed: config_data.train_hparam["dataset"]["num_shards"] = hvd.size() config_data.train_hparam["dataset"]["shard_id"] = hvd.rank() config_data.train_hparam["batch_size"] //= hvd.size() train_dataset = tx.data.TFRecordData(hparams=config_data.train_hparam) eval_dataset = tx.data.TFRecordData(hparams=config_data.eval_hparam) test_dataset = tx.data.TFRecordData(hparams=config_data.test_hparam) iterator = tx.data.FeedableDataIterator({ 'train': train_dataset, 'eval': eval_dataset, 'test': test_dataset}) batch = iterator.get_next() input_ids = batch["input_ids"] segment_ids = batch["segment_ids"] batch_size = tf.shape(input_ids)[0] input_length = tf.reduce_sum(1 - tf.to_int32(tf.equal(input_ids, 0)), axis=1) # Builds BERT with tf.variable_scope('bert'): embedder = tx.modules.WordEmbedder( vocab_size=bert_config.vocab_size, hparams=bert_config.embed) word_embeds = embedder(input_ids) # Creates segment embeddings for each type of tokens. segment_embedder = tx.modules.WordEmbedder( vocab_size=bert_config.type_vocab_size, hparams=bert_config.segment_embed) segment_embeds = segment_embedder(segment_ids) input_embeds = word_embeds + segment_embeds # The BERT model (a TransformerEncoder) encoder = tx.modules.TransformerEncoder(hparams=bert_config.encoder) output = encoder(input_embeds, input_length) # Builds layers for downstream classification, which is also # initialized with BERT pre-trained checkpoint. 
with tf.variable_scope("pooler"): # Uses the projection of the 1st-step hidden vector of BERT output # as the representation of the sentence bert_sent_hidden = tf.squeeze(output[:, 0:1, :], axis=1) bert_sent_output = tf.layers.dense( bert_sent_hidden, config_downstream.hidden_dim, activation=tf.tanh) output = tf.layers.dropout( bert_sent_output, rate=0.1, training=tx.global_mode_train()) # Adds the final classification layer logits = tf.layers.dense( output, num_classes, kernel_initializer=tf.truncated_normal_initializer(stddev=0.02)) preds = tf.argmax(logits, axis=-1, output_type=tf.int32) accu = tx.evals.accuracy(batch['label_ids'], preds) # Optimization loss = tf.losses.sparse_softmax_cross_entropy( labels=batch["label_ids"], logits=logits) global_step = tf.Variable(0, trainable=False) # Builds learning rate decay scheduler static_lr = config_downstream.lr['static_lr'] num_train_steps = int(num_train_data / config_data.train_batch_size * config_data.max_train_epoch) num_warmup_steps = int(num_train_steps * config_data.warmup_proportion) lr = model_utils.get_lr(global_step, num_train_steps, # lr is a Tensor num_warmup_steps, static_lr) opt = tx.core.get_optimizer( global_step=global_step, learning_rate=lr, hparams=config_downstream.opt ) if FLAGS.distributed: opt = hvd.DistributedOptimizer(opt) train_op = tf.contrib.layers.optimize_loss( loss=loss, global_step=global_step, learning_rate=None, optimizer=opt) # Train/eval/test routine def _is_head(): if not FLAGS.distributed: return True else: return hvd.rank() == 0 def _train_epoch(sess): """Trains on the training set, and evaluates on the dev set periodically. """ iterator.restart_dataset(sess, 'train') fetches = { 'train_op': train_op, 'loss': loss, 'batch_size': batch_size, 'step': global_step } while True: try: feed_dict = { iterator.handle: iterator.get_handle(sess, 'train'), tx.global_mode(): tf.estimator.ModeKeys.TRAIN, } rets = sess.run(fetches, feed_dict) step = rets['step'] dis_steps = config_data.display_steps if _is_head() and dis_steps > 0 and step % dis_steps == 0: tf.logging.info('step:%d; loss:%f' % (step, rets['loss'])) eval_steps = config_data.eval_steps if _is_head() and eval_steps > 0 and step % eval_steps == 0: _eval_epoch(sess) except tf.errors.OutOfRangeError: break def _eval_epoch(sess): """Evaluates on the dev set. """ iterator.restart_dataset(sess, 'eval') cum_acc = 0.0 cum_loss = 0.0 nsamples = 0 fetches = { 'accu': accu, 'loss': loss, 'batch_size': batch_size, } while True: try: feed_dict = { iterator.handle: iterator.get_handle(sess, 'eval'), tx.context.global_mode(): tf.estimator.ModeKeys.EVAL, } rets = sess.run(fetches, feed_dict) cum_acc += rets['accu'] * rets['batch_size'] cum_loss += rets['loss'] * rets['batch_size'] nsamples += rets['batch_size'] except tf.errors.OutOfRangeError: break tf.logging.info('eval accu: {}; loss: {}; nsamples: {}'.format( cum_acc / nsamples, cum_loss / nsamples, nsamples)) def _test_epoch(sess): """Does predictions on the test set. 
""" iterator.restart_dataset(sess, 'test') _all_preds = [] while True: try: feed_dict = { iterator.handle: iterator.get_handle(sess, 'test'), tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT, } _preds = sess.run(preds, feed_dict=feed_dict) _all_preds.extend(_preds.tolist()) except tf.errors.OutOfRangeError: break output_file = os.path.join(FLAGS.output_dir, "test_results.tsv") with tf.gfile.GFile(output_file, "w") as writer: writer.write('\n'.join(str(p) for p in _all_preds)) # Loads pretrained BERT model parameters init_checkpoint = os.path.join(bert_pretrain_dir, 'bert_model.ckpt') model_utils.init_bert_checkpoint(init_checkpoint) # Broadcasts global variables from rank-0 process if FLAGS.distributed: bcast = hvd.broadcast_global_variables(0) session_config = tf.ConfigProto() if FLAGS.distributed: session_config.gpu_options.visible_device_list = str(hvd.local_rank()) with tf.Session(config=session_config) as sess: sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) sess.run(tf.tables_initializer()) if FLAGS.distributed: bcast.run() # Restores trained model if specified saver = tf.train.Saver() if FLAGS.checkpoint: saver.restore(sess, FLAGS.checkpoint) iterator.initialize_dataset(sess) if FLAGS.do_train: for i in range(config_data.max_train_epoch): _train_epoch(sess) saver.save(sess, FLAGS.output_dir + '/model.ckpt') if FLAGS.do_eval: _eval_epoch(sess) if FLAGS.do_test: _test_epoch(sess)
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    nranks_per_gpu = args.nranks_per_gpu
    local_rank = hvd.local_rank()
    gpu_local_rank = local_rank // nranks_per_gpu
    print('local_rank, GPU_LOCAL_RANK: {}, {}'.format(
        local_rank, gpu_local_rank))

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.visible_device_list = str(gpu_local_rank)
    K.set_session(tf.Session(config=config))

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    hvdsize = hvd.size()

    batch_size = 128  # 100
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)
    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()

    # Data split if going for reduction in each iteration step. Using
    # tf-queue or dataset is better to preserve uniform random sampling.
    # nsamples = x_train.shape[0]
    # mysamples = nsamples // hvdsize
    # start_sam = hvd.local_rank() * mysamples
    # stop_sam = min((hvd.local_rank() + 1) * mysamples, nsamples)
    # x_train = x_train[start_sam:stop_sam, ...]

    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0],) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0],) + original_img_size)

    if hvd.rank() == 0:
        print('x_train.shape:', x_train.shape)

    vae, encoder, generator = make_vae_and_codec(
        original_img_size, img_chns, img_rows, img_cols, batch_size,
        filters, num_conv, intermediate_dim, latent_dim, epsilon_std)
    # :type vae: Model
    lr = 0.001  # * hvdsize
    opt = tf.train.RMSPropOptimizer(lr)
    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)  # , use_locking=True)
    opt = TFOptimizer(opt)
    vae.compile(optimizer=opt, loss=None)
    if hvd.rank() == 0:
        vae.summary()

    callbacks = []
    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)]

    sess = K.get_session()
    sess.run(hvd.broadcast_global_variables(0))

    vae.fit(x_train,
            shuffle=True,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(x_test, None),
            callbacks=callbacks)

    if hvd.rank() == 0:
        vae_val = vae
        loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size)
        print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

        # display a 2D plot of the digit classes in the latent space
        x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
        plt.figure(figsize=(6, 6))
        plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
        plt.colorbar()
        # plt.show()
        plt.savefig('vae_scatter.ps')
        plt.close()

        # display a 2D manifold of the digits
        n = 15  # figure with 15x15 digits
        digit_size = 28
        figure = np.zeros((digit_size * n, digit_size * n))
        # Linearly spaced coordinates on the unit square were transformed
        # through the inverse CDF (ppf) of the Gaussian to produce values
        # of the latent variables z, since the prior of the latent space
        # is Gaussian
        grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
        grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

        for i, yi in enumerate(grid_x):
            for j, xi in enumerate(grid_y):
                z_sample = np.array([[xi, yi]])
                z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
                x_decoded = generator.predict(z_sample, batch_size=batch_size)
                digit = x_decoded[0].reshape(digit_size, digit_size)
                figure[i * digit_size: (i + 1) * digit_size,
                       j * digit_size: (j + 1) * digit_size] = digit

        plt.figure(figsize=(10, 10))
        plt.imshow(figure, cmap='Greys_r')
        # plt.show()
        plt.savefig('vae_digit.ps')
        plt.close()

    K.clear_session()
def main(_):
    # causes memory fragmentation for bert leading to OOM
    if os.environ.get("TF_XLA_FLAGS", None) is not None:
        os.environ["TF_XLA_FLAGS"] += "--tf_xla_enable_lazy_compilation=false"
    else:
        os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false"

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)

    if FLAGS.horovod:
        hvd.init()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
    }

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.io.gfile.makedirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    master_process = True
    training_hooks = []
    global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps
    hvd_rank = 0

    config = tf.compat.v1.ConfigProto()
    if FLAGS.horovod:
        tf.compat.v1.logging.info("Multi-GPU training with TF Horovod")
        tf.compat.v1.logging.info("hvd.size() = %d hvd.rank() = %d",
                                  hvd.size(), hvd.rank())
        global_batch_size = (FLAGS.train_batch_size *
                             FLAGS.num_accumulation_steps * hvd.size())
        master_process = (hvd.rank() == 0)
        hvd_rank = hvd.rank()
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        if hvd.size() > 1:
            training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = \
            tf.compat.v1.OptimizerOptions.ON_1
        tf.enable_resource_variables()

    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir if master_process else None,
        session_config=config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps
        if master_process else None,
        save_summary_steps=FLAGS.save_checkpoints_steps
        if master_process else None,
        log_step_count_steps=FLAGS.display_loss_steps,
        keep_checkpoint_max=1)

    if master_process:
        tf.compat.v1.logging.info("***** Configuration *****")
        for key in FLAGS.__flags.keys():
            tf.compat.v1.logging.info('  {}: {}'.format(
                key, getattr(FLAGS, key)))
        tf.compat.v1.logging.info("**************************")

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    training_hooks.append(
        LogTrainRunHook(global_batch_size, hvd_rank,
                        FLAGS.save_checkpoints_steps, num_steps_ignore_xla=10))

    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        start_index = 0
        end_index = len(train_examples)
        tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]

        if FLAGS.horovod:
            tmp_filenames = [
                os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i))
                for i in range(hvd.size())
            ]
            num_examples_per_rank = len(train_examples) // hvd.size()
            remainder = len(train_examples) % hvd.size()
            if hvd.rank() < remainder:
                start_index = hvd.rank() * (num_examples_per_rank + 1)
                end_index = start_index + num_examples_per_rank + 1
            else:
                start_index = hvd.rank() * num_examples_per_rank + remainder
                end_index = start_index + num_examples_per_rank

    model_fn = model_fn_builder(
        task_name=task_name,
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate
        if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(),
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_one_hot_embeddings=False,
        hvd=None if not FLAGS.horovod else hvd)

    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    if FLAGS.do_train:
        file_based_convert_examples_to_features(
            train_examples[start_index:end_index], label_list,
            FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank])
        tf.compat.v1.logging.info("***** Running training *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(train_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.compat.v1.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=tmp_filenames,
            batch_size=FLAGS.train_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            hvd=None if not FLAGS.horovod else hvd)

        train_start_time = time.time()
        estimator.train(input_fn=train_input_fn,
                        max_steps=num_train_steps,
                        hooks=training_hooks)
        train_time_elapsed = time.time() - train_start_time
        train_time_wo_overhead = training_hooks[-1].total_time
        avg_sentences_per_second = (num_train_steps * global_batch_size * 1.0 /
                                    train_time_elapsed)
        ss_sentences_per_second = (
            (num_train_steps - training_hooks[-1].skipped) *
            global_batch_size * 1.0 / train_time_wo_overhead)

        if master_process:
            tf.compat.v1.logging.info("-----------------------------")
            tf.compat.v1.logging.info(
                "Total Training Time = %0.2f for Sentences = %d",
                train_time_elapsed, num_train_steps * global_batch_size)
            tf.compat.v1.logging.info(
                "Total Training Time W/O Overhead = %0.2f for Sentences = %d",
                train_time_wo_overhead,
                (num_train_steps - training_hooks[-1].skipped) *
                global_batch_size)
            tf.compat.v1.logging.info(
                "Throughput Average (sentences/sec) with overhead = %0.2f",
                avg_sentences_per_second)
            tf.compat.v1.logging.info(
                "Throughput Average (sentences/sec) = %0.2f",
                ss_sentences_per_second)
            tf.compat.v1.logging.info("-----------------------------")

    if FLAGS.do_eval and master_process:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

        tf.compat.v1.logging.info("***** Running evaluation *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(eval_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_drop_remainder = False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            batch_size=FLAGS.eval_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)]
        eval_start_time = time.time()
        result = estimator.evaluate(input_fn=eval_input_fn, hooks=eval_hooks)

        eval_time_elapsed = time.time() - eval_start_time

        time_list = eval_hooks[-1].time_list
        time_list.sort()
        # Removing outliers (init/warmup) in throughput computation.
        eval_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.8)])
        num_sentences = (int(len(time_list) * 0.8)) * FLAGS.eval_batch_size

        avg = np.mean(time_list)
        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info(
            "Total Inference Time = %0.2f for Sentences = %d",
            eval_time_elapsed, eval_hooks[-1].count * FLAGS.eval_batch_size)
        tf.compat.v1.logging.info(
            "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
            eval_time_wo_overhead, num_sentences)
        tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set")
        tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size)
        tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.compat.v1.logging.info("Precision = %s",
                                  "fp16" if FLAGS.amp else "fp32")
        tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f",
                                  cf_50 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f",
                                  cf_90 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f",
                                  cf_95 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f",
                                  cf_99 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f",
                                  cf_100 * 1000)
        tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f",
                                  ss_sentences_per_second)
        dllogging.logger.log(step=(),
                             data={"throughput_val": ss_sentences_per_second},
                             verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("-----------------------------")

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            tf.compat.v1.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                dllogging.logger.log(step=(),
                                     data={key: float(result[key])},
                                     verbosity=Verbosity.DEFAULT)
                tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict and master_process:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file)

        tf.compat.v1.logging.info("***** Running prediction *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(predict_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            batch_size=FLAGS.predict_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        predict_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
        predict_start_time = time.time()

        output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
        with tf.io.gfile.GFile(output_predict_file, "w") as writer:
            tf.compat.v1.logging.info("***** Predict results *****")
            for prediction in estimator.predict(input_fn=predict_input_fn,
                                                hooks=predict_hooks,
                                                yield_single_examples=False):
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in prediction) + "\n"
                writer.write(output_line)

        predict_time_elapsed = time.time() - predict_start_time

        time_list = predict_hooks[-1].time_list
        time_list.sort()
        # Removing outliers (init/warmup) in throughput computation.
        predict_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.8)])
        num_sentences = (int(len(time_list) * 0.8)) * FLAGS.predict_batch_size

        avg = np.mean(time_list)
        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        ss_sentences_per_second = num_sentences * 1.0 / predict_time_wo_overhead

        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info(
            "Total Inference Time = %0.2f for Sentences = %d",
            predict_time_elapsed,
            predict_hooks[-1].count * FLAGS.predict_batch_size)
        tf.compat.v1.logging.info(
            "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
            predict_time_wo_overhead, num_sentences)
        tf.compat.v1.logging.info("Summary Inference Statistics on TEST SET")
        tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size)
        tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.compat.v1.logging.info("Precision = %s",
                                  "fp16" if FLAGS.amp else "fp32")
        tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f",
                                  cf_50 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f",
                                  cf_90 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f",
                                  cf_95 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f",
                                  cf_99 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f",
                                  cf_100 * 1000)
        tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f",
                                  ss_sentences_per_second)
        dllogging.logger.log(step=(),
                             data={"throughput_val": ss_sentences_per_second},
                             verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("-----------------------------")
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments,
         LoggingArguments, PathArguments))
    (
        model_args,
        data_args,
        train_args,
        log_args,
        path_args,
        remaining_strings,
    ) = parser.parse_args_into_dataclasses(return_remaining_strings=True)
    # SageMaker may have some extra strings. TODO: Test this on SM.
    assert len(remaining_strings) == 0, \
        f"The args {remaining_strings} could not be parsed."

    tf.random.set_seed(train_args.seed)
    tf.autograph.set_verbosity(0)

    # Settings init
    parse_bool = lambda arg: arg == "true"
    do_gradient_accumulation = train_args.gradient_accumulation_steps > 1
    do_xla = not parse_bool(train_args.skip_xla)
    do_eager = parse_bool(train_args.eager)
    skip_sop = parse_bool(train_args.skip_sop)
    skip_mlm = parse_bool(train_args.skip_mlm)
    pre_layer_norm = parse_bool(model_args.pre_layer_norm)
    fast_squad = parse_bool(log_args.fast_squad)
    dummy_eval = parse_bool(log_args.dummy_eval)
    is_sagemaker = path_args.filesystem_prefix.startswith("/opt/ml")
    disable_tqdm = is_sagemaker
    global max_grad_norm
    max_grad_norm = train_args.max_grad_norm

    # Horovod init
    hvd.init()
    gpus = tf.config.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.set_visible_devices(gpus[hvd.local_rank()], "GPU")

    # XLA, AutoGraph
    tf.config.optimizer.set_jit(do_xla)
    tf.config.experimental_run_functions_eagerly(do_eager)

    if hvd.rank() == 0:
        # Run name should only be used on one process to avoid race conditions
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        platform = "sm" if is_sagemaker else "eks"
        if skip_sop:
            loss_str = "-skipsop"
        elif skip_mlm:
            loss_str = "-skipmlm"
        else:
            loss_str = ""

        if log_args.run_name is None:
            metadata = (
                f"{model_args.model_type}"
                f"-{model_args.model_size}"
                f"-{model_args.load_from}"
                f"-{hvd.size()}gpus"
                f"-{train_args.per_gpu_batch_size * hvd.size() * train_args.gradient_accumulation_steps}globalbatch"
                f"-{train_args.learning_rate}maxlr"
                f"-{train_args.learning_rate_decay_power}power"
                f"-{train_args.optimizer}opt"
                f"-{train_args.total_steps}steps"
                f"-{'preln' if pre_layer_norm else 'postln'}"
                f"{loss_str}"
                f"-{model_args.hidden_dropout_prob}dropout")
            run_name = f"{current_time}-{platform}-{metadata}-{train_args.name if train_args.name else 'unnamed'}"
        else:
            run_name = log_args.run_name

        # Logging should only happen on a single process
        # https://stackoverflow.com/questions/9321741/printing-to-screen-and-writing-to-a-file-at-the-same-time
        level = logging.INFO
        format = "%(asctime)-15s %(name)-12s: %(levelname)-8s %(message)s"
        handlers = [
            logging.FileHandler(
                os.path.join(path_args.filesystem_prefix, path_args.log_dir,
                             f"{run_name}.log")),
            TqdmLoggingHandler(),
        ]
        logging.basicConfig(level=level, format=format, handlers=handlers)

    # Check that arguments passed in properly, only after registering the
    # alert_func and logging
    assert not (skip_sop and skip_mlm), "Cannot use --skip_sop and --skip_mlm"

    wrap_global_functions(do_gradient_accumulation)

    # Create optimizer and enable AMP loss scaling.
    if train_args.optimizer == "lamb":
        optimizer = get_lamb_optimizer(train_args)
    elif train_args.optimizer == "adamw":
        optimizer = get_adamw_optimizer(train_args)

    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
        optimizer, loss_scale="dynamic")
    gradient_accumulator = GradientAccumulator()

    loaded_optimizer_weights = None

    model = create_model(model_class=TFAutoModelForPreTraining,
                         model_args=model_args)
    tokenizer = create_tokenizer(model_args.model_type)
    if model_args.load_from == "checkpoint":
        checkpoint_path = os.path.join(path_args.filesystem_prefix,
                                       model_args.checkpoint_path)
        model_ckpt, optimizer_ckpt = get_checkpoint_paths_from_prefix(
            checkpoint_path)
        if hvd.rank() == 0:
            model.load_weights(model_ckpt)
            if model_args.load_optimizer_state == "true":
                loaded_optimizer_weights = np.load(optimizer_ckpt,
                                                   allow_pickle=True)
            # We do not set the weights yet, we have to do a first step to
            # initialize the optimizer.

    # Train filenames are [1, 2047], Val filenames are [0].
    # Note the different subdirectories.
    # Move to same folder structure and remove if/else
    train_glob = os.path.join(path_args.filesystem_prefix,
                              path_args.train_dir, "*.tfrecord")
    validation_glob = os.path.join(path_args.filesystem_prefix,
                                   path_args.val_dir, "*.tfrecord")

    train_filenames = glob.glob(train_glob)
    validation_filenames = glob.glob(validation_glob)

    train_dataset = get_dataset_from_tfrecords(
        model_type=model_args.model_type,
        filenames=train_filenames,
        max_seq_length=data_args.max_seq_length,
        max_predictions_per_seq=data_args.max_predictions_per_seq,
        per_gpu_batch_size=train_args.per_gpu_batch_size,
    )  # Of shape [per_gpu_batch_size, ...]
    # Batch of batches, helpful for gradient accumulation.
    # Shape [grad_steps, per_gpu_batch_size, ...]
    train_dataset = train_dataset.batch(train_args.gradient_accumulation_steps)
    # One iteration with 10 dupes, 8 nodes seems to be 60-70k steps.
    train_dataset = train_dataset.prefetch(buffer_size=8)

    # Validation should only be done on one node, since Horovod doesn't allow
    # allreduce on a subset of ranks
    if hvd.rank() == 0:
        validation_dataset = get_dataset_from_tfrecords(
            model_type=model_args.model_type,
            filenames=validation_filenames,
            max_seq_length=data_args.max_seq_length,
            max_predictions_per_seq=data_args.max_predictions_per_seq,
            per_gpu_batch_size=train_args.per_gpu_batch_size,
        )
        # validation_dataset = validation_dataset.batch(1)
        validation_dataset = validation_dataset.prefetch(buffer_size=8)

        pbar = tqdm.tqdm(total=train_args.total_steps, disable=disable_tqdm)
        summary_writer = None  # Only create a writer if we make it through a successful step
        logger.info(f"Starting training, job name {run_name}")

    i = 1
    start_time = time.perf_counter()
    for batch in train_dataset:
        learning_rate = optimizer.learning_rate(
            step=tf.constant(i, dtype=tf.float32))
        # weight_decay = wd_schedule(step=tf.constant(i, dtype=tf.float32))
        loss_scale = optimizer.loss_scale()
        loss, mlm_loss, mlm_acc, sop_loss, sop_acc, grad_norm, weight_norm = train_step(
            model=model,
            optimizer=optimizer,
            gradient_accumulator=gradient_accumulator,
            batch=batch,
            gradient_accumulation_steps=train_args.gradient_accumulation_steps,
            skip_sop=skip_sop,
            skip_mlm=skip_mlm,
        )

        # Don't want to wrap broadcast_variables() in a tf.function,
        # can lead to asynchronous errors
        if i == 1:
            if hvd.rank() == 0 and loaded_optimizer_weights is not None:
                optimizer.set_weights(loaded_optimizer_weights)
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)
            i = optimizer.get_weights()[0]

        is_final_step = i >= train_args.total_steps
        do_squad = (log_args.squad_frequency != 0) and (
            (i % log_args.squad_frequency == 0) or is_final_step)
        # Squad requires all the ranks to train, but results are only
        # returned on rank 0
        if do_squad:
            squad_results = get_squad_results_while_pretraining(
                model=model,
                tokenizer=tokenizer,
                model_size=model_args.model_size,
                filesystem_prefix=path_args.filesystem_prefix,
                step=i,
                dataset=data_args.squad_version,
                fast=log_args.fast_squad,
                dummy_eval=log_args.dummy_eval,
            )
            if hvd.rank() == 0:
                squad_exact, squad_f1 = squad_results["exact"], squad_results["f1"]
                logger.info(
                    f"SQuAD step {i} -- F1: {squad_f1:.3f}, Exact: {squad_exact:.3f}")
            # Re-wrap autograph so it doesn't get arg mismatches
            wrap_global_functions(do_gradient_accumulation)
            gc.collect()

        if hvd.rank() == 0:
            do_log = i % log_args.log_frequency == 0
            do_checkpoint = (log_args.checkpoint_frequency != 0) and (
                (i % log_args.checkpoint_frequency == 0) or is_final_step)
            do_validation = (log_args.validation_frequency != 0) and (
                (i % log_args.validation_frequency == 0) or is_final_step)

            pbar.update(1)
            description = f"Loss: {loss:.3f}, MLM: {mlm_loss:.3f}, SOP: {sop_loss:.3f}, MLM_acc: {mlm_acc:.3f}, SOP_acc: {sop_acc:.3f}"
            pbar.set_description(description)
            if do_log:
                elapsed_time = time.perf_counter() - start_time
                if i == 1:
                    logger.info(f"First step: {elapsed_time:.3f} secs")
                else:
                    it_per_sec = log_args.log_frequency / elapsed_time
                    logger.info(
                        f"Train step {i} -- {description} -- It/s: {it_per_sec:.2f}")
                    start_time = time.perf_counter()

            if do_checkpoint:
                checkpoint_prefix = os.path.join(path_args.filesystem_prefix,
                                                 path_args.checkpoint_dir,
                                                 f"{run_name}-step{i}")
                model_ckpt = f"{checkpoint_prefix}.ckpt"
                optimizer_ckpt = f"{checkpoint_prefix}-optimizer.npy"
                logger.info(
                    f"Saving model at {model_ckpt}, optimizer at {optimizer_ckpt}")
                model.save_weights(model_ckpt)
                # model.load_weights(model_ckpt)

                optimizer_weights = optimizer.get_weights()
                np.save(optimizer_ckpt, optimizer_weights)
                # optimizer.set_weights(optimizer_weights)

            if do_validation:
                val_loss, val_mlm_loss, val_mlm_acc, val_sop_loss, val_sop_acc = run_validation(
                    model=model,
                    validation_dataset=validation_dataset,
                    skip_sop=skip_sop,
                    skip_mlm=skip_mlm,
                )
                description = f"Loss: {val_loss:.3f}, MLM: {val_mlm_loss:.3f}, SOP: {val_sop_loss:.3f}, MLM_acc: {val_mlm_acc:.3f}, SOP_acc: {val_sop_acc:.3f}"
                logger.info(f"Validation step {i} -- {description}")

            # Create summary_writer after the first step
            if summary_writer is None:
                summary_writer = tf.summary.create_file_writer(
                    os.path.join(path_args.filesystem_prefix,
                                 path_args.log_dir, run_name))
                config = {
                    **asdict(model_args),
                    **asdict(data_args),
                    **asdict(train_args),
                    **asdict(log_args),
                    "global_batch_size":
                        train_args.per_gpu_batch_size * hvd.size(),
                }
                if is_wandb_available():
                    wandb.init(config=config, project=model_args.model_type)
                    wandb.run.save()
                    wandb_run_name = wandb.run.name

            train_metrics = {
                "weight_norm": weight_norm,
                "grad_norm": grad_norm,
                "loss_scale": loss_scale,
                "learning_rate": learning_rate,
                "train/loss": loss,
                "train/mlm_loss": mlm_loss,
                "train/mlm_acc": mlm_acc,
                "train/sop_loss": sop_loss,
                "train/sop_acc": sop_acc,
            }
            all_metrics = {**train_metrics}
            if do_validation:
                val_metrics = {
                    "val/loss": val_loss,
                    "val/mlm_loss": val_mlm_loss,
                    "val/mlm_acc": val_mlm_acc,
                    "val/sop_loss": val_sop_loss,
                    "val/sop_acc": val_sop_acc,
                }
                all_metrics = {**all_metrics, **val_metrics}
            if do_squad:
                squad_metrics = {
                    "squad/f1": squad_f1,
                    "squad/exact": squad_exact,
                }
                all_metrics = {**all_metrics, **squad_metrics}

            # Log to TensorBoard
            with summary_writer.as_default():
                for name, val in all_metrics.items():
                    tf.summary.scalar(name, val, step=i)
            # Log to Weights & Biases
            if is_wandb_available():
                wandb.log({"step": i, **all_metrics})

        i += 1
        if is_final_step:
            break

    if hvd.rank() == 0:
        pbar.close()
        logger.info(f"Finished pretraining, job name {run_name}")
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.AdamOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable
        # states from rank 0 to all other processes. This is necessary to
        # ensure consistent initialization of all workers when training is
        # started with random weights or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step,
                                            'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train, y_train,
                                                     batch_size=100)

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
def train_main(dataset,
               model_name='117M',
               seed=None,
               batch_size=2,
               sample_length=1023,
               sample_num=1,
               sample_every=4500,
               run_name='run1',
               restore_from='latest',
               save_every=2000,
               combine=50000):

    enc = encoder.get_encoder(model_name)
    hparams = model.default_hparams()
    with open(os.path.join('models', model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if sample_length is None:
        sample_length = hparams.n_ctx // 2
    elif sample_length > hparams.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         hparams.n_ctx)

    # TF config
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        context = tf.placeholder(tf.int32, [batch_size, None])
        np.random.seed(seed)
        tf.set_random_seed(seed)
        output = model.model(hparams=hparams, X=context)
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=context[:, 1:], logits=output['logits'][:, :-1]))

        tf_sample = sample.sample_sequence(hparams=hparams,
                                           length=sample_length,
                                           context=context,
                                           batch_size=batch_size,
                                           temperature=0.8,
                                           top_k=40)

        train_vars = [v for v in tf.trainable_variables() if 'model' in v.name]

        opt = tf.train.AdamOptimizer()
        opt = hvd.DistributedOptimizer(opt)
        train_op = opt.minimize(loss, var_list=train_vars)

        # Horovod: broadcast initial variable states from rank 0 to all other
        # processes. This is necessary to ensure consistent initialization of
        # all workers when training is started with random weights or restored
        # from a checkpoint.
        bcast = hvd.broadcast_global_variables(0)

        saver = tf.train.Saver(var_list=train_vars,
                               max_to_keep=5,
                               keep_checkpoint_every_n_hours=2)

        sess.run(tf.global_variables_initializer())

        if restore_from == 'latest':
            ckpt = tf.train.latest_checkpoint(
                os.path.join(CHECKPOINT_DIR, run_name))
            if ckpt is None:
                # Get fresh GPT weights if new run.
                ckpt = tf.train.latest_checkpoint(
                    os.path.join('models', model_name))
        elif restore_from == 'fresh':
            ckpt = tf.train.latest_checkpoint(
                os.path.join('models', model_name))
        else:
            ckpt = tf.train.latest_checkpoint(restore_from)
        print(str(hvd.local_rank()), 'Loading checkpoint', ckpt)
        saver.restore(sess, ckpt)

        bcast.run()

        print(str(hvd.local_rank()), 'Loading dataset...')
        chunks = load_dataset(enc, dataset, combine)
        data_sampler = Sampler(chunks)
        print(str(hvd.local_rank()), 'dataset has', data_sampler.total_size,
              'tokens')
        print(str(hvd.local_rank()), 'Training...')

        counter = 1
        if os.path.exists(os.path.join(CHECKPOINT_DIR, run_name, 'counter')):
            # Load the step number if we're resuming a run
            # Add 1 so we don't immediately try to save again
            with open(os.path.join(CHECKPOINT_DIR, run_name, 'counter'),
                      'r') as fp:
                counter = int(fp.read()) + 1

        def save():
            maketree(os.path.join(CHECKPOINT_DIR, run_name))
            print(
                'Saving',
                os.path.join(CHECKPOINT_DIR, run_name,
                             'model-{}').format(counter))
            saver.save(sess,
                       os.path.join(CHECKPOINT_DIR, run_name, 'model'),
                       global_step=counter)
            with open(os.path.join(CHECKPOINT_DIR, run_name, 'counter'),
                      'w') as fp:
                fp.write(str(counter) + '\n')

        def generate_samples():
            context_tokens = data_sampler.sample(1)
            all_text = []
            index = 0
            while index < sample_num:
                out = sess.run(
                    tf_sample,
                    feed_dict={context: batch_size * [context_tokens]})
                for i in range(min(sample_num - index, batch_size)):
                    text = enc.decode(out[i])
                    text = '======== SAMPLE {} ========\n{}\n'.format(
                        index + 1, text)
                    all_text.append(text)
                    index += 1
                print(text)
            maketree(os.path.join(SAMPLE_DIR, run_name))
            with open(
                    os.path.join(SAMPLE_DIR, run_name,
                                 'samples-{}').format(counter), 'w') as fp:
                fp.write('\n'.join(all_text))

        avg_loss = (0.0, 0.0)
        start_time = time.time()

        try:
            while True:
                batch = [data_sampler.sample(1024) for _ in range(batch_size)]

                _, lv = sess.run((train_op, loss), feed_dict={context: batch})

                avg_loss = (avg_loss[0] * 0.99 + lv, avg_loss[1] * 0.99 + 1.0)

                if hvd.rank() == 0:
                    if counter % save_every == 0:
                        save()
                    if counter % sample_every == 0:
                        generate_samples()

                    print(
                        '[{counter} | {time:2.2f}] loss={loss:2.2f} avg={avg:2.2f}'
                        .format(counter=counter,
                                time=time.time() - start_time,
                                loss=lv,
                                avg=avg_loss[0] / avg_loss[1]))

                counter += 1
        except KeyboardInterrupt:
            print('interrupted')
            if hvd.rank() == 0:
                save()
def train_eval_fn(FLAGS, worker_count, task_index, is_chief, target,
                  init_checkpoint, train_file, dev_file, checkpoint_dir,
                  is_debug, **kargs):

    graph = tf.Graph()
    with graph.as_default():
        import json

        # config = model_config_parser(FLAGS)
        print(FLAGS.train_size)

        if FLAGS.if_shard == "0":
            train_size = FLAGS.train_size
            epoch = int(FLAGS.epoch / worker_count)
        elif FLAGS.if_shard == "1":
            train_size = int(FLAGS.train_size / worker_count)
            epoch = FLAGS.epoch
        else:
            train_size = int(FLAGS.train_size / worker_count)
            epoch = FLAGS.epoch

        multi_task_config = Bunch(
            json.load(tf.gfile.Open(FLAGS.multi_task_config)))

        num_train_steps = int(train_size / FLAGS.batch_size * epoch)
        num_warmup_steps = int(num_train_steps * 0.1)
        num_storage_steps = int(train_size / FLAGS.batch_size)

        num_eval_steps = int(FLAGS.eval_size / FLAGS.batch_size)

        if is_debug == "0":
            num_storage_steps = 190
            num_eval_steps = 100
            num_train_steps = 200

        print("num_train_steps {}, num_eval_steps {}, num_storage_steps {}".
              format(num_train_steps, num_eval_steps, num_storage_steps))

        print(" model type {}".format(FLAGS.model_type))

        print(num_train_steps, num_warmup_steps, "=============")
        print("==init lr==", FLAGS.init_lr)

        opt_config = Bunch({
            "init_lr": FLAGS.init_lr,
            "num_train_steps": num_train_steps,
            "num_warmup_steps": num_warmup_steps,
            "worker_count": worker_count,
            "opt_type": FLAGS.opt_type,
            "is_chief": is_chief,
            "train_op": kargs.get("train_op", "adam"),
            "decay": kargs.get("decay", "no"),
            "warmup": kargs.get("warmup", "no"),
            "grad_clip": kargs.get("grad_clip", "global_norm"),
            "clip_norm": kargs.get("clip_norm", 1000.0),
            "opt_ema": kargs.get("opt_ema", "no")
        })

        anneal_config = Bunch({
            "initial_value": 1.0,
            "num_train_steps": num_train_steps
        })

        model_io_config = Bunch({
            "fix_lm": False,
            "ema_saver": kargs.get("opt_ema", "no")
        })

        if FLAGS.opt_type == "hvd" and hvd:
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        elif FLAGS.opt_type == "all_reduce":
            checkpoint_dir = checkpoint_dir
        elif FLAGS.opt_type == "collective_reduce":
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        elif FLAGS.opt_type == "ps" or FLAGS.opt_type == "ps_sync":
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        print("==checkpoint_dir==", checkpoint_dir, is_chief)

        model_config_dict = {}
        num_labels_dict = {}
        init_checkpoint_dict = {}
        load_pretrained_dict = {}
        exclude_scope_dict = {}
        not_storage_params_dict = {}
        target_dict = {}
        task_type_dict = {}
        model_type_lst = []
        label_dict = {}

        for task_type in FLAGS.multi_task_type.split(","):
            print("==task type==", task_type)
            multi_task_config[task_type]['buckets'] = FLAGS.buckets
            multi_task_config[task_type]['w2v_path'] = FLAGS.w2v_path
            model_config_dict[task_type] = model_config_parser(
                Bunch(multi_task_config[task_type]))
            num_labels_dict[task_type] = multi_task_config[task_type][
                "num_labels"]
            init_checkpoint_dict[task_type] = os.path.join(
                FLAGS.buckets,
                multi_task_config[task_type]["init_checkpoint"])
            load_pretrained_dict[task_type] = multi_task_config[task_type][
                "load_pretrained"]
            exclude_scope_dict[task_type] = multi_task_config[task_type][
                "exclude_scope"]
            not_storage_params_dict[task_type] = multi_task_config[task_type][
                "not_storage_params"]
            target_dict[task_type] = multi_task_config[task_type]["target"]
            task_type_dict[task_type] = multi_task_config[task_type][
                "task_type"]
            label_dict[task_type] = json.load(
                tf.gfile.Open(
                    os.path.join(FLAGS.buckets,
                                 multi_task_config[task_type]["label_id"])))

        model_fn = multitask_model_fn(
            model_config_dict,
            num_labels_dict,
            task_type_dict,
            init_checkpoint_dict,
            load_pretrained_dict=load_pretrained_dict,
            opt_config=opt_config,
            model_io_config=model_io_config,
            exclude_scope_dict=exclude_scope_dict,
            not_storage_params_dict=not_storage_params_dict,
            target_dict=target_dict,
            output_type="estimator",
            checkpoint_dir=checkpoint_dir,
            num_storage_steps=num_storage_steps,
            anneal_config=anneal_config,
            task_layer_reuse=None,
            model_type_lst=model_type_lst,
            task_invariant=FLAGS.task_invariant,
            multi_task_config=multi_task_config,
            flags=FLAGS,
            **kargs)

        print("==succeeded in building model==")

        name_to_features = data_interface_dual_encoder(
            FLAGS, multi_task_config, FLAGS.multi_task_type.split(","))

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example.
            """
            example = tf.parse_single_example(record, name_to_features)

            # tf.Example only supports tf.int64, but the TPU only supports
            # tf.int32. So cast all int64 to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t
            return example

        def _decode_batch_record(record, name_to_features):
            example = tf.parse_example(record, name_to_features)
            return example

        params = Bunch({})
        params.epoch = epoch
        params.batch_size = FLAGS.batch_size

        # data_prior = []
        # for task_type in FLAGS.multi_task_type.split(","):
        #     data_prior.append(multi_task_config[task_type].get("data_prior", None))
        # if None in data_prior:
        #     data_prior = None
        # else:
        #     print("===task prior===", data_prior)

        if kargs.get("parse_type", "parse_single") == "parse_single":
            train_file_lst = [
                multi_task_config[task_type]["train_result_file"]
                for task_type in FLAGS.multi_task_type.split(",")
            ]
            print(train_file_lst)
            train_features = lambda: tf_data_utils.all_reduce_multitask_train_input_fn(
                train_file_lst,
                _decode_record,
                name_to_features,
                params,
                if_shard=FLAGS.if_shard,
                worker_count=worker_count,
                task_index=task_index)
        elif kargs.get("parse_type", "parse_single") == "parse_batch":
            train_file_path_lst = []
            data_prior = []
            # train_file_lst = [multi_task_config[task_type]["train_result_file"]
            #                   for task_type in FLAGS.multi_task_type.split(",")]
            # train_file_path_lst = [os.path.join(FLAGS.buckets, train_file)
            #                        for train_file in train_file_lst]
            for task_type in FLAGS.multi_task_type.split(","):
                task_prior = multi_task_config[task_type].get(
                    "data_prior", None)
                task_path = multi_task_config[task_type][
                    "train_result_file"].split(',')
                for task_sub_path in task_path:
                    train_file_path_lst.append(
                        os.path.join(FLAGS.buckets, task_sub_path))
                    data_prior.append(task_prior)
            print(train_file_path_lst)
            if None in data_prior:
                data_prior = None
            else:
                print("===task prior===", data_prior)

            # train_features = lambda: tf_data_utils.all_reduce_train_batch_input_fn(
            #     train_file_path_lst,
            #     _decode_batch_record,
            #     name_to_features,
            #     params,
            #     if_shard=FLAGS.if_shard,
            #     worker_count=worker_count,
            #     task_index=task_index)

            train_features = lambda: tf_data_utils.all_reduce_multitask_train_batch_input_fn_sample(
                train_file_path_lst,
                _decode_record,
                name_to_features,
                params,
                data_prior=data_prior,
                if_shard=FLAGS.if_shard,
                worker_count=worker_count,
                task_index=task_index)
        # elif kargs.get("parse_type", "parse_single") == "generator":
        #     def train_features(): return train_eval_input_fn(
        #         FLAGS, multi_task_config, "train", 0)

        print("==succeeded in building data and model==")
        print("start training")

        train_hooks = []

        sess_config = tf.ConfigProto(allow_soft_placement=False,
                                     log_device_placement=False)
        if FLAGS.opt_type == "ps" or FLAGS.opt_type == "ps_sync":
            print("==no need for hook==")
        elif FLAGS.opt_type == "pai_soar" and pai:
            print("no need for hook")
        elif FLAGS.opt_type == "hvd" and hvd:
            sess_config.gpu_options.allow_growth = True
            sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
            print("==no need for hook==")
        else:
            print("==no need for hooks==")

        if kargs.get("run_config", None):
            run_config = kargs.get("run_config", None)
            run_config = run_config.replace(
                save_checkpoints_steps=num_storage_steps)
            print("==run config==", run_config.save_checkpoints_steps)
        else:
            run_config = tf.estimator.RunConfig(
                model_dir=checkpoint_dir,
                save_checkpoints_steps=num_storage_steps,
                session_config=sess_config)

        if kargs.get("profiler", "profiler") == "profiler":
            if checkpoint_dir:
                hooks = tf.train.ProfilerHook(
                    save_steps=100,
                    save_secs=None,
                    output_dir=os.path.join(checkpoint_dir, "profiler"),
                )
                train_hooks.append(hooks)
                print("==add profiler hooks==")

        model_estimator = tf.estimator.Estimator(model_fn=model_fn,
                                                 config=run_config)

        print("==finish build model estimator==")

        train_being_time = time.time()
        tf.logging.info("==training distribution_strategy=={}".format(
            kargs.get("distribution_strategy", "MirroredStrategy")))
        if kargs.get("distribution_strategy",
                     "MirroredStrategy") == "MirroredStrategy":
            print("==apply single machine multi-card training==")
            model_estimator.train(input_fn=train_features,
                                  max_steps=num_train_steps)
            train_end_time = time.time()
            print("==training time==", train_end_time - train_being_time)
            tf.logging.info("==training time=={}".format(train_end_time -
                                                         train_being_time))
        elif kargs.get("distribution_strategy", "MirroredStrategy") in [
                "ParameterServerStrategy", "CollectiveAllReduceStrategy"
        ]:
            print("==apply multi-machine multi-card training==")
            try:
                print(os.environ['TF_CONFIG'], "==tf_run_config==")
            except KeyError:
                print("==no tf config==")
            train_spec = tf.estimator.TrainSpec(input_fn=train_features,
                                                max_steps=num_train_steps)
            eval_spec = tf.estimator.EvalSpec(input_fn=eval_features,
                                              steps=num_eval_steps)
            tf.estimator.train_and_evaluate(model_estimator, train_spec,
                                            eval_spec)
            train_end_time = time.time()
            print("==training time==", train_end_time - train_being_time)
def train_eval_fn(FLAGS, worker_count, task_index, is_chief, target,
                  init_checkpoint, train_file, dev_file, checkpoint_dir,
                  is_debug, **kargs):
    graph = tf.Graph()
    with graph.as_default():
        import json
        config = json.load(open(FLAGS.config_file, "r"))
        config = Bunch(config)
        config.use_one_hot_embeddings = True
        config.scope = "bert"
        config.dropout_prob = 0.1
        config.label_type = "single_label"
        config.model = FLAGS.model_type
        config.init_lr = 3e-4
        config.ln_type = FLAGS.ln_type
        config.loss = 'entropy'
        print('==init learning rate==', config.init_lr)

        if FLAGS.if_shard == "0":
            train_size = FLAGS.train_size
            epoch = int(FLAGS.epoch / worker_count)
        elif FLAGS.if_shard == "1":
            train_size = int(FLAGS.train_size / worker_count)
            epoch = FLAGS.epoch
        else:
            train_size = int(FLAGS.train_size / worker_count)
            epoch = FLAGS.epoch

        init_lr = config.init_lr
        label_dict = json.load(tf.gfile.Open(FLAGS.label_id))

        num_train_steps = int(train_size / FLAGS.batch_size * epoch)
        num_warmup_steps = int(num_train_steps * 0.1)
        num_storage_steps = int(train_size / FLAGS.batch_size)
        num_eval_steps = int(FLAGS.eval_size / FLAGS.batch_size)
        if is_debug == "0":
            num_storage_steps = 2
            num_eval_steps = 10
            num_train_steps = 10

        print("num_train_steps {}, num_eval_steps {}, num_storage_steps {}".format(
            num_train_steps, num_eval_steps, num_storage_steps))
        print(" model type {}".format(FLAGS.model_type))
        print(num_train_steps, num_warmup_steps, "=============")

        if worker_count * kargs.get("num_gpus", 1) >= 2:
            clip_norm_scale = 1.0
            lr_scale = 0.75
        else:
            clip_norm_scale = 1.0
            lr_scale = 1.0
        lr = init_lr * worker_count * kargs.get("num_gpus", 1) * lr_scale
        # the scaled lr above is deliberately overridden with the config value
        lr = config.init_lr
        print('--training learning rate--', lr)

        opt_config = Bunch({
            "init_lr": lr,
            "num_train_steps": num_train_steps,
            "num_warmup_steps": num_warmup_steps,
            "worker_count": worker_count,
            "opt_type": FLAGS.opt_type,
            "is_chief": is_chief,
            "train_op": kargs.get("train_op", "adam"),
            "decay": kargs.get("decay", "no"),
            "warmup": kargs.get("warmup", "no"),
            "clip_norm": 1
        })
        anneal_config = Bunch({
            "initial_value": 1.0,
            "num_train_steps": num_train_steps
        })
        model_io_config = Bunch({"fix_lm": False})
        model_io_fn = model_io.ModelIO(model_io_config)

        num_classes = FLAGS.num_classes
        print("==current task_index==", task_index)

        if FLAGS.opt_type == "hvd" and hvd:
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        elif FLAGS.opt_type == "all_reduce":
            checkpoint_dir = checkpoint_dir
        elif FLAGS.opt_type == "collective_reduce":
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        elif FLAGS.opt_type == "ps" or FLAGS.opt_type == "ps_sync":
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        print("==checkpoint_dir==", checkpoint_dir, is_chief)

        model_fn = model_fn_builder(config,
                                    num_classes,
                                    init_checkpoint,
                                    model_reuse=None,
                                    load_pretrained=FLAGS.load_pretrained,
                                    model_io_config=model_io_config,
                                    opt_config=opt_config,
                                    model_io_fn=model_io_fn,
                                    exclude_scope="",
                                    not_storage_params=[],
                                    target=kargs.get("input_target", ""),
                                    output_type="estimator",
                                    checkpoint_dir=checkpoint_dir,
                                    num_storage_steps=num_storage_steps,
                                    task_index=task_index,
                                    anneal_config=anneal_config,
                                    **kargs)

        name_to_features = {
            "input_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "input_mask": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "segment_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "label_ids": tf.FixedLenFeature([], tf.int64),
        }

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example."""
            example = tf.parse_single_example(record, name_to_features)
            # tf.Example only supports tf.int64, but the TPU only supports
            # tf.int32, so cast all int64 to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t
            return example

        def _decode_batch_record(record, name_to_features):
            example = tf.parse_example(record, name_to_features)
            return example

        params = Bunch({})
        params.epoch = FLAGS.epoch
        params.batch_size = FLAGS.batch_size

        train_features = lambda: tf_data_utils.all_reduce_train_batch_input_fn(
            train_file,
            _decode_batch_record,
            name_to_features,
            params,
            if_shard=FLAGS.if_shard,
            worker_count=worker_count,
            task_index=task_index)
        eval_features = lambda: tf_data_utils.all_reduce_eval_batch_input_fn(
            dev_file,
            _decode_batch_record,
            name_to_features,
            params,
            if_shard=FLAGS.if_shard,
            worker_count=worker_count,
            task_index=task_index)

        sess_config = tf.ConfigProto(allow_soft_placement=False,
                                     log_device_placement=False)
        if FLAGS.opt_type == "ps" or FLAGS.opt_type == "ps_sync":
            print("==no need for hook==")
        elif FLAGS.opt_type == "pai_soar" and pai:
            print("==no need for hook==")
        elif FLAGS.opt_type == "hvd" and hvd:
            sess_config.gpu_options.allow_growth = True
            sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
            print("==no need for hook==")
        else:
            print("==no need for hook==")

        if kargs.get("run_config", None):
            run_config = kargs.get("run_config", None)
            run_config = run_config.replace(
                save_checkpoints_steps=num_storage_steps)
            print("==run config==", run_config.save_checkpoints_steps)
        else:
            run_config = tf.estimator.RunConfig(
                model_dir=checkpoint_dir,
                save_checkpoints_steps=num_storage_steps,
                session_config=sess_config)

        train_hooks = []
        if kargs.get("profiler", "profiler") == "profiler":
            if checkpoint_dir:
                hooks = tf.train.ProfilerHook(
                    save_steps=100,
                    save_secs=None,
                    output_dir=os.path.join(checkpoint_dir, "profiler"))
                train_hooks.append(hooks)
                print("==add profiler hooks==")

        model_estimator = tf.estimator.Estimator(model_fn=model_fn,
                                                 model_dir=checkpoint_dir,
                                                 config=run_config)

        train_begin_time = time.time()
        tf.logging.info("==training distribution_strategy=={}".format(
            kargs.get("distribution_strategy", "MirroredStrategy")))
        if kargs.get("distribution_strategy", "MirroredStrategy") == "MirroredStrategy":
            print("==apply single machine multi-card training==")
            model_estimator.train(input_fn=train_features,
                                  max_steps=num_train_steps,
                                  hooks=train_hooks)
            train_end_time = time.time()
            print("==training time==", train_end_time - train_begin_time)
            tf.logging.info("==training time=={}".format(
                train_end_time - train_begin_time))
            eval_results = model_estimator.evaluate(input_fn=eval_features,
                                                    steps=num_eval_steps)
            # print(eval_results)
        elif kargs.get("distribution_strategy", "MirroredStrategy") in [
                "ParameterServerStrategy", "CollectiveAllReduceStrategy"
        ]:
            print("==apply multi-machine multi-card training==")
            try:
                print(os.environ['TF_CONFIG'], "==tf_run_config==")
            except KeyError:
                print("==no tf config==")
            train_spec = tf.estimator.TrainSpec(input_fn=train_features,
                                                max_steps=num_train_steps)
            eval_spec = tf.estimator.EvalSpec(input_fn=eval_features,
                                              steps=num_eval_steps)
            tf.estimator.train_and_evaluate(model_estimator, train_spec, eval_spec)
            train_end_time = time.time()
            print("==training time==", train_end_time - train_begin_time)
def _Horovod():
    # Lazy import so modules that never use Horovod don't pay for it.
    import horovod.tensorflow as hvd
    return hvd.local_rank()
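# A minimal sketch (not from the original source) of how a lazy-import helper
# like `_Horovod()` is typically used: the returned local rank pins one GPU
# per process, matching the other snippets in this collection.
# `build_session` is a hypothetical name introduced here for illustration.
def build_session():
    local_rank = _Horovod()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # Pin GPU to local rank (one GPU per process)
    config.gpu_options.visible_device_list = str(local_rank)
    return tf.Session(config=config)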
def train_eval_fn(FLAGS, worker_count, task_index, is_chief, target,
                  init_checkpoint, train_file, dev_file, checkpoint_dir,
                  is_debug):
    graph = tf.Graph()
    with graph.as_default():
        import json
        config = json.load(open(FLAGS.config_file, "r"))
        config = Bunch(config)
        config.use_one_hot_embeddings = True
        config.scope = "bert"
        config.dropout_prob = 0.1
        config.label_type = "single_label"

        if FLAGS.if_shard == "0":
            train_size = FLAGS.train_size
            epoch = int(FLAGS.epoch / worker_count)
        elif FLAGS.if_shard == "1":
            train_size = int(FLAGS.train_size / worker_count)
            epoch = FLAGS.epoch
        else:
            # fall back to sharded sizing, matching the newer variants above
            train_size = int(FLAGS.train_size / worker_count)
            epoch = FLAGS.epoch

        init_lr = 2e-5
        label_dict = json.load(open(FLAGS.label_id))

        num_train_steps = int(train_size / FLAGS.batch_size * epoch)
        num_warmup_steps = int(num_train_steps * 0.1)
        num_storage_steps = int(train_size / FLAGS.batch_size)
        num_eval_steps = int(FLAGS.eval_size / FLAGS.batch_size)
        if is_debug == "0":
            num_storage_steps = 2
            num_eval_steps = 10
            num_train_steps = 10

        print("num_train_steps {}, num_eval_steps {}, num_storage_steps {}".format(
            num_train_steps, num_eval_steps, num_storage_steps))
        print(" model type {}".format(FLAGS.model_type))
        print(num_train_steps, num_warmup_steps, "=============")

        opt_config = Bunch({
            "init_lr": init_lr / worker_count,
            "num_train_steps": num_train_steps,
            "num_warmup_steps": num_warmup_steps,
            "worker_count": worker_count,
            "opt_type": FLAGS.opt_type
        })
        model_io_config = Bunch({"fix_lm": False})

        num_classes = FLAGS.num_classes
        model_fn = model_fn_builder(config,
                                    num_classes,
                                    init_checkpoint,
                                    model_reuse=None,
                                    load_pretrained=True,
                                    model_io_config=model_io_config,
                                    opt_config=opt_config,
                                    exclude_scope="",
                                    not_storage_params=[],
                                    target="",
                                    output_type="estimator")

        name_to_features = {
            "input_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "input_mask": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "segment_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "label_ids": tf.FixedLenFeature([], tf.int64),
        }

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example."""
            example = tf.parse_single_example(record, name_to_features)
            # tf.Example only supports tf.int64, but the TPU only supports
            # tf.int32, so cast all int64 to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t
            return example

        params = Bunch({})
        params.epoch = FLAGS.epoch
        params.batch_size = FLAGS.batch_size

        train_features = lambda: tf_data_utils.train_input_fn(
            train_file, _decode_record, name_to_features, params,
            if_shard=FLAGS.if_shard, worker_count=worker_count,
            task_index=task_index)
        eval_features = lambda: tf_data_utils.eval_input_fn(
            dev_file, _decode_record, name_to_features, params,
            if_shard=FLAGS.if_shard, worker_count=worker_count,
            task_index=task_index)

        print("===========begin to train============")
        sess_config = tf.ConfigProto(allow_soft_placement=False,
                                     log_device_placement=False)
        checkpoint_dir = checkpoint_dir if task_index == 0 else None
        print("start training")

        hooks = []
        if FLAGS.opt_type == "ps":
            sync_replicas_hook = optimizer_fn.opt.make_session_run_hook(
                is_chief, num_tokens=0)
            hooks.append(sync_replicas_hook)
        elif FLAGS.opt_type == "pai_soar" and pai:
            sess = tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=is_chief,
                config=sess_config,
                hooks=hooks,
                checkpoint_dir=checkpoint_dir,
                save_checkpoint_steps=num_storage_steps)
        elif FLAGS.opt_type == "hvd" and hvd:
            bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
            hooks.append(bcast_hook)
            sess_config.gpu_options.allow_growth = True
            sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
        else:
            print("==not supported==")

        run_config = tf.estimator.RunConfig(
            model_dir=checkpoint_dir,
            save_checkpoints_steps=num_storage_steps,
            session_config=sess_config)
        model_estimator = tf.estimator.Estimator(model_fn=model_fn,
                                                 config=run_config)
        train_spec = tf.estimator.TrainSpec(input_fn=train_features,
                                            max_steps=num_train_steps)
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_features,
                                          steps=num_eval_steps)
        tf.estimator.train_and_evaluate(model_estimator, train_spec, eval_spec)
def main():
    """
    Run training/evaluation
    """
    script_start = time.time()
    hvd_init()
    mpi_comm = MPI.COMM_WORLD
    args = parse_args()

    if hvd.rank() == 0:
        log_args(args)

    if args.seed is not None:
        tf.random.set_random_seed(args.seed)
        np.random.seed(args.seed)
        cp.random.seed(args.seed)

    if args.amp:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"
    if "TF_ENABLE_AUTO_MIXED_PRECISION" in os.environ \
            and os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] == "1":
        args.fp16 = False

    # Directory to store/read final checkpoint
    if args.mode == 'train' and hvd.rank() == 0:
        LOGGER.log("Saving best checkpoint to {}".format(args.checkpoint_dir))
    if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir != '':
        os.makedirs(args.checkpoint_dir, exist_ok=True)
    final_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.ckpt')

    # Load converted data and get statistics
    train_df = pd.read_pickle(args.data + '/train_ratings.pickle')
    test_df = pd.read_pickle(args.data + '/test_ratings.pickle')
    nb_users, nb_items = train_df.max() + 1

    # Extract train and test feature tensors from dataframe
    pos_train_users = train_df.iloc[:, 0].values.astype(np.int32)
    pos_train_items = train_df.iloc[:, 1].values.astype(np.int32)
    pos_test_users = test_df.iloc[:, 0].values.astype(np.int32)
    pos_test_items = test_df.iloc[:, 1].values.astype(np.int32)

    # Negatives indicator for negatives generation
    # (plain `bool`; `np.bool` is removed in recent NumPy releases)
    neg_mat = np.ones((nb_users, nb_items), dtype=bool)
    neg_mat[pos_train_users, pos_train_items] = 0

    # Get the local training/test data
    train_users, train_items, train_labels = get_local_train_data(
        pos_train_users, pos_train_items, args.negative_samples)
    test_users, test_items = get_local_test_data(pos_test_users, pos_test_items)

    # Create and run Data Generator in a separate thread
    data_generator = DataGenerator(
        args.seed,
        hvd.rank(),
        nb_users,
        nb_items,
        neg_mat,
        train_users,
        train_items,
        train_labels,
        args.batch_size // hvd.size(),
        args.negative_samples,
        test_users,
        test_items,
        args.valid_users_per_batch,
        args.valid_negative,
    )

    # Create tensorflow session and saver
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # Horovod: pin GPU to local rank (one GPU per process)
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    if args.xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    sess = tf.Session(config=config)

    # Input tensors
    users = tf.placeholder(tf.int32, shape=(None,))
    items = tf.placeholder(tf.int32, shape=(None,))
    labels = tf.placeholder(tf.int32, shape=(None,))
    is_dup = tf.placeholder(tf.float32, shape=(None,))
    dropout = tf.placeholder_with_default(args.dropout, shape=())

    # Model ops and saver
    hit_rate, ndcg, eval_op, train_op = ncf_model_ops(
        users,
        items,
        labels,
        is_dup,
        params={
            'fp16': args.fp16,
            'val_batch_size': args.valid_negative + 1,
            'top_k': args.topk,
            'learning_rate': args.learning_rate,
            'beta_1': args.beta1,
            'beta_2': args.beta2,
            'epsilon': args.eps,
            'num_users': nb_users,
            'num_items': nb_items,
            'num_factors': args.factors,
            'mf_reg': 0,
            'layer_sizes': args.layers,
            'layer_regs': [0. for i in args.layers],
            'dropout': dropout,
            'sigmoid': True,
            'loss_scale': args.loss_scale
        },
        mode='TRAIN' if args.mode == 'train' else 'EVAL')
    saver = tf.train.Saver()

    # Accuracy metric tensors
    hr_sum = tf.get_default_graph().get_tensor_by_name('neumf/hit_rate/total:0')
    hr_cnt = tf.get_default_graph().get_tensor_by_name('neumf/hit_rate/count:0')
    ndcg_sum = tf.get_default_graph().get_tensor_by_name('neumf/ndcg/total:0')
    ndcg_cnt = tf.get_default_graph().get_tensor_by_name('neumf/ndcg/count:0')

    # Prepare evaluation data
    data_generator.prepare_eval_data()

    if args.load_checkpoint_path:
        saver.restore(sess, args.load_checkpoint_path)
    else:
        # Manually initialize weights
        sess.run(tf.global_variables_initializer())

    # If test mode, run one eval
    if args.mode == 'test':
        sess.run(tf.local_variables_initializer())
        eval_start = time.time()
        for user_batch, item_batch, dup_batch \
                in zip(data_generator.eval_users,
                       data_generator.eval_items,
                       data_generator.dup_mask):
            sess.run(eval_op,
                     feed_dict={
                         users: user_batch,
                         items: item_batch,
                         is_dup: dup_batch,
                         dropout: 0.0
                     })
        eval_duration = time.time() - eval_start

        # Report results
        hit_rate_sum = sess.run(hvd.allreduce(hr_sum, average=False))
        hit_rate_cnt = sess.run(hvd.allreduce(hr_cnt, average=False))
        ndcg_sum = sess.run(hvd.allreduce(ndcg_sum, average=False))
        ndcg_cnt = sess.run(hvd.allreduce(ndcg_cnt, average=False))

        hit_rate = hit_rate_sum / hit_rate_cnt
        ndcg = ndcg_sum / ndcg_cnt

        if hvd.rank() == 0:
            LOGGER.log("Eval Time: {:.4f}, HR: {:.4f}, NDCG: {:.4f}".format(
                eval_duration, hit_rate, ndcg))
            eval_throughput = pos_test_users.shape[0] * (args.valid_negative + 1) / eval_duration
            LOGGER.log('Average Eval Throughput: {:.4f}'.format(eval_throughput))
        return

    # Performance Metrics
    train_times = list()
    eval_times = list()
    # Accuracy Metrics
    first_to_target = None
    time_to_train = 0.0
    time_to_best = 0.0
    best_hr = 0
    best_epoch = 0
    # Buffers for global metrics
    global_hr_sum = np.ones(1)
    global_hr_count = np.ones(1)
    global_ndcg_sum = np.ones(1)
    global_ndcg_count = np.ones(1)
    # Buffers for local metrics
    local_hr_sum = np.ones(1)
    local_hr_count = np.ones(1)
    local_ndcg_sum = np.ones(1)
    local_ndcg_count = np.ones(1)

    # Begin training
    begin_train = time.time()
    if hvd.rank() == 0:
        LOGGER.log("Begin Training. Setup Time: {}".format(begin_train - script_start))

    for epoch in range(args.epochs):
        # Train for one epoch
        train_start = time.time()
        data_generator.prepare_train_data()
        for user_batch, item_batch, label_batch \
                in zip(data_generator.train_users_batches,
                       data_generator.train_items_batches,
                       data_generator.train_labels_batches):
            sess.run(train_op,
                     feed_dict={
                         users: user_batch.get(),
                         items: item_batch.get(),
                         labels: label_batch.get()
                     })
        train_duration = time.time() - train_start
        # Only log "warm" epochs
        if epoch >= 1:
            train_times.append(train_duration)

        # Evaluate
        if epoch > args.eval_after:
            eval_start = time.time()
            sess.run(tf.local_variables_initializer())
            for user_batch, item_batch, dup_batch \
                    in zip(data_generator.eval_users,
                           data_generator.eval_items,
                           data_generator.dup_mask):
                sess.run(eval_op,
                         feed_dict={
                             users: user_batch,
                             items: item_batch,
                             is_dup: dup_batch,
                             dropout: 0.0
                         })

            # Compute local metrics
            local_hr_sum[0] = sess.run(hr_sum)
            local_hr_count[0] = sess.run(hr_cnt)
            local_ndcg_sum[0] = sess.run(ndcg_sum)
            local_ndcg_count[0] = sess.run(ndcg_cnt)
            # Reduce metrics across all workers
            mpi_comm.Reduce(local_hr_count, global_hr_count)
            mpi_comm.Reduce(local_hr_sum, global_hr_sum)
            mpi_comm.Reduce(local_ndcg_count, global_ndcg_count)
            mpi_comm.Reduce(local_ndcg_sum, global_ndcg_sum)
            # Calculate metrics
            hit_rate = global_hr_sum[0] / global_hr_count[0]
            ndcg = global_ndcg_sum[0] / global_ndcg_count[0]

            eval_duration = time.time() - eval_start
            # Only log "warm" epochs
            if epoch >= 1:
                eval_times.append(eval_duration)

            if hvd.rank() == 0:
                if args.verbose:
                    log_string = "Epoch: {:02d}, Train Time: {:.4f}, Eval Time: {:.4f}, HR: {:.4f}, NDCG: {:.4f}"
                    LOGGER.log(log_string.format(epoch, train_duration,
                                                 eval_duration, hit_rate, ndcg))

                # Update summary metrics
                if hit_rate > args.target and first_to_target is None:
                    first_to_target = epoch
                    time_to_train = time.time() - begin_train
                if hit_rate > best_hr:
                    best_hr = hit_rate
                    best_epoch = epoch
                    time_to_best = time.time() - begin_train
                    if not args.verbose:
                        log_string = "New Best Epoch: {:02d}, Train Time: {:.4f}, Eval Time: {:.4f}, HR: {:.4f}, NDCG: {:.4f}"
                        LOGGER.log(log_string.format(epoch, train_duration,
                                                     eval_duration, hit_rate, ndcg))
                    # Save, if meets target
                    if hit_rate > args.target:
                        saver.save(sess, final_checkpoint_path)

    # Final Summary
    if hvd.rank() == 0:
        train_times = np.array(train_times)
        train_throughputs = pos_train_users.shape[0] * (args.negative_samples + 1) / train_times
        eval_times = np.array(eval_times)
        eval_throughputs = pos_test_users.shape[0] * (args.valid_negative + 1) / eval_times

        LOGGER.log(' ')
        LOGGER.log('batch_size: {}'.format(args.batch_size))
        LOGGER.log('num_gpus: {}'.format(hvd.size()))
        LOGGER.log('AMP: {}'.format(1 if args.amp else 0))
        LOGGER.log('seed: {}'.format(args.seed))
        LOGGER.log('Minimum Train Time per Epoch: {:.4f}'.format(np.min(train_times)))
        LOGGER.log('Average Train Time per Epoch: {:.4f}'.format(np.mean(train_times)))
        LOGGER.log('Average Train Throughput: {:.4f}'.format(np.mean(train_throughputs)))
        LOGGER.log('Minimum Eval Time per Epoch: {:.4f}'.format(np.min(eval_times)))
        LOGGER.log('Average Eval Time per Epoch: {:.4f}'.format(np.mean(eval_times)))
        LOGGER.log('Average Eval Throughput: {:.4f}'.format(np.mean(eval_throughputs)))
        LOGGER.log('First Epoch to hit: {}'.format(first_to_target))
        LOGGER.log('Time to Train: {:.4f}'.format(time_to_train))
        LOGGER.log('Time to Best: {:.4f}'.format(time_to_best))
        LOGGER.log('Best HR: {:.4f}'.format(best_hr))
        LOGGER.log('Best Epoch: {}'.format(best_epoch))

    sess.close()
    return
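# Hypothetical entry point and launch for the NCF-style `main()` above. The
# script uses mpi4py plus Horovod, so one process is started per GPU; the
# script name and flags below are illustrative, not from the source:
#
#   mpirun -np 8 --bind-to none python ncf.py --data /path/to/cache/ml-20m
#
if __name__ == '__main__':
    main()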
def set_conf(cfg):
    print(cfg.pretty())

    # debug
    if cfg.get("enable_check_numerics"):
        print("Enabling numerics debugging...")
        tf.debugging.enable_check_numerics(stack_height_limit=30,
                                           path_length_limit=50)

    # the model we are running
    setattr(sys.modules[__name__], "MODEL_NAME", cfg.MODEL_NAME)

    seed_offset = 0
    setattr(sys.modules[__name__], "horovod_worker", False)

    # distributed setup
    if horovod:
        # Initialize Horovod
        hvd.init()
        # Pin GPU to be used to process local rank (one GPU per process)
        gpus = tf.config.experimental.list_physical_devices('GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        if gpus:
            tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
        seed_offset = hvd.rank()
        if seed_offset > 0:
            setattr(sys.modules[__name__], "horovod_worker", True)

    # RNG
    tf.random.set_seed(cfg.seed + seed_offset)
    rng_state = tf.Variable([0, 0, cfg.seed + seed_offset], dtype=tf.int64)
    setattr(sys.modules[__name__], "rng",
            tf.random.Generator.from_state(rng_state, alg='philox'))

    # RUN CONFIGURATION
    setattr(sys.modules[__name__], "N_sim_batch", cfg.N_sim_batch)
    setattr(sys.modules[__name__], "N_epochs_per_episode", cfg.N_epochs_per_episode)
    setattr(sys.modules[__name__], "N_minibatch_size", cfg.N_minibatch_size)
    setattr(sys.modules[__name__], "N_episode_length", cfg.N_episode_length)
    setattr(sys.modules[__name__], "N_episodes", cfg.N_episodes)
    setattr(sys.modules[__name__], "expectation_pseudo_draws",
            cfg.get('expectation_pseudo_draws', 5))
    setattr(sys.modules[__name__], "expectation_type",
            cfg.get('expectation_type', 'product'))

    # OUTPUT FILE FOR ERROR MEASURES
    setattr(sys.modules[__name__], "error_filename", cfg.error_filename)

    # VARIABLES
    try:
        import importlib
        variables = importlib.import_module(MODEL_NAME + ".Variables")
        config_states = variables.states
        config_policies = variables.policies
        config_definitions = variables.definitions
        config_constants = variables.constants
        # for backward compatibility, in case constants are also in the yaml
        if cfg.constants:
            config_constants.update(cfg.constants)
        print("Variables imported from Variables module")
        print(__name__)
    except ImportError:
        config_states = cfg.states
        config_policies = cfg.policies
        config_definitions = cfg.definitions
        config_constants = cfg.constants

    setattr(sys.modules[__name__], "states", [s['name'] for s in config_states])
    setattr(sys.modules[__name__], "policy_states", [s['name'] for s in config_policies])
    setattr(sys.modules[__name__], "definitions", [s['name'] for s in config_definitions])

    state_bounds = {
        "lower": {},
        "penalty_lower": {},
        "upper": {},
        "penalty_upper": {}
    }
    for s in config_states:
        if "bounds" in s.keys() and "lower" in s["bounds"].keys():
            state_bounds["lower"][s["name"]] = s["bounds"]["lower"]
            if 'penalty_lower' in s['bounds'].keys():
                penalty = s["bounds"]["penalty_lower"]
            else:
                penalty = 1 / s['bounds']['lower']**2
            state_bounds["penalty_lower"][s["name"]] = penalty
        if "bounds" in s.keys() and "upper" in s["bounds"].keys():
            state_bounds["upper"][s["name"]] = s["bounds"]["upper"]
            if 'penalty_upper' in s['bounds'].keys():
                penalty = s["bounds"]["penalty_upper"]
            else:
                penalty = 1 / s['bounds']['upper']**2
            state_bounds["penalty_upper"][s["name"]] = penalty
    setattr(sys.modules[__name__], "state_bounds_hard", state_bounds)

    policy_bounds = {
        'lower': {},
        'penalty_lower': {},
        'upper': {},
        'penalty_upper': {}
    }
    for s in config_policies:
        if 'bounds' in s.keys() and 'lower' in s['bounds'].keys():
            policy_bounds['lower'][s['name']] = s['bounds']['lower']
            if 'penalty_lower' in s['bounds'].keys():
                penalty = s["bounds"]["penalty_lower"]
            else:
                penalty = 1 / s['bounds']['lower']**2
            policy_bounds['penalty_lower'][s['name']] = penalty
        if 'bounds' in s.keys() and 'upper' in s['bounds'].keys():
            policy_bounds['upper'][s['name']] = s['bounds']['upper']
            if 'penalty_upper' in s['bounds'].keys():
                penalty = s["bounds"]["penalty_upper"]
            else:
                penalty = 1 / s['bounds']['upper']**2
            policy_bounds['penalty_upper'][s['name']] = penalty
    setattr(sys.modules[__name__], "policy_bounds_hard", policy_bounds)

    definition_bounds = {
        'lower': {},
        'penalty_lower': {},
        'upper': {},
        'penalty_upper': {}
    }
    for s in config_definitions:
        if 'bounds' in s.keys() and 'lower' in s['bounds'].keys():
            definition_bounds['lower'][s['name']] = s['bounds']['lower']
            if 'penalty_lower' in s['bounds'].keys():
                penalty = s["bounds"]["penalty_lower"]
            else:
                penalty = 1 / s['bounds']['lower']**2
            definition_bounds['penalty_lower'][s['name']] = penalty
        if 'bounds' in s.keys() and 'upper' in s['bounds'].keys():
            definition_bounds['upper'][s['name']] = s['bounds']['upper']
            if 'penalty_upper' in s['bounds'].keys():
                penalty = s["bounds"]["penalty_upper"]
            else:
                penalty = 1 / s['bounds']['upper']**2
            definition_bounds['penalty_upper'][s['name']] = penalty
    setattr(sys.modules[__name__], "definition_bounds_hard", definition_bounds)

    # NEURAL NET
    tf.keras.backend.set_floatx(cfg.get('keras_precision', 'float32'))
    layers = []
    for i, layer in enumerate(cfg.layers, start=1):
        if i < len(cfg.layers):
            if 'dropout_rate' in layer['hidden']:
                layers.append(
                    tf.keras.layers.Dropout(rate=layer['hidden']['dropout_rate']))
            if 'batch_normalize' in layer['hidden']:
                layers.append(
                    tf.keras.layers.BatchNormalization(
                        **layer['hidden']['batch_normalize']))
            layers.append(
                tf.keras.layers.Dense(
                    units=layer['hidden']['units'],
                    activation=layer['hidden']['activation'],
                    kernel_initializer=tf.keras.initializers.VarianceScaling(
                        scale=layer['hidden'].get('init_scale', 1.0),
                        mode=cfg.get('net_initializer_mode', 'fan_in'),
                        distribution=cfg.get('net_initializer_distribution',
                                             'truncated_normal'),
                        seed=i)))
        else:
            layers.append(
                tf.keras.layers.Dense(
                    units=len(policy_states),
                    activation=layer['output']['activation'],
                    kernel_initializer=tf.keras.initializers.VarianceScaling(
                        scale=layer['output'].get('init_scale', 1.0),
                        mode=cfg.get('net_initializer_mode', 'fan_in'),
                        distribution=cfg.get('net_initializer_distribution',
                                             'truncated_normal'),
                        seed=i)))

    policy_net = tf.keras.models.Sequential(layers)
    policy_net.build(input_shape=(None, len(states)))

    # Horovod: scale the learning rate by the number of workers.
    learning_rate_multiplier = 1 if not horovod else hvd.size()
    optim = getattr(tf.keras.optimizers, cfg.optimizer)(
        learning_rate=cfg.learning_rate * learning_rate_multiplier,
        clipvalue=cfg.clipvalue)

    # apply post-processing per variable
    def policy(s):
        raw_policy = policy_net(s)
        for i, pol in enumerate(config_policies):
            if 'activation' in pol.keys():
                activation_str = pol['activation']
                if pol['activation'] == 'implied':
                    if 'lower' in pol['bounds'].keys() and 'upper' in pol['bounds'].keys():
                        activation_str = (
                            'lambda x: {l} + ({u} - {l}) * tf.math.sigmoid(x)'.format(
                                l=str(pol['bounds']['lower']),
                                u=str(pol['bounds']['upper'])))
                raw_policy = tf.tensor_scatter_nd_update(
                    raw_policy, [[j, i] for j in range(s.shape[0])],
                    eval(activation_str)(raw_policy[:, i]))
        if cfg.keras_precision == 'float64':
            return tf.cast(raw_policy, tf.dtypes.float32)
        return raw_policy

    setattr(sys.modules[__name__], "policy", policy)
    setattr(sys.modules[__name__], "policy_net", policy_net)

    # OPTIMIZER
    setattr(sys.modules[__name__], "optimizer", optim)

    # CONSTANTS
    for (key, value) in config_constants.items():
        setattr(sys.modules[__name__], key, value)

    # STATE INITIALIZATION
    def initialize_states(N_batch=N_sim_batch):
        # starting state
        init_val = tf.ones([N_batch, len(states)])
        # apply special inits, if any
        for i, s in enumerate(config_states):
            if 'init' in s:
                init_val = tf.tensor_scatter_nd_update(
                    init_val, [[j, i] for j in range(init_val.shape[0])],
                    getattr(rng, s["init"]["distribution"])(
                        shape=(N_batch,), **s["init"]["kwargs"]))
        return init_val

    starting_state = tf.Variable(initialize_states())
    setattr(sys.modules[__name__], "starting_state", starting_state)
    setattr(sys.modules[__name__], "initialize_states", initialize_states)
    setattr(sys.modules[__name__], "initialize_each_episode",
            cfg.get("initialize_each_episode", False))
    setattr(sys.modules[__name__], "N_simulated_batch_size",
            cfg.get("N_simulated_batch_size", None))
    setattr(sys.modules[__name__], "N_simulated_episode_length",
            cfg.get("N_simulated_episode_length", None))

    # LOGGING
    setattr(sys.modules[__name__], "LOG_DIR", os.getcwd())
    if cfg.STARTING_POINT == 'NEW' and not horovod_worker:
        for file in os.scandir(os.getcwd()):
            if ".hydra" not in file.path:
                os.unlink(file.path)
    setattr(sys.modules[__name__], "writer",
            tf.summary.create_file_writer(os.getcwd()))

    setattr(sys.modules[__name__], "current_episode", tf.Variable(1))
    ckpt = tf.train.Checkpoint(step=tf.Variable(1),
                               current_episode=current_episode,
                               optimizer=optimizer,
                               policy=policy_net,
                               rng_state=rng_state,
                               starting_state=starting_state)
    manager = tf.train.CheckpointManager(
        ckpt,
        os.getcwd(),
        max_to_keep=cfg.MAX_TO_KEEP_NUMBER,
        step_counter=current_episode,
        checkpoint_interval=cfg.CHECKPOINT_INTERVAL)

    if cfg.STARTING_POINT == 'LATEST' and manager.latest_checkpoint:
        print("Restored from {}".format(manager.latest_checkpoint))
        ckpt.restore(manager.latest_checkpoint)
    if cfg.STARTING_POINT != 'LATEST' and cfg.STARTING_POINT != 'NEW':
        print("Restored from {}".format(cfg.STARTING_POINT))
        ckpt.restore(cfg.STARTING_POINT)

    setattr(sys.modules[__name__], "optimizer_starting_iteration",
            optimizer.iterations.numpy())
    setattr(sys.modules[__name__], "ckpt", ckpt)
    setattr(sys.modules[__name__], "manager", manager)

    tf.print("Optimizer configuration:")
    tf.print(optimizer.get_config())
    tf.print("Starting state:")
    tf.print(starting_state)
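# A hypothetical driver for `set_conf` (assumption: the `cfg.pretty()` call
# and the `os.getcwd()`-based logging above suggest an OmegaConf/Hydra config;
# the config path and name below are illustrative, not from the source).
import hydra

@hydra.main(config_path="conf", config_name="config")
def run(cfg):
    set_conf(cfg)

if __name__ == "__main__":
    run()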
def main(_):
    # Horovod: initialize Horovod if using multiple GPUs.
    if FLAGS.use_multi_gpu:
        hvd.init()
        # Horovod: rank 0 keeps the top-level output_dir; each other worker
        # writes to its own sub-directory.
        FLAGS.output_dir = FLAGS.output_dir if hvd.rank() == 0 \
            else os.path.join(FLAGS.output_dir, str(hvd.rank()))

    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    if FLAGS.use_multi_gpu:
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            master=FLAGS.master,
            model_dir=FLAGS.output_dir,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=FLAGS.iterations_per_loop,
                num_shards=FLAGS.num_tpu_cores,
                per_host_input_for_training=is_per_host),
            log_step_count_steps=25,
            session_config=config)
    else:
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            master=FLAGS.master,
            model_dir=FLAGS.output_dir,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=FLAGS.iterations_per_loop,
                num_shards=FLAGS.num_tpu_cores,
                per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
        # Horovod: adjust number of steps based on number of GPUs.
        if FLAGS.use_multi_gpu:
            num_warmup_steps = num_warmup_steps // hvd.size()
            num_train_steps = num_train_steps // hvd.size()

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu,
                                use_multi_gpu=FLAGS.use_multi_gpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        # Horovod: in multi-GPU training, hvd.BroadcastGlobalVariablesHook(0)
        # broadcasts the initial variable states from rank 0 to all other
        # processes. This is necessary to ensure consistent initialization of
        # all workers when training is started with random weights or restored
        # from a checkpoint.
        if FLAGS.use_multi_gpu:
            hooks = [hvd.BroadcastGlobalVariablesHook(0)]
        else:
            hooks = []
        estimator.train(input_fn=train_input_fn,
                        max_steps=num_train_steps,
                        hooks=hooks)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, so the number
            # of examples must be a multiple of the batch size, or else
            # examples will get dropped. We therefore pad with fake examples
            # which are ignored later on. These do NOT count towards the
            # metric (all tf.metrics support a per-instance weight, and these
            # get a weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                eval_file)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, we need to specify the number
        # of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, so the number
            # of examples must be a multiple of the batch size, or else
            # examples will get dropped. We therefore pad with fake examples
            # which are ignored later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                predict_file)
        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    # Fix: list.extend() returns None, so don't assign its result to argv.
    if argv is not None:
        sys.argv.extend(argv)
    argv = sys.argv
    desc = main.__doc__  # .format(os.path.basename(__file__))

    # CLI parser
    args = parser_(desc)

    nranks_per_gpu = args.nranks_per_gpu
    local_rank = hvd.local_rank()
    gpu_local_rank = local_rank // nranks_per_gpu
    print('local_rank, GPU_LOCAL_RANK: {}, {}'.format(local_rank,
                                                      gpu_local_rank))

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.visible_device_list = str(gpu_local_rank)
    K.set_session(tf.Session(config=config))

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    hvdsize = hvd.size()

    batch_size = 128  # 100
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)
    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()

    # Data split if going for reduction in each iteration step. Using
    # tf-queue or dataset is better to preserve uniform random sampling.
    # nsamples = x_train.shape[0]
    # mysamples = nsamples // hvdsize
    # start_sam = hvd.local_rank() * mysamples
    # stop_sam = min((hvd.local_rank() + 1) * mysamples, nsamples)
    # x_train = x_train[start_sam:stop_sam, ...]

    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0],) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0],) + original_img_size)

    if hvd.rank() == 0:
        print('x_train.shape:', x_train.shape)

    vae, encoder, generator = make_vae_and_codec(original_img_size, img_chns,
                                                 img_rows, img_cols,
                                                 batch_size, filters, num_conv,
                                                 intermediate_dim, latent_dim,
                                                 epsilon_std)
    # : :type vae: Model
    lr = 0.001  # * hvdsize
    opt = tf.train.RMSPropOptimizer(lr)
    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)  # , use_locking=True)
    opt = TFOptimizer(opt)

    vae.compile(optimizer=opt, loss=None)
    if hvd.rank() == 0:
        vae.summary()

    callbacks = [hvd_keras.callbacks.BroadcastGlobalVariablesCallback(0)]
    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)]

    sess = K.get_session()
    sess.run(hvd.broadcast_global_variables(0))

    vae.fit(x_train,
            shuffle=True,
            epochs=epochs,
            batch_size=batch_size,
            verbose=hvd.local_rank() == 0,
            validation_data=(x_test, None),
            callbacks=callbacks)

    if hvd.rank() == 0:
        vae_val = vae
        loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size)
        print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

        # display a 2D plot of the digit classes in the latent space
        x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
        plt.figure(figsize=(6, 6))
        plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
        plt.colorbar()
        # plt.show()
        plt.savefig('vae_scatter.ps')
        plt.close()

        # display a 2D manifold of the digits
        n = 15  # figure with 15x15 digits
        digit_size = 28
        figure = np.zeros((digit_size * n, digit_size * n))
        # Linearly spaced coordinates on the unit square were transformed
        # through the inverse CDF (ppf) of the Gaussian to produce values of
        # the latent variables z, since the prior of the latent space is
        # Gaussian.
        grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
        grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

        for i, yi in enumerate(grid_x):
            for j, xi in enumerate(grid_y):
                z_sample = np.array([[xi, yi]])
                z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
                x_decoded = generator.predict(z_sample, batch_size=batch_size)
                digit = x_decoded[0].reshape(digit_size, digit_size)
                figure[i * digit_size:(i + 1) * digit_size,
                       j * digit_size:(j + 1) * digit_size] = digit

        plt.figure(figsize=(10, 10))
        plt.imshow(figure, cmap='Greys_r')
        # plt.show()
        plt.savefig('vae_digit.ps')
        plt.close()

    K.clear_session()
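# Hypothetical entry point and launch command for the VAE example above (the
# script name and flag are illustrative, not from the source). Horovod starts
# one process per rank, e.g.:
#
#   horovodrun -np 4 python vae_mnist_hvd.py --epochs 5
#
if __name__ == '__main__':
    main()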
init = tf.global_variables_initializer()

# Horovod: broadcast initial variable states from rank 0 to all other
# processes. This is necessary to ensure consistent initialization of all
# workers when training is started with random weights or restored from a
# checkpoint.
bcast = hvd.broadcast_global_variables(0)

# Step 5: Begin training.
# Horovod: adjust number of steps based on number of GPUs.
num_steps = 100000 // hvd.size() + 1

# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())

with tf.Session(graph=graph, config=config) as session:
    # We must initialize all variables before we use them.
    init.run()
    bcast.run()
    print('Initialized')

    average_loss = 0
    for step in range(num_steps):  # `xrange` in the Python 2 original
        # simulate various sentence lengths by randomizing the batch size
        batch_size = random.randint(max_batch_size // 2, max_batch_size)
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips,
                                                    skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
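        # The snippet above ends mid-loop; a sketch of the missing training
        # step, assuming `optimizer` and `loss` ops were defined with the
        # graph (the names are assumptions, following the usual word2vec
        # example this fragment appears to come from):
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        if step % 2000 == 0 and step > 0:
            # average loss over the last 2000 batches
            print('Average loss at step', step, ':', average_loss / 2000)
            average_loss = 0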
parser.add_argument('--init_restore_dir', type=str,
                    default='check_points/pretrain_models/google_bert_base/bert_model.ckpt')
parser.add_argument('--checkpoint_dir', type=str,
                    default='check_points/DRCD/google_bert_base/')
parser.add_argument('--setting_file', type=str, default='setting.txt')
parser.add_argument('--log_file', type=str, default='log.txt')

# use some global vars for convenience
args = parser.parse_args()

os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids
n_gpu = len(args.gpu_ids.split(','))
if n_gpu > 1:
    assert hvd
    hvd.init()
    mpi_size = hvd.size()
    mpi_rank = hvd.local_rank()
    assert mpi_size == n_gpu
    training_hooks = [hvd.BroadcastGlobalVariablesHook(0)]
    print_rank0('GPU NUM', n_gpu)
else:
    hvd = None
    mpi_size = 1
    mpi_rank = 0
    training_hooks = None
    print('GPU NUM', n_gpu)

args.checkpoint_dir += ('/epoch{}_batch{}_lr{}_warmup{}_anslen{}_tf/'
                        .format(args.train_epochs, args.n_batch, args.lr,
                                args.warmup_rate, args.max_ans_length))
args = utils.check_args(args, mpi_rank)
print_rank0('######## generating data ########')
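# `print_rank0` is used above but not defined in this fragment; a minimal
# sketch of what it presumably does (an assumption, kept here only for
# completeness):
def print_rank0(*args):
    if mpi_rank == 0:
        print(*args, flush=True)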
def main(_):
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

    if FLAGS.horovod:
        hvd.init()
    if FLAGS.use_fp16:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"

    processors = {'consensus': ConsensusProcessor}

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    tf.io.gfile.makedirs(FLAGS.output_dir)

    processor = processors[task_name]()
    label_list, label_map = processor.get_labels()
    inv_label_map = {v: k for k, v in label_map.items()}

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    master_process = True
    training_hooks = []
    global_batch_size = FLAGS.train_batch_size
    hvd_rank = 0

    config = tf.compat.v1.ConfigProto()
    if FLAGS.horovod:
        global_batch_size = FLAGS.train_batch_size * hvd.size()
        master_process = (hvd.rank() == 0)
        hvd_rank = hvd.rank()
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        if hvd.size() > 1:
            training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1

    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir if master_process else None,
        session_config=config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None,
        keep_checkpoint_max=1)

    if master_process:
        tf.compat.v1.logging.info("***** Configuration *****")
        for key in FLAGS.__flags.keys():
            tf.compat.v1.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
        tf.compat.v1.logging.info("**************************")

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank))

    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        start_index = 0
        end_index = len(train_examples)
        tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]

        # Horovod: shard the training examples across workers.
        if FLAGS.horovod:
            tmp_filenames = [
                os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i))
                for i in range(hvd.size())
            ]
            num_examples_per_rank = len(train_examples) // hvd.size()
            remainder = len(train_examples) % hvd.size()
            if hvd.rank() < remainder:
                start_index = hvd.rank() * (num_examples_per_rank + 1)
                end_index = start_index + num_examples_per_rank + 1
            else:
                start_index = hvd.rank() * num_examples_per_rank + remainder
                end_index = start_index + num_examples_per_rank

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_one_hot_embeddings=False,
                                hvd=None if not FLAGS.horovod else hvd,
                                use_fp16=FLAGS.use_fp16)

    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    if FLAGS.do_train:
        filed_based_convert_examples_to_features(
            train_examples[start_index:end_index], label_list, label_map,
            FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank],
            FLAGS.replace_span_A, FLAGS.replace_span_B)
        tf.compat.v1.logging.info("***** Running training *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(train_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.compat.v1.logging.info("  Num steps = %d", num_train_steps)
        tf.compat.v1.logging.info("  Num of labels = %d", len(label_list))

        train_input_fn = file_based_input_fn_builder(
            input_file=tmp_filenames,
            batch_size=FLAGS.train_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            hvd=None if not FLAGS.horovod else hvd)

        train_start_time = time.time()
        estimator.train(input_fn=train_input_fn,
                        max_steps=num_train_steps,
                        hooks=training_hooks)
        train_time_elapsed = time.time() - train_start_time
        train_time_wo_overhead = training_hooks[-1].total_time
        avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed
        ss_sentences_per_second = (
            num_train_steps - training_hooks[-1].skipped
        ) * global_batch_size * 1.0 / train_time_wo_overhead

        if master_process:
            tf.compat.v1.logging.info("-----------------------------")
            tf.compat.v1.logging.info(
                "Total Training Time = %0.2f for Sentences = %d",
                train_time_elapsed, num_train_steps * global_batch_size)
            tf.compat.v1.logging.info(
                "Total Training Time W/O Overhead = %0.2f for Sentences = %d",
                train_time_wo_overhead,
                (num_train_steps - training_hooks[-1].skipped) * global_batch_size)
            tf.compat.v1.logging.info(
                "Throughput Average (sentences/sec) with overhead = %0.2f",
                avg_sentences_per_second)
            tf.compat.v1.logging.info(
                "Throughput Average (sentences/sec) = %0.2f",
                ss_sentences_per_second)
            tf.compat.v1.logging.info("-----------------------------")

    if FLAGS.do_eval and master_process:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        filed_based_convert_examples_to_features(
            eval_examples, label_list, label_map, FLAGS.max_seq_length,
            tokenizer, eval_file, FLAGS.replace_span_A, FLAGS.replace_span_B)

        tf.compat.v1.logging.info("***** Running evaluation *****")
        tf.compat.v1.logging.info("  Num examples = %d (%d actual, %d padding)",
                                  len(eval_examples), num_actual_eval_examples,
                                  len(eval_examples) - num_actual_eval_examples)
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        eval_drop_remainder = False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            batch_size=FLAGS.eval_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            tf.compat.v1.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict and master_process:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(
            predict_examples, label_list, label_map, FLAGS.max_seq_length,
            tokenizer, predict_file, FLAGS.replace_span_A, FLAGS.replace_span_B)

        tf.compat.v1.logging.info("***** Running prediction *****")
        tf.compat.v1.logging.info("  Num examples = %d (%d actual, %d padding)",
                                  len(predict_examples),
                                  num_actual_predict_examples,
                                  len(predict_examples) - num_actual_predict_examples)
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            batch_size=FLAGS.predict_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        eval_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
        eval_start_time = time.time()

        output_class_file = os.path.join(FLAGS.output_dir, "test_output_labels.txt")
        output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
        with tf.io.gfile.GFile(output_predict_file, "w") as writer, \
                tf.io.gfile.GFile(output_class_file, "w") as writer2:
            num_written_lines = 0
            tf.compat.v1.logging.info("***** Predict results *****")
            for prediction in estimator.predict(input_fn=predict_input_fn,
                                                hooks=eval_hooks,
                                                yield_single_examples=True):
                probabilities = prediction["probabilities"]
                logits = prediction["logits"]
                pr_res = np.argmax(logits, axis=-1)
                output = str(inv_label_map[pr_res]) + "\n"
                writer2.write(output)
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples

        eval_time_elapsed = time.time() - eval_start_time
        eval_time_wo_overhead = eval_hooks[-1].total_time

        time_list = eval_hooks[-1].time_list
        time_list.sort()
        num_sentences = (eval_hooks[-1].count -
                         eval_hooks[-1].skipped) * FLAGS.predict_batch_size
        avg = np.mean(time_list)
        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info(
            "Total Inference Time = %0.2f for Sentences = %d",
            eval_time_elapsed, eval_hooks[-1].count * FLAGS.predict_batch_size)
        tf.compat.v1.logging.info(
            "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
            eval_time_wo_overhead,
            (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size)
        tf.compat.v1.logging.info("Summary Inference Statistics")
        tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size)
        tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.compat.v1.logging.info("Precision = %s",
                                  "fp16" if FLAGS.use_fp16 else "fp32")
        tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f",
                                  cf_50 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f",
                                  cf_90 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f",
                                  cf_95 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f",
                                  cf_99 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f",
                                  cf_100 * 1000)
        tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f",
                                  ss_sentences_per_second)
        tf.compat.v1.logging.info("-----------------------------")
def main(_):
    hvd.init()
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
    graph = tf.Graph()
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    with graph.as_default():
        import json
        # config = json.load(open("/data/xuht/bert/chinese_L-12_H-768_A-12/bert_config.json", "r"))
        config = json.load(open(FLAGS.config_file, "r"))
        init_checkpoint = FLAGS.init_checkpoint
        print("===init checkpoint==={}".format(init_checkpoint))
        # init_checkpoint = "/data/xuht/bert/chinese_L-12_H-768_A-12/bert_model.ckpt"
        # init_checkpoint = "/data/xuht/concat/model_1/oqmrc.ckpt"
        config = Bunch(config)
        config.use_one_hot_embeddings = True
        config.scope = "esim/bert"
        config.dropout_prob = 0.1
        config.label_type = "single_label"
        config.lstm_dim = 128
        config.num_heads = 12
        config.num_units = 768
        label_dict = json.load(open(FLAGS.label_id))
        # label_tensor = np.asarray(label_dict["class_ratio"]).astype(np.float32)
        label_tensor = None
        # config.loss = "focal_loss"
        json.dump(config, open(FLAGS.model_output + "/config.json", "w"))
        # os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu_id
        sess = tf.Session(config=sess_config)
        train_size = int(FLAGS.train_size / hvd.size())
        num_train_steps = int(train_size / FLAGS.batch_size * FLAGS.epoch)
        num_warmup_steps = int(num_train_steps * 0.01)
        num_storage_steps = int(train_size / FLAGS.batch_size)
        print(num_train_steps, num_warmup_steps, "=============")
        opt_config = Bunch({
            "init_lr": (5e-5 / hvd.size()),
            "num_train_steps": num_train_steps,
            "num_warmup_steps": num_warmup_steps,
            "train_op": "adam"
        })
        model_io_config = Bunch({"fix_lm": False})
        model_io_fn = model_io.ModelIO(model_io_config)
        num_choice = FLAGS.num_classes
        max_seq_length = FLAGS.max_length * 2 + 3
        model_function = bert_esim.classifier_attn_model_fn_builder
        model_eval_fn = model_function(config, num_choice, init_checkpoint,
                                       model_reuse=None,
                                       load_pretrained=True,
                                       model_io_fn=model_io_fn,
                                       model_io_config=model_io_config,
                                       opt_config=opt_config,
                                       input_name=["a", "b"],
                                       label_tensor=label_tensor,
                                       not_storage_params=["adam", "adam_1"],
                                       exclude_scope_dict={"task": "esim"})

        def metric_fn(features, logits, loss):
            print(logits.get_shape(), "===logits shape===")
            pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
            prob = tf.nn.softmax(logits)
            correct = tf.equal(
                tf.cast(pred_label, tf.int32),
                tf.cast(features["label_ids"], tf.int32))
            accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
            return {
                "accuracy": accuracy,
                "loss": loss,
                "pred_label": pred_label,
                "label_ids": features["label_ids"]
            }

        name_to_features = {
            "input_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "label_ids": tf.FixedLenFeature([], tf.int64),
        }

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example."""
            example = tf.parse_single_example(record, name_to_features)
            # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
            # So cast all int64 to int32.
for name in list(example.keys()): t = example[name] if t.dtype == tf.int64: t = tf.to_int32(t) example[name] = t return example params = Bunch({}) params.epoch = FLAGS.epoch params.batch_size = FLAGS.batch_size # train_features = tf_data_utils.train_input_fn("/data/xuht/wsdm19/data/train.tfrecords", # _decode_record, name_to_features, params) # eval_features = tf_data_utils.eval_input_fn("/data/xuht/wsdm19/data/dev.tfrecords", # _decode_record, name_to_features, params) # train_features = tf_data_utils.train_input_fn(FLAGS.train_file, # _decode_record, name_to_features, params) eval_features = tf_data_utils.eval_input_fn(FLAGS.dev_file, _decode_record, name_to_features, params) # [train_op, train_loss, train_per_example_loss, train_logits] = model_train_fn(train_features, [], tf.estimator.ModeKeys.TRAIN) [_, eval_loss, eval_per_example_loss, eval_logits] = model_eval_fn(eval_features, [], tf.estimator.ModeKeys.EVAL) result = metric_fn(eval_features, eval_logits, eval_loss) model_io_fn.set_saver() init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) sess.run(init_op) model_io_fn.load_model(sess, init_checkpoint) print(" ==succeeded in loading model== ") sess.run(hvd.broadcast_global_variables(0)) def eval_fn(result): i = 0 total_accuracy = 0 label, label_id = [], [] # label_weight = [] while True: try: eval_result = sess.run(result) total_accuracy += eval_result["accuracy"] label_id.extend(eval_result["label_ids"]) label.extend(eval_result["pred_label"]) # for item in eval_result["label_ids"]: # label_weight.append(label_tensor[item]) i += 1 except tf.errors.OutOfRangeError: print("End of dataset") break # f1 = f1_score(label_id, label, average="macro", sample_weight=label_weight) # accuracy = accuracy_score(label_id, label, sample_weight=label_weight) f1 = f1_score(label_id, label, average="macro") accuracy = accuracy_score(label_id, label) print("test accuracy accuracy {} {} f1 {}".format( total_accuracy / i, accuracy, f1)) return total_accuracy / i, f1 # print("===========begin to train============") # train_fn(train_op, train_loss) print("===========begin to eval============") accuracy, f1 = eval_fn(result) print("==accuracy {} f1 {}==".format(accuracy, f1))
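# Note the initialization order used above, which recurs throughout this
# collection: build the graph, run the variable initializers, restore any
# checkpoint, and only then broadcast rank 0's variables so every worker
# starts from identical weights:
#
#   sess.run(init_op)                              # initialize
#   model_io_fn.load_model(sess, init_checkpoint)  # optional restore
#   sess.run(hvd.broadcast_global_variables(0))    # synchronize workers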
def parallax_run_mpi(single_gpu_meta_graph_def, config): hostname = os.getenv(PARALLAX_HOSTNAME, 0) create_profile_directory(config.profile_config.profile_dir, config.resource_info, hostname) mpi_meta_graph_def, tensor_or_op_name_to_replica_names = \ graph_transform_mpi(single_gpu_meta_graph_def, config) worker_id = hvd.rank() num_workers = hvd.size() if config.profile_config.profile_dir: append_task_info(config.profile_config.profile_dir, hostname, ['worker:%d'%worker_id]) with tf.Graph().as_default() as graph_to_run: parallax_log.debug("Importing MPI graph on worker %d" % worker_id) tf.train.import_meta_graph(mpi_meta_graph_def) if config.export_graph_path: export_meta_graph(config.export_graph_path, worker_id) if config.profile_config.profile_dir: path = os.path.join(config.profile_config.profile_dir, hostname, 'worker:%d'%worker_id) export_meta_graph(path, worker_id) if config.profile_config.profile_worker != None and worker_id != config.profile_config.profile_worker: #Only one CUPTI profiler can run in a machine #See tensorflow/tensorflow/core/platform/default/device_tracer.cc:L452 config.profile_config.profile_dir = None else: config.profile_config.profile_dir = \ os.path.join(config.profile_config.profile_dir, hostname, 'worker:%d'%worker_id, 'run_meta') ckpt_hooks = build_ckpt_hooks(config.get_ckpt_config()) if worker_id == 0 else None sess_config = config.sess_config if sess_config is None: sess_config = tf.ConfigProto(allow_soft_placement=True) sess_config.gpu_options.visible_device_list = str(hvd.local_rank()) sess = tf.train.MonitoredTrainingSession( is_chief=True, checkpoint_dir=config.get_ckpt_config().ckpt_dir if worker_id == 0 else None, # TODO: Allow user-defined hooks hooks=None, chief_only_hooks=ckpt_hooks, save_checkpoint_secs=None, save_summaries_steps=None, save_summaries_secs=None, config=sess_config) parallax_log.debug( "Created MonitoredTrainingSession for worker %d" % worker_id) _init_global_vars(sess) parallax_log.debug( "Finished initialization process, start training on \ worker %d" % worker_id) step = sess.run(tf.get_collection(tf.GraphKeys.GLOBAL_STEP)[0]) sess_context = \ ParallaxSessionContext(step, tf.get_collection(tf.GraphKeys.GLOBAL_STEP)[0], config.profile_config.profile_dir, config.profile_config.profile_steps, config.profile_config.profile_range, tensor_or_op_name_to_replica_names, 1, config.resource_info['master'][0]) sess_context.set_parallax_session_context() return sess, num_workers, worker_id, 1
def test_horovod_allgather_grad_gpu(self): """Test the correctness of the allgather gradient on GPU.""" # Only do this test if there are GPUs available. if not tf.test.is_gpu_available(cuda_only=True): return if os.environ.get('HOROVOD_MIXED_INSTALL'): # Skip if compiled with CUDA but without HOROVOD_GPU_ALLGATHER. return hvd.init() rank = hvd.rank() local_rank = hvd.local_rank() size = hvd.size() # As of TensorFlow v1.9, gradients are not supported on # integer tensors dtypes = [tf.float32, tf.float64] dims = [1, 2, 3] for dtype, dim in itertools.product(dtypes, dims): tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5 tensor_sizes = tensor_sizes[:size] if _executing_eagerly(): with tf.GradientTape() as tape: tensor = self.tfe.Variable( tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank) if dtype == tf.bool: tensor = tensor % 2 tensor = tf.cast(tensor, dtype=dtype) gathered = hvd.allgather(tensor) grad_list = [] for r, tensor_size in enumerate(tensor_sizes): g = tf.ones([tensor_size] + [17] * (dim - 1)) * r grad_list.append(g) grad_ys = tf.concat(grad_list, axis=0) with tf.device("/gpu:%d" % local_rank): grad_out = tape.gradient(gathered, tensor, grad_ys) else: tensor = tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank if dtype == tf.bool: tensor = tensor % 2 tensor = tf.cast(tensor, dtype=dtype) gathered = hvd.allgather(tensor) grad_list = [] for r, tensor_size in enumerate(tensor_sizes): g = tf.ones([tensor_size] + [17] * (dim - 1)) * r grad_list.append(g) grad_ys = tf.concat(grad_list, axis=0) with tf.device("/gpu:%d" % local_rank): grad = tf.gradients(gathered, tensor, grad_ys)[0] grad_out = self.evaluate(grad) expected = np.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank * size err = np.linalg.norm(expected - grad_out) self.assertLess( err, 0.00000001, "gradient %s differs from expected %s, " "error: %s" % (grad_out, expected, str(err)))
def FastTextEstimator(model_dir, config=None): params = { "learning_rate": FLAGS.learning_rate, } def model_fn(features, labels, mode, params): features["text"] = tf.sparse_tensor_to_dense(features["text"], default_value=" ") # if FLAGS.use_ngrams: # if FLAGS.ngrams is not None: # ngrams_list = text_utils.ParseNgramsOpts(FLAGS.ngrams) # features["ngrams"] = tf.py_func(text_utils.GenerateNgrams, # [features["text"], ngrams_list], tf.string) text_lookup_table = tf.contrib.lookup.index_table_from_file( FLAGS.vocab_file, FLAGS.num_oov_vocab_buckets, FLAGS.vocab_size) text_ids = text_lookup_table.lookup(features["text"]) text_embedding_w = tf.Variable( tf.random_uniform([ FLAGS.vocab_size + FLAGS.num_oov_vocab_buckets, FLAGS.embedding_dimension ], -0.1, 0.1)) text_embedding = tf.reduce_mean(tf.nn.embedding_lookup( text_embedding_w, text_ids), axis=-2) input_layer = text_embedding # if FLAGS.use_ngrams: # ngram_hash = tf.string_to_hash_bucket(features["ngrams"], # FLAGS.num_ngram_buckets) # ngram_embedding_w = tf.Variable(tf.random_uniform( # [FLAGS.num_ngram_buckets, FLAGS.ngram_embedding_dimension], -0.1, 0.1)) # ngram_embedding = tf.reduce_mean(tf.nn.embedding_lookup( # ngram_embedding_w, ngram_hash), axis=-2) # ngram_embedding = tf.expand_dims(ngram_embedding, -2) # input_layer = tf.concat([text_embedding, ngram_embedding], -1) num_classes = FLAGS.num_labels logits = tf.contrib.layers.fully_connected(inputs=input_layer, num_outputs=num_classes, activation_fn=None) predictions = tf.argmax(logits, axis=-1) probs = tf.nn.softmax(logits) loss, train_op = None, None metrics = {} if mode != tf.estimator.ModeKeys.PREDICT: label_lookup_table = tf.contrib.lookup.index_table_from_file( FLAGS.label_file, vocab_size=FLAGS.num_labels) labels = label_lookup_table.lookup(labels) loss = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)) opt = tf.train.AdamOptimizer(params["learning_rate"]) if FLAGS.horovod: opt = hvd.DistributedOptimizer(opt) train_op = opt.minimize(loss, global_step=tf.train.get_global_step()) metrics = {"accuracy": tf.metrics.accuracy(labels, predictions)} exports = {} if FLAGS.export_dir: exports = Exports(probs, text_embedding) return tf.estimator.EstimatorSpec(mode, predictions=predictions, loss=loss, train_op=train_op, eval_metric_ops=metrics, export_outputs=exports) session_config = tf.ConfigProto( log_device_placement=FLAGS.log_device_placement) if FLAGS.horovod: session_config.gpu_options.visible_device_list = str(hvd.local_rank()) config = tf.contrib.learn.RunConfig( save_checkpoints_secs=None, save_checkpoints_steps=FLAGS.checkpoint_steps, session_config=session_config) return tf.estimator.Estimator(model_fn=model_fn, model_dir=model_dir, params=params, config=config)
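# The estimator above only wraps its optimizer when FLAGS.horovod is set; in
# isolation the core pattern is two lines. Scaling the learning rate by
# hvd.size(), as several other snippets in this collection do, is a common
# companion but is not done by the FastText code above; the 0.001 here is
# illustrative:
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
opt = tf.train.AdamOptimizer(0.001 * hvd.size())
opt = hvd.DistributedOptimizer(opt)  # allreduces gradients across workers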
ds = get_val_dataflow(args.data, batch, fbresnet_augmentor(False)) model = Model(args.depth) eval_on_ILSVRC12(model, get_model_loader(args.load), ds) sys.exit() logger.info("Training on {}".format(socket.gethostname())) # Print some information for sanity check. os.system("nvidia-smi") assert args.load is None hvd.init() if args.logdir is None: args.logdir = os.path.join( 'train_log', 'Horovod-{}GPUs-{}Batch'.format(hvd.size(), args.batch)) if hvd.rank() == 0: logger.set_logger_dir(args.logdir, 'd') logger.info("Rank={}, Local Rank={}, Size={}".format( hvd.rank(), hvd.local_rank(), hvd.size())) model = Model(args.depth, loss_scale=1.0 / hvd.size()) config = get_config(model, fake=args.fake) """ Sec 3: standard communication primitives like allreduce [11] perform summing, not averaging """ trainer = HorovodTrainer(average=False) launch_train_with_config(config, trainer)
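# The snippet above quotes Sec 3 of the paper it implements: allreduce sums,
# it does not average. That is why the model is built with
# loss_scale=1.0 / hvd.size() and the trainer is created with average=False.
# A minimal illustration of the equivalence with the TF1-era Horovod API
# (the constant is illustrative):
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
loss = tf.constant(4.0)
averaged = hvd.allreduce(loss, average=True)               # mean across ranks
summed = hvd.allreduce(loss / hvd.size(), average=False)   # same value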
def set_model(self, model): self.model = model self.graph = tf.Graph() with self.graph.as_default(): #Horovod added: Normal workflow config1 = tf.ConfigProto(log_device_placement=False) config1.gpu_options.allow_growth = True config1.gpu_options.visible_device_list = str(hvd.local_rank()) self.sess = tf.Session(config=config1) #Horovod end with self.sess.as_default(): initializer = tf.contrib.layers.xavier_initializer(uniform = True) with tf.variable_scope("model", reuse=tf.AUTO_REUSE, initializer = initializer): self.trainModel = self.model(config = self) #Horovod added: Vary the learning rate, dist optimizer if self.optimizer != None: pass elif self.opt_method == "Adagrad" or self.opt_method == "adagrad": self.optimizer = tf.train.AdagradOptimizer(learning_rate = self.alpha * hvd.size(), initial_accumulator_value=1e-20) elif self.opt_method == "Adadelta" or self.opt_method == "adadelta": self.optimizer = tf.train.AdadeltaOptimizer(self.alpha * hvd.size()) elif self.opt_method == "Adam" or self.opt_method == "adam": self.optimizer = tf.train.AdamOptimizer(self.alpha * hvd.size()) else: self.optimizer = tf.train.GradientDescentOptimizer(self.alpha * hvd.size() * self.sync_after) ################################################################ # Fetch a list of our network's trainable parameters. self.trainable_vars = tf.trainable_variables() #print("Shape of trainable vars: {}".format(np.array(self.trainable_vars))) # Create variables to store accumulated gradients self.accumulators = [ tf.Variable( tf.zeros_like(tv.initialized_value()), trainable=False ) for tv in self.trainable_vars ] #print("Shape of accumulators: {}".format(np.array(self.accumulators))) # Create a variable for counting the number of accumulations self.accumulation_counter = tf.Variable(0.0, trainable=False) # Compute gradients; grad_pairs contains (gradient, variable) pairs self.grad_pairs = self.optimizer.compute_gradients(self.trainModel.loss, self.trainable_vars) # print("Shape of grad_pairs: {}".format(np.array(self.grad_pairs))) # for g, v in self.grad_pairs: # print("Shape of grad: {}".format(np.array(g))) # Create operations which add a variable's gradient to its accumulator. self.accumulate_ops = [ accumulator.assign_add( grad ) for (accumulator, (grad, var)) in zip(self.accumulators, self.grad_pairs) if grad is not None ] # The final accumulation operation is to increment the counter self.accumulate_ops.append(self.accumulation_counter.assign_add(1.0)) # Update trainable variables by applying the accumulated gradients # divided by the counter. Note: apply_gradients takes in a list of # (grad, var) pairs # self.apply_step = self.optimizer.apply_gradients( # [(accumulator / self.accumulation_counter, var) \ # for (accumulator, (grad, var)) in zip(self.accumulators, self.grad_pairs)] # ) # Accumulators must be zeroed once the accumulated gradient is applied. 
self.zero_ops = [ accumulator.assign( tf.zeros_like(tv) ) for (accumulator, tv) in zip(self.accumulators, self.trainable_vars) ] # Add one last op for zeroing the counter self.zero_ops.append(self.accumulation_counter.assign(0.0)) ################################################################/////////// # self.dist_optimizer = hvd.DistributedOptimizer(self.optimizer) # self.train_op = self.dist_optimizer.minimize(self.trainModel.loss) #Horovod end self.barrier = hvd.allreduce(tf.random_normal(shape=[1])) if(hvd.rank() == 0): self.saver = tf.train.Saver() # self.logSummary = tf.summary.scalar('Train_loss', self.trainModel.loss) # self.train_writer = tf.summary.FileWriter('./train', self.sess.graph) self.sess.run(tf.global_variables_initializer()) #Horovod added: Normal workflow self.sess.run(hvd.broadcast_global_variables(0))
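# A condensed, self-contained sketch (not part of the training code above) of
# the accumulate/apply/zero pattern that set_model builds: gradients are summed
# into non-trainable buffers over several micro-batches, applied once scaled by
# the step counter, then zeroed. Names here (x, loss, optimizer) are
# illustrative, and every variable is assumed to receive a gradient.
import tensorflow as tf

x = tf.Variable(1.0)
loss = tf.square(x)
optimizer = tf.train.GradientDescentOptimizer(0.1)

tvars = tf.trainable_variables()
accums = [tf.Variable(tf.zeros_like(v.initialized_value()), trainable=False)
          for v in tvars]
counter = tf.Variable(0.0, trainable=False)

grads_and_vars = optimizer.compute_gradients(loss, tvars)
accum_ops = [a.assign_add(g) for a, (g, _) in zip(accums, grads_and_vars)]
accum_ops.append(counter.assign_add(1.0))
apply_op = optimizer.apply_gradients(
    [(a / counter, v) for a, (_, v) in zip(accums, grads_and_vars)])
zero_ops = [a.assign(tf.zeros_like(a)) for a in accums]
zero_ops.append(counter.assign(0.0))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(4):       # accumulate over 4 micro-batches
        sess.run(accum_ops)
    sess.run(apply_op)       # one update with the averaged gradient
    sess.run(zero_ops)       # reset buffers before the next accumulation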
def main(device, input_path_train, input_path_validation, downsampling_fact, downsampling_mode, channels, data_format, label_id, weights, image_dir, checkpoint_dir, trn_sz, val_sz, loss_type, model, decoder, fs_type, optimizer, batch, batchnorm, num_epochs, dtype, disable_checkpoints, disable_imsave, tracing, trace_dir, output_sampling, scale_factor): #init horovod comm_rank = 0 comm_local_rank = 0 comm_size = 1 comm_local_size = 1 if horovod: hvd.init() comm_rank = hvd.rank() comm_local_rank = hvd.local_rank() comm_size = hvd.size() #not all horovod versions have that implemented try: comm_local_size = hvd.local_size() except: comm_local_size = 1 if comm_rank == 0: print("Using distributed computation with Horovod: {} total ranks". format(comm_size, comm_rank)) #downsampling? recompute image dims image_height = image_height_orig // downsampling_fact image_width = image_width_orig // downsampling_fact #parameters per_rank_output = False loss_print_interval = 10 #session config sess_config = tf.ConfigProto( inter_op_parallelism_threads=1, #1 intra_op_parallelism_threads=6, #6 log_device_placement=False, allow_soft_placement=True) sess_config.gpu_options.visible_device_list = str(comm_local_rank + 2) sess_config.gpu_options.force_gpu_compatible = True #get data training_graph = tf.Graph() if comm_rank == 0: print("Loading data...") trn_data = load_data(input_path_train, True, trn_sz, horovod) val_data = load_data(input_path_validation, False, val_sz, horovod) if comm_rank == 0: print("Shape of trn_data is {}".format(trn_data.shape[0])) print("Shape of val_data is {}".format(val_data.shape[0])) print("done.") #print some stats if comm_rank == 0: print("Num workers: {}".format(comm_size)) print("Local batch size: {}".format(batch)) if dtype == tf.float32: print("Precision: {}".format("FP32")) else: print("Precision: {}".format("FP16")) print("Decoder: {}".format(decoder)) print("Batch normalization: {}".format(batchnorm)) print("Channels: {}".format(channels)) print("Loss type: {}".format(loss_type)) print("Loss weights: {}".format(weights)) print("Loss scale factor: {}".format(scale_factor)) print("Output sampling target: {}".format(output_sampling)) #print optimizer parameters for k, v in optimizer.items(): print("Solver Parameters: {k}: {v}".format(k=k, v=v)) print("Num training samples: {}".format(trn_data.shape[0])) print("Num validation samples: {}".format(val_data.shape[0])) print("Disable checkpoints: {}".format(disable_checkpoints)) print("Disable image save: {}".format(disable_imsave)) #compute epochs and stuff: if fs_type == "local": num_samples = trn_data.shape[0] // comm_local_size else: num_samples = trn_data.shape[0] // comm_size num_steps_per_epoch = num_samples // batch num_steps = num_epochs * num_steps_per_epoch if per_rank_output: print("Rank {} does {} steps per epoch".format(comm_rank, num_steps_per_epoch)) with training_graph.as_default(): #create readers trn_reader = h5_input_reader(input_path_train, channels, weights, dtype, normalization_file="stats.h5", update_on_read=False, data_format=data_format, label_id=label_id, sample_target=output_sampling) val_reader = h5_input_reader(input_path_validation, channels, weights, dtype, normalization_file="stats.h5", update_on_read=False, data_format=data_format, label_id=label_id) #create datasets if fs_type == "local": trn_dataset = create_dataset(trn_reader, trn_data, batch, num_epochs, comm_local_size, comm_local_rank, dtype, shuffle=True) val_dataset = create_dataset(val_reader, val_data, batch, 1, comm_local_size, 
comm_local_rank, dtype, shuffle=False) else: trn_dataset = create_dataset(trn_reader, trn_data, batch, num_epochs, comm_size, comm_rank, dtype, shuffle=True) val_dataset = create_dataset(val_reader, val_data, batch, 1, comm_size, comm_rank, dtype, shuffle=False) #create iterators handle = tf.placeholder(tf.string, shape=[], name="iterator-placeholder") iterator = tf.data.Iterator.from_string_handle( handle, (dtype, tf.int32, dtype, tf.string), ((batch, len(channels), image_height_orig, image_width_orig) if data_format == "channels_first" else (batch, image_height_orig, image_width_orig, len(channels)), (batch, image_height_orig, image_width_orig), (batch, image_height_orig, image_width_orig), (batch))) next_elem = iterator.get_next() #if downsampling, do some preprocessing if downsampling_fact != 1: if downsampling_mode == "scale": #do downsampling rand_select = tf.cast(tf.one_hot(tf.random_uniform( (batch, image_height, image_width), minval=0, maxval=downsampling_fact * downsampling_fact, dtype=tf.int32), depth=downsampling_fact * downsampling_fact, axis=-1), dtype=tf.int32) next_elem = (tf.layers.average_pooling2d(next_elem[0], downsampling_fact, downsampling_fact, 'valid', data_format), \ tf.reduce_max(tf.multiply(tf.image.extract_image_patches(tf.expand_dims(next_elem[1], axis=-1), \ [1, downsampling_fact, downsampling_fact, 1], \ [1, downsampling_fact, downsampling_fact, 1], \ [1,1,1,1], 'VALID'), rand_select), axis=-1), \ tf.squeeze(tf.layers.average_pooling2d(tf.expand_dims(next_elem[2], axis=-1), downsampling_fact, downsampling_fact, 'valid', "channels_last"), axis=-1), \ next_elem[3]) elif downsampling_mode == "center-crop": #some parameters length = 1. / float(downsampling_fact) offset = length / 2. boxes = [[offset, offset, offset + length, offset + length] ] * batch box_ind = list(range(0, batch)) crop_size = [image_height, image_width] #be careful with data order if data_format == "channels_first": next_elem[0] = tf.transpose(next_elem[0], perm=[0, 2, 3, 1]) #crop next_elem = (tf.image.crop_and_resize(next_elem[0], boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="data_cropping"), \ ensure_type(tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[1],axis=-1), boxes, box_ind, crop_size, method='nearest', extrapolation_value=0, name="label_cropping"), axis=-1), tf.int32), \ tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[2],axis=-1), boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="weight_cropping"), axis=-1), \ next_elem[3]) #be careful with data order if data_format == "channels_first": next_elem[0] = tf.transpose(next_elem[0], perm=[0, 3, 1, 2]) else: raise ValueError( "Error, downsampling mode {} not supported. 
Supported are [center-crop, scale]".format(downsampling_mode))

        #create init handles
        #trn
        trn_iterator = trn_dataset.make_initializable_iterator()
        trn_handle_string = trn_iterator.string_handle()
        trn_init_op = iterator.make_initializer(trn_dataset)
        #val
        val_iterator = val_dataset.make_initializable_iterator()
        val_handle_string = val_iterator.string_handle()
        val_init_op = iterator.make_initializer(val_dataset)

        #compute the input filter number based on number of channels used
        num_channels = len(channels)

        #set up model
        model = deeplab_v3_plus_generator(num_classes=3,
                                          output_stride=8,
                                          base_architecture=model,
                                          decoder=decoder,
                                          batchnorm=batchnorm,
                                          pre_trained_model=None,
                                          batch_norm_decay=None,
                                          data_format=data_format)
        logit, prediction = model(next_elem[0], True, dtype)

        #set up loss
        loss = None
        #cast the logits to fp32
        logit = ensure_type(logit, tf.float32)
        if loss_type == "weighted":
            #cast weights to FP32
            w_cast = ensure_type(next_elem[2], tf.float32)
            loss = tf.losses.sparse_softmax_cross_entropy(
                labels=next_elem[1],
                logits=logit,
                weights=w_cast,
                reduction=tf.losses.Reduction.SUM)
            if scale_factor != 1.0:
                loss *= scale_factor
        elif loss_type == "weighted_mean":
            #cast weights to FP32
            w_cast = ensure_type(next_elem[2], tf.float32)
            loss = tf.losses.sparse_softmax_cross_entropy(
                labels=next_elem[1],
                logits=logit,
                weights=w_cast,
                reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
            if scale_factor != 1.0:
                loss *= scale_factor
        elif loss_type == "focal":
            #one-hot-encode
            labels_one_hot = tf.contrib.layers.one_hot_encoding(
                next_elem[1], 3)
            #cast to FP32
            labels_one_hot = ensure_type(labels_one_hot, tf.float32)
            loss = focal_loss(onehot_labels=labels_one_hot,
                              logits=logit,
                              alpha=1.,
                              gamma=2.)
        else:
            raise ValueError(
                "Error, loss type {} not supported.".format(loss_type))
        # tf.summary.scalar('loss', loss)

        #determine flops
        flops = graph_flops.graph_flops(
            format="NHWC" if data_format == "channels_last" else "NCHW",
            batch=batch,
            sess_config=sess_config)
        flops *= comm_size
        if comm_rank == 0:
            print('training flops: {:.3f} TF/step'.format(flops * 1e-12))

        #number of trainable parameters
        if comm_rank == 0:
            num_params = get_number_of_trainable_parameters()
            print('number of trainable parameters: {} ({} MB)'.format(
                num_params,
                num_params * (4 if dtype == tf.float32 else 2) * (2**-20)))

        if horovod:
            loss_avg = hvd.allreduce(ensure_type(loss, tf.float32))
        else:
            loss_avg = tf.identity(loss)

        #set up global step - keep on CPU
        with tf.device('/device:CPU:0'):
            global_step = tf.train.get_or_create_global_step()

        #set up optimizer
        if optimizer['opt_type'].startswith("LARC"):
            if comm_rank == 0:
                print("Enabling LARC")
            train_op, lr = get_larc_optimizer(optimizer, loss, global_step,
                                              num_steps_per_epoch, horovod)
        else:
            train_op, lr = get_optimizer(optimizer, loss, global_step,
                                         num_steps_per_epoch, horovod)

        #set up streaming metrics
        iou_op, iou_update_op = tf.metrics.mean_iou(
            labels=next_elem[1],
            predictions=tf.argmax(prediction, axis=3),
            num_classes=3,
            weights=None,
            metrics_collections=None,
            updates_collections=None,
            name="iou_score")
        iou_reset_op = tf.variables_initializer([
            i for i in tf.local_variables()
            if i.name.startswith('iou_score/')
        ])
        if horovod:
            iou_avg = hvd.allreduce(iou_op)
        else:
            iou_avg = tf.identity(iou_op)
        # tf.summary.scalar('IOU', iou_avg)

        with tf.device(device):
            mem_usage_ops = [
                tf.contrib.memory_stats.MaxBytesInUse(),
                tf.contrib.memory_stats.BytesLimit()
            ]

        #hooks
        #these hooks are essential.
regularize the step hook by adding one additional step at the end hooks = [tf.train.StopAtStepHook(last_step=num_steps + 1)] #bcast init for bcasting the model after start if horovod: init_bcast = hvd.broadcast_global_variables(0) #initializers: init_op = tf.global_variables_initializer() init_local_op = tf.local_variables_initializer() #checkpointing if comm_rank == 0: checkpoint_save_freq = 5 * num_steps_per_epoch checkpoint_saver = tf.train.Saver(max_to_keep=1000) if (not disable_checkpoints): hooks.append( tf.train.CheckpointSaverHook( checkpoint_dir=checkpoint_dir, save_steps=checkpoint_save_freq, saver=checkpoint_saver)) #create image dir if not exists if not os.path.isdir(image_dir): os.makedirs(image_dir) #tracing if tracing is not None: import tracehook tracing_hook = tracehook.TraceHook(steps_to_trace=tracing, cache_traces=True, trace_dir=trace_dir) hooks.append(tracing_hook) # instead of averaging losses over an entire epoch, use a moving # window average recent_losses = [] loss_window_size = 10 #start session with tf.train.MonitoredTrainingSession(config=sess_config, hooks=hooks) as sess: #initialize sess.run([init_op, init_local_op]) #restore from checkpoint: if comm_rank == 0 and not disable_checkpoints: load_model(sess, checkpoint_saver, checkpoint_dir) with open('prm{}.log'.format(comm_rank), 'w') as f: f.write('{}\n'.format(hvd.rank())) for i, v in enumerate(tf.global_variables()): f.write('{} {}\n'.format(i, v)) print('Parameters checked') #broadcast loaded model variables if horovod: tb = time.time() sess.run(init_bcast) te = time.time() print('Model synchronization done in {} s'.format(te - tb)) #create iterator handles trn_handle, val_handle = sess.run( [trn_handle_string, val_handle_string]) #init iterators sess.run(trn_init_op, feed_dict={handle: trn_handle}) sess.run(val_init_op, feed_dict={handle: val_handle}) print('Init iterations done') # figure out what step we're on (it won't be 0 if we are # restoring from a checkpoint) so we can count from there train_steps = sess.run([global_step])[0] #do the training epoch = 1 step = 1 prev_mem_usage = 0 t_sustained_start = time.time() r_peak = 0 #start training start_time = time.time() while not sess.should_stop(): #training loop try: #construct feed dict t_inst_start = time.time() _, tmp_loss, cur_lr = sess.run( [ train_op, (loss if per_rank_output else loss_avg), lr ], feed_dict={handle: trn_handle}) t_inst_end = time.time() if "gpu" in device.lower(): mem_used = sess.run(mem_usage_ops) else: mem_used = [0, 0] train_steps += 1 train_steps_in_epoch = train_steps % num_steps_per_epoch recent_losses = [tmp_loss ] + recent_losses[0:loss_window_size - 1] train_loss = sum(recent_losses) / len(recent_losses) step += 1 r_inst = (t_inst_end - t_inst_start) r_peak = max(r_peak, r_inst) #print step report eff_steps = train_steps_in_epoch if ( train_steps_in_epoch > 0) else num_steps_per_epoch if (train_steps % loss_print_interval) == 0: mem_used = sess.run(mem_usage_ops) if per_rank_output: print( "REPORT: rank {}, training loss for step {} (of {}) is {}, time {:.3f}" .format(comm_rank, train_steps, num_steps, train_loss, time.time() - start_time)) else: if comm_rank == 0: if True or mem_used[0] > prev_mem_usage: print( "memory usage: {:.2f} GB / {:.2f} GB". 
format(mem_used[0] / 2.0**30, mem_used[1] / 2.0**30)) prev_mem_usage = mem_used[0] print( "REPORT: training loss for step {} (of {}) is {}, time {:.3f}, r_inst {:.3f}, r_peak {:.3f}, lr {:.2g}" .format(train_steps, num_steps, train_loss, time.time() - start_time, r_inst, r_peak, cur_lr)) # summary = sess.run([merged], feed_dict=feed_dict(False)) # test_writer.add_summary(summary, train_steps) #do the validation phase if train_steps_in_epoch == 0: end_time = time.time() #print epoch report if per_rank_output: print( "COMPLETED: rank {}, training loss for epoch {} (of {}) is {}, time {:.3f}, r_sust {:.3f}" .format( comm_rank, epoch, num_epochs, train_loss, time.time() - start_time, 1e-12 * flops * num_steps_per_epoch / (end_time - t_sustained_start))) else: if comm_rank == 0: print( "COMPLETED: training loss for epoch {} (of {}) is {}, time {:.3f}, r_sust {:.3f}" .format( epoch, num_epochs, train_loss, time.time() - start_time, 1e-12 * flops * num_steps_per_epoch / (end_time - t_sustained_start))) #evaluation loop eval_loss = 0. eval_steps = 0 while True: try: #construct feed dict _, tmp_loss, val_model_predictions, val_model_labels, val_model_filenames = sess.run( [ iou_update_op, (loss if per_rank_output else loss_avg), prediction, next_elem[1], next_elem[3] ], feed_dict={handle: val_handle}) #print some images if comm_rank == 0 and not disable_imsave: if have_imsave: imsave( image_dir + '/test_pred_epoch' + str(epoch) + '_estep' + str(eval_steps) + '_rank' + str(comm_rank) + '.png', np.argmax( val_model_predictions[0, ...], axis=2) * 100) imsave( image_dir + '/test_label_epoch' + str(epoch) + '_estep' + str(eval_steps) + '_rank' + str(comm_rank) + '.png', val_model_labels[0, ...] * 100) imsave( image_dir + '/test_combined_epoch' + str(epoch) + '_estep' + str(eval_steps) + '_rank' + str(comm_rank) + '.png', plot_colormap[ val_model_labels[0, ...], np.argmax( val_model_predictions[0, ...], axis=2)]) else: np.savez( image_dir + '/test_epoch' + str(epoch) + '_estep' + str(eval_steps) + '_rank' + str(comm_rank) + '.npz', prediction=np.argmax( val_model_predictions[0, ...], axis=2) * 100, label=val_model_labels[0, ...] * 100, filename=val_model_filenames[0]) eval_loss += tmp_loss eval_steps += 1 except tf.errors.OutOfRangeError: eval_steps = np.max([eval_steps, 1]) eval_loss /= eval_steps if per_rank_output: print( "COMPLETED: rank {}, evaluation loss for epoch {} (of {}) is {}" .format(comm_rank, epoch, num_epochs, eval_loss)) else: if comm_rank == 0: print( "COMPLETED: evaluation loss for epoch {} (of {}) is {}" .format(epoch, num_epochs, eval_loss)) if per_rank_output: iou_score = sess.run(iou_op) print( "COMPLETED: rank {}, evaluation IoU for epoch {} (of {}) is {}" .format(comm_rank, epoch, num_epochs, iou_score)) else: iou_score = sess.run(iou_avg) if comm_rank == 0: print( "COMPLETED: evaluation IoU for epoch {} (of {}) is {}" .format(epoch, num_epochs, iou_score)) sess.run(iou_reset_op) sess.run(val_init_op, feed_dict={handle: val_handle}) break #reset counters epoch += 1 step = 0 t_sustained_start = time.time() except tf.errors.OutOfRangeError: break # write any cached traces to disk if tracing is not None: tracing_hook.write_traces()
def main(_): # Horovod: initialize Horovod. hvd.init() # Horovod: pin GPU to be used to process local rank (one GPU per process) config = tf.ConfigProto() config.gpu_options.visible_device_list = str(hvd.local_rank()) tf.enable_eager_execution(config=config) mnist_model = tf.keras.Sequential([ tf.keras.layers.Conv2D(16, [3, 3], activation='relu'), tf.keras.layers.Conv2D(16, [3, 3], activation='relu'), tf.keras.layers.GlobalAveragePooling2D(), tf.keras.layers.Dense(10) ]) # Horovod: adjust learning rate based on number of GPUs. opt = tf.train.RMSPropOptimizer(0.001 * hvd.size()) (mnist_images, mnist_labels), _ = \ tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % hvd.rank()) dataset = tf.data.Dataset.from_tensor_slices( (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), tf.cast(mnist_labels, tf.int64))) dataset = dataset.shuffle(1000).batch(32) checkpoint_dir = './checkpoints' step_counter = tf.train.get_or_create_global_step() checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt, step_counter=step_counter) # Horovod: adjust number of steps based on number of GPUs. for (batch, (images, labels)) in enumerate(dataset.take(20000 // hvd.size())): with tf.GradientTape() as tape: logits = mnist_model(images, training=True) loss_value = tf.losses.sparse_softmax_cross_entropy(labels, logits) # Horovod: broadcast initial variable states from rank 0 to all other processes. # This is necessary to ensure consistent initialization of all workers when # training is started with random weights or restored from a checkpoint. if batch == 0: hvd.broadcast_variables(mnist_model.variables, root_rank=0) # Horovod: add Horovod Distributed GradientTape. tape = hvd.DistributedGradientTape(tape) grads = tape.gradient(loss_value, mnist_model.variables) opt.apply_gradients(zip(grads, mnist_model.variables), global_step=tf.train.get_or_create_global_step()) if batch % 10 == 0 and hvd.local_rank() == 0: print('Step #%d\tLoss: %.6f' % (batch, loss_value)) # Horovod: save checkpoints only on worker 0 to prevent other workers from # corrupting it. if hvd.rank() == 0: checkpoint.save(checkpoint_dir)
init = tf.global_variables_initializer() # Horovod: broadcast initial variable states from rank 0 to all other processes. # This is necessary to ensure consistent initialization of all workers when # training is started with random weights or restored from a checkpoint. bcast = hvd.broadcast_global_variables(0) # Step 5: Begin training. # Horovod: adjust number of steps based on number of GPUs. num_steps = 100000 // hvd.size() + 1 # Horovod: pin GPU to be used to process local rank (one GPU per process) config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) with tf.Session(graph=graph, config=config) as session: # We must initialize all variables before we use them. init.run() bcast.run() print('Initialized') average_loss = 0 for step in xrange(num_steps): # simulate various sentence length by randomization batch_size = random.randint(max_batch_size // 2, max_batch_size) batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window) feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
def main(argv=None): ''' ''' main.__doc__ = __doc__ argv = sys.argv if argv is None else sys.argv.extend(argv) desc = main.__doc__ # .format(os.path.basename(__file__)) # CLI parser args = parser_(desc) # Initialize Horovod. hvd.init() logdevp = args.logdevp # For debugging log_device_placement, allow_soft_placement = (True, True) \ if _DEVPROF or logdevp else (False, False) nranks_per_gpu = args.nranks_per_gpu local_rank = hvd.local_rank() gpu_local_rank = local_rank // nranks_per_gpu print('local_rank, GPU_LOCAL_RANK: {}, {}'.format( local_rank, gpu_local_rank)) # Pin GPU to be used to process local rank (one GPU per process) config = tf.ConfigProto(log_device_placement=log_device_placement, allow_soft_placement=allow_soft_placement) config.gpu_options.allow_growth = True # config.gpu_options.visible_device_list = str(hvd.local_rank()) config.gpu_options.visible_device_list = str(gpu_local_rank) KB.set_session(tf.Session(config=config)) hvdsize = hvd.size() checkpt = getattr(args, 'checkpt', None) checkpt_flag = False if checkpt is None else True filepath = checkpt # print('CHECKPT:', checkpt) batch_size = args.batch_size num_classes = 10 epochs = args.epochs data_augmentation = args.aug datadir = getattr(args, 'datadir', None) # The data, shuffled and split between train and test sets: (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir) \ if datadir is not None else cifar10.load_data() train_samples = x_train.shape[0] test_samples = x_test.shape[0] steps_per_epoch = train_samples // batch_size // hvdsize test_batches = test_samples // batch_size print(train_samples, 'train samples') print(test_samples, 'test samples') x_train = x_train.astype('float32') x_test = x_test.astype('float32') x_train /= 255 x_test /= 255 # Convert class vectors to binary class matrices. y_train = to_categorical(y_train, num_classes).squeeze() y_test = to_categorical(y_test, num_classes).squeeze() callbacks = [] if hvd.rank() == 0: callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)] print(x_train.shape, 'train shape') # with tf.device('/cpu:0'): model = make_model(x_train.shape, num_classes, filepath if checkpt_flag else None) lr = 0.0001 * hvdsize opt = tf.train.RMSPropOptimizer(lr) # Add Horovod Distributed Optimizer. opt = hvd.DistributedOptimizer(opt) # , use_locking=True) opt = TFOptimizer(opt) # Required for tf.train based optimizers # ------------------------------------- HAVE TO GET SESSION AFTER OPTIMIZER # sess = KB.get_session() # ------------------------------------------------------------------------- # Let's train the model using RMSprop model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy']) if hvd.rank() == 0: model.summary() KB.get_session().run(hvd.broadcast_global_variables(0)) if not data_augmentation: print('Not using data augmentation.') # model.fit(x_train, y_train, # batch_size=batch_size, # epochs=epochs, # validation_data=(x_test, y_test), # shuffle=True, # callbacks=callbacks) train_gen = ImageDataGenerator() test_gen = ImageDataGenerator() # Train the model. The training will randomly sample 1 / N batches of # training data and 3 / N batches of validation data on every worker, # where N is the number of workers. Over-sampling of validation data # helps to increase probability that every validation example will be # evaluated. 
start_time = time.time() model.fit_generator( train_gen.flow(x_train, y_train, batch_size=batch_size), steps_per_epoch=steps_per_epoch, callbacks=callbacks, epochs=epochs, verbose=hvd.rank() == 0, validation_data=test_gen.flow(x_test, y_test, batch_size=batch_size), validation_steps=3 * test_batches // hvdsize) else: print('Using real-time data augmentation.') # This will do preprocessing and realtime data augmentation: datagen = ImageDataGenerator( featurewise_center=False, # set input mean to 0 over the dataset samplewise_center=False, # set each sample mean to 0 # divide inputs by std of the dataset featurewise_std_normalization=False, samplewise_std_normalization=False, # divide each input by its std zca_whitening=False, # apply ZCA whitening # randomly rotate images in the range (degrees, 0 to 180) rotation_range=0, # randomly shift images horizontally (fraction of total width) width_shift_range=0.1, # randomly shift images vertically (fraction of total height) height_shift_range=0.1, horizontal_flip=True, # randomly flip images vertical_flip=False) # randomly flip images # Compute quantities required for feature-wise normalization # (std, mean, and principal components if ZCA whitening is applied). datagen.fit(x_train) start_time = time.time() # Fit the model on the batches generated by datagen.flow(). model.fit_generator( datagen.flow(x_train, y_train, batch_size=batch_size), steps_per_epoch=steps_per_epoch, epochs=epochs, validation_data=(x_test, y_test), verbose=hvd.rank() == 0, callbacks=callbacks) if hvd.rank() == 0: elapsed_time = time.time() - start_time print('[{}] finished in {} s' .format('TRAINING', round(elapsed_time, 3))) metrics = model.evaluate(x=x_test, y=y_test, batch_size=batch_size) print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics)) KB.clear_session()
def main(argv=None): # Initialize Horovod. hvd.init() # Pin GPU to be used to process local rank (one GPU per process) config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) KB.set_session(tf.Session(config=config)) # print('LOCAL RANK, OVERAL RANK: {}, {}'.format(hvd.local_rank(), # hvd.rank())) ngpus = hvd.size() main.__doc__ = __doc__ argv = sys.argv if argv is None else sys.argv.extend(argv) desc = main.__doc__ # .format(os.path.basename(__file__)) # CLI parser args = _parser(desc) num_devices_tfrecord = 1 height, width = 224, 224 # Image dimensions. Gets resized if not match. distort_color = args.distort_color data_dir = args.datadir batch_size = args.batch_size # * ngpus epochs = args.epochs imgs_per_epoch = args.imgs_per_epoch # Fit the model using data from the TFRecord data tensors. device_minibatches = RecordInputImagenetPreprocessor.device_minibatches images_tfrecord, labels_tfrecord, nrecords = device_minibatches( num_devices_tfrecord, data_dir, batch_size, height, width, distort_color, val=False) images_tfrecord = images_tfrecord[0] labels_tfrecord = labels_tfrecord[0] # CASTING FOR KERAS # labels[device_num] = tf.cast(labels_tfrecord, dtype) nclasses = 1000 labels_tfrecord = tf.one_hot(labels_tfrecord, nclasses) nimgs_to_use = imgs_per_epoch if imgs_per_epoch > 0 else nrecords steps_per_epoch = nimgs_to_use // batch_size // hvd.size() # steps_per_epoch = 100 # batch_shape = images_tfrecord.get_shape().as_list() # images = Input(tensor=images_tfrecord, batch_shape=x_batch_shape) images = Input(tensor=images_tfrecord) model = ResNet50(input_tensor=images, weights=None) if hvd.rank() == 0: model.summary() print('Num images: {}'.format(nrecords)) if nimgs_to_use < nrecords: print('Using {} images per epoch'.format(nimgs_to_use)) # print('IMAGES_TFRECORD: {}'.format(images_tfrecord)) # print('LABELS_TFRECORD: {}'.format(labels_tfrecord)) # Add Horovod Distributed Optimizer from nvcnn.py # momentum = 0.9 # lr = 0.1 # learning_rate = tf.train.exponential_decay( # lr, # self.global_step, # decay_steps=FLAGS.lr_decay_epochs * nstep_per_epoch, # decay_rate=FLAGS.lr_decay_rate, # staircase=True) # opt = tf.train.MomentumOptimizer(self.learning_rate, momentum, # use_nesterov=True) # lr = 0.001 * ngpus # opt = tf.train.AdamOptimizer() # opt = hvd.DistributedOptimizer(opt) # , use_locking=True) # opt = KO.TFOptimizer(opt) # Required for tf.train based optimizers opt = KO.Adam() opt = hvd_keras.DistributedOptimizer(opt) model.compile(loss='categorical_crossentropy', optimizer=opt, # metrics=['accuracy'], target_tensors=[labels_tfrecord]) # Broadcast variables from rank 0 to all other processes. KB.get_session().run(hvd.broadcast_global_variables(0)) callbacks = [] if hvd.rank() == 0: callbacks += [BatchTiming(), SamplesPerSec(ngpus * batch_size)] # RecordInput is a yield op which doesn't use queue runners or queues. # Start the queue runners. 
# sess = KB.get_session() # sess.run([tf.local_variables_initializer(), # tf.global_variables_initializer()]) # coord = tf.train.Coordinator() # threads = tf.train.start_queue_runners(sess, coord) start_time = time.time() model.fit( steps_per_epoch=steps_per_epoch, epochs=epochs, callbacks=callbacks, verbose=1) # verbose=hvd.rank() == 0) elapsed_time = time.time() - start_time if hvd.rank() == 0: print('[{}] finished in {} s' .format('TRAINING', round(elapsed_time, 3))) # loss = model.evaluate(None, None, steps=steps_per_epoch_val) images_tfrecord_val, labels_tfrecord_val, nrecords_val = \ device_minibatches(num_devices_tfrecord, data_dir, batch_size, height, width, distort_color, val=True) images_tfrecord_val = images_tfrecord_val[0] labels_tfrecord_val = labels_tfrecord_val[0] labels_tfrecord_val = tf.one_hot(labels_tfrecord_val, nclasses) # print('IMAGES_TFRECORD_VAL: {}'.format(images_tfrecord_val)) # print('labels_tfrecord_val: {}'.format(labels_tfrecord_val)) steps_per_epoch_val = nrecords_val // batch_size images_val = Input(tensor=images_tfrecord_val) model_val = model model_val.layers[0] = KL.InputLayer(input_tensor=images_val) model_val.compile( loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'], target_tensors=[labels_tfrecord_val]) # model.summary() loss = model_val.evaluate(x=None, y=None, steps=steps_per_epoch_val) print('\nNum images evaluated, steps: {}, {}'. format(nrecords_val, steps_per_epoch_val)) print('\nTest loss, acc: {}'.format(loss)) # print('\nTest accuracy: {0}'.format(acc)) # Clean up the TF session. # coord.request_stop() # coord.join(threads) KB.clear_session() # do this for Horovod
#!/usr/bin/env python3
import os

print("pid %i: Hello" % os.getpid())

import tensorflow as tf
import horovod.tensorflow as hvd

# Initialize Horovod
hvd.init()

print("pid %i: hvd: rank: %i, size: %i, local_rank %i, local_size %i"
      % (os.getpid(), hvd.rank(), hvd.size(),
         hvd.local_rank(), hvd.local_size()))
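# The script above is typically launched with one process per GPU; e.g., for
# four processes on one machine (the file name is hypothetical):
#
#   horovodrun -np 4 -H localhost:4 python hello_horovod.py
#
# or, with plain MPI:
#
#   mpirun -np 4 python hello_horovod.py
#
# Each process then prints its own rank, size, local_rank, and local_size.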
def train(): """ init dir and log config """ init_cluster_ray() hvd.init() base_dir, ckpt_dir, summary_dir = init_dir_and_log() kwargs = FLAGS.flag_values_dict() kwargs["BASE_DIR"] = base_dir kwargs["ckpt_dir"] = ckpt_dir """ get one seg from rollout worker for dtype and shapes :param kwargs rollout worker config """ logging.info('get one seg from Evaluator for dtype and shapes') ps = AsyncPS.remote() small_data_collector = RolloutCollector( server_nums=1, ps=ps, policy_evaluator_build_func=build_policy_evaluator, **kwargs) cache_struct_path = '/tmp/ExploreGoalLocationsLarge_%s.pkl' % FLAGS.dir if hvd.local_rank() == 0: structure = fetch_one_structure(small_data_collector, cache_struct_path=cache_struct_path, is_head=True) else: structure = fetch_one_structure(small_data_collector, cache_struct_path=cache_struct_path, is_head=False) del small_data_collector """ init data prefetch thread, prepare_input_pipe """ keys = list(structure.keys()) dtypes = [structure[k].dtype for k in keys] shapes = [structure[k].shape for k in keys] segBuffer = tf.queue.FIFOQueue( capacity=FLAGS.qsize * FLAGS.batch_size, dtypes=dtypes, shapes=shapes, names=keys, shared_name="buffer") server_nums = FLAGS.nof_evaluator server_nums_refine = server_nums * 2 // FLAGS.cpu_per_actor nof_server_gpus = FLAGS.nof_server_gpus server_nums_refine = server_nums_refine // nof_server_gpus data_collector = RolloutCollector(server_nums=server_nums_refine, ps=ps, policy_evaluator_build_func=build_policy_evaluator, **kwargs) config = tf.ConfigProto( allow_soft_placement=True, gpu_options=tf.GPUOptions( per_process_gpu_memory_fraction=1)) config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) sess = tf.Session(config=config) reader = QueueReader( sess=sess, global_queue=segBuffer, data_collector=data_collector, keys=keys, dtypes=dtypes, shapes=shapes) reader.daemon = True reader.start() dequeued = segBuffer.dequeue_many(FLAGS.batch_size) prephs, postphs = dict(), dict() for k, v in dequeued.items(): if k == "state_in": prephs[k] = v else: prephs[k], postphs[k] = tf.split( v, [FLAGS.burn_in, FLAGS.seqlen], axis=1) prekeys = list(prephs.keys()) postkeys = list(postphs.keys()) ## count frame and total steps num_frames = tf.get_variable( 'num_environment_frames', initializer=tf.zeros_initializer(), shape=[], dtype=tf.int32, trainable=False) tf.summary.scalar("frames", num_frames) global_step = tf.train.get_or_create_global_step() dur_time_tensor = tf.placeholder(dtype=tf.float32) tf.summary.scalar('time_per_step', dur_time_tensor) # set stage_op and build learner with tf.device("/gpu"): if FLAGS.use_stage: area = tf.contrib.staging.StagingArea( [prephs[key].dtype for key in prekeys] + [postphs[key].dtype for key in postkeys], [prephs[key].shape for key in prekeys] + [postphs[key].shape for key in postkeys]) stage_op = area.put([prephs[key] for key in prekeys] + [postphs[key] for key in postkeys]) from_stage = area.get() predatas = {key: from_stage[i] for i, key in enumerate(prekeys)} postdatas = {key: from_stage[i + len(prekeys)] for i, key in enumerate(postkeys)} else: stage_op = [] predatas, postdatas = prephs, postphs act_space = FLAGS.act_space num_frames_and_train, global_step_and_train = build_learner( pre=predatas, post=postdatas, act_space=act_space, num_frames=num_frames) """ add summary """ summary_ops = tf.summary.merge_all() if hvd.local_rank() == 0: summary_writer = tf.summary.FileWriter(summary_dir, sess.graph) """ initialize and save ckpt """ saver = 
tf.train.Saver(max_to_keep=100, keep_checkpoint_every_n_hours=6) ckpt = tf.train.get_checkpoint_state(ckpt_dir) if ckpt and ckpt.model_checkpoint_path: saver.restore(sess, ckpt.model_checkpoint_path) else: sess.run(tf.global_variables_initializer()) sess.run(hvd.broadcast_global_variables(0)) if hvd.local_rank() == 0: saver.save(sess, os.path.join(ckpt_dir, "PPOcGAE"), global_step=global_step) """ step """ total_frames = 0 sess.run(stage_op) dur_time = 0 while total_frames < FLAGS.total_environment_frames: start = time.time() total_frames, gs, summary, _ = sess.run( [num_frames_and_train, global_step_and_train, summary_ops, stage_op], feed_dict={dur_time_tensor: dur_time}) if hvd.local_rank() == 0: if gs % 1 == 0: summary_writer.add_summary(summary, global_step=gs) dur_time = time.time() - start msg = "Global Step %d, Total Frames %d, Time Consume %.2f" % ( gs, total_frames, dur_time) logging.info(msg) if gs % 25 == 0: ws = Model.get_ws(sess) logging.info('pushing weight to ps') ray.get(ps.push.remote(ws)) if gs % 1000 == 0: saver.save(sess, os.path.join(ckpt_dir, "CKPT"), global_step=global_step)
def main(argv=None): ''' ''' main.__doc__ = __doc__ argv = sys.argv if argv is None else sys.argv.extend(argv) desc = main.__doc__ # .format(os.path.basename(__file__)) # CLI parser args = parser_(desc) nranks_per_gpu = args.nranks_per_gpu local_rank = hvd.local_rank() gpu_local_rank = local_rank // nranks_per_gpu print('local_rank, GPU_LOCAL_RANK: {}, {}'.format( local_rank, gpu_local_rank)) # Pin GPU to be used to process local rank (one GPU per process) config = tf.ConfigProto() config.gpu_options.allow_growth = True # config.gpu_options.visible_device_list = str(hvd.local_rank()) config.gpu_options.visible_device_list = str(gpu_local_rank) K.set_session(tf.Session(config=config)) # input image dimensions img_rows, img_cols, img_chns = 28, 28, 1 # number of convolutional filters to use filters = 64 # convolution kernel size num_conv = 3 hvdsize = hvd.size() batch_size = 128 # 100 if K.image_data_format() == 'channels_first': original_img_size = (img_chns, img_rows, img_cols) else: original_img_size = (img_rows, img_cols, img_chns) latent_dim = 2 intermediate_dim = 128 epsilon_std = 1.0 epochs = args.epochs # 5 # train the VAE on MNIST digits (x_train, _), (x_test, y_test) = mnist.load_data() x_train = x_train.astype('float32') / 255. x_train = x_train.reshape((x_train.shape[0],) + original_img_size) x_test = x_test.astype('float32') / 255. x_test = x_test.reshape((x_test.shape[0],) + original_img_size) if hvd.rank() == 0: print('x_train.shape:', x_train.shape) train_samples = x_train.shape[0] # steps_per_epoch = train_samples // batch_size // hvdsize speedupopt = args.speedup if speedupopt == SpeedupOpts.imgspersec: steps_per_epoch = train_samples // batch_size else: steps_per_epoch = int(round( float(train_samples) / batch_size / hvdsize + 0.5)) # Create the dataset and its associated one-shot iterator. buffer_size = 10000 dataset = Dataset.from_tensor_slices(x_train) dataset = dataset.repeat() dataset = dataset.shuffle(buffer_size) dataset = dataset.batch(batch_size) iterator = dataset.make_one_shot_iterator() x_train_batch = iterator.get_next() ldict = make_shared_layers_dict( img_chns, img_rows, img_cols, batch_size, filters, num_conv, intermediate_dim, latent_dim, epsilon_std) # ldict is a dictionary that holds all layers. Since these layers are # instantiated once, they are shared amongs vae, encoder, and generator. x = Input(tensor=x_train_batch) vae = make_vae(ldict, x) # : :type vae: Model lr = 0.001 # * hvdsize opt = tf.train.RMSPropOptimizer(lr) # Add Horovod Distributed Optimizer. opt = hvd.DistributedOptimizer(opt) # , use_locking=True) opt = TFOptimizer(opt) # opt = RMSprop(lr) # Add Horovod Distributed Optimizer. # opt = hvd_keras.DistributedOptimizer(opt) # , use_locking=True) vae.compile(optimizer=opt, loss=None) if hvd.rank() == 0: vae.summary() callbacks = [] if hvd.rank() == 0: callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)] sess = K.get_session() sess.run(hvd.broadcast_global_variables(0)) # Fit the model using data from the TF data tensors. 
vae.fit(steps_per_epoch=steps_per_epoch, epochs=epochs, callbacks=callbacks) if hvd.rank() == 0: x = Input(shape=original_img_size) vae_val = make_vae(ldict, x) vae_val.compile(optimizer=opt, loss=None) loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size) print('\n\nVAE VALIDATION LOSS: {}'.format(loss)) x = Input(shape=original_img_size) z_mean, _ = get_encoded(ldict, x) encoder = Model(x, z_mean) # : :type encoder: Model decoder_input = Input(shape=(latent_dim,)) x_decoded_mean_squash = get_decoded(ldict, decoder_input) generator = Model(decoder_input, x_decoded_mean_squash) # : :type generator: Model # display a 2D plot of the digit classes in the latent space x_test_encoded = encoder.predict(x_test, batch_size=batch_size) plt.figure(figsize=(6, 6)) plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test) plt.colorbar() # plt.show() plt.savefig('vae_scatter.ps') plt.close() # display a 2D manifold of the digits n = 15 # figure with 15x15 digits digit_size = 28 figure = np.zeros((digit_size * n, digit_size * n)) # Linearly spaced coordinates on the unit square were transformed # through the inverse CDF (ppf) of the Gaussian # To produce values of the latent variables z, since the prior of the # latent space is Gaussian grid_x = norm.ppf(np.linspace(0.05, 0.95, n)) grid_y = norm.ppf(np.linspace(0.05, 0.95, n)) for i, yi in enumerate(grid_x): for j, xi in enumerate(grid_y): z_sample = np.array([[xi, yi]]) z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2) x_decoded = generator.predict(z_sample, batch_size=batch_size) digit = x_decoded[0].reshape(digit_size, digit_size) figure[i * digit_size: (i + 1) * digit_size, j * digit_size: (j + 1) * digit_size] = digit plt.figure(figsize=(10, 10)) plt.imshow(figure, cmap='Greys_r') # plt.show() plt.savefig('vae_digit.ps') plt.close() K.clear_session()
def main():
    script_start = time.time()
    hvd_init()
    mpi_comm = MPI.COMM_WORLD
    args = parse_args()

    if hvd.rank() == 0:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])
    else:
        dllogger.init(backends=[])

    args.world_size = hvd.size()
    dllogger.log(data=vars(args), step='PARAMETER')

    if args.seed is None:
        if hvd.rank() == 0:
            seed = int(time.time())
        else:
            seed = None
        seed = mpi_comm.bcast(seed, root=0)
    else:
        seed = args.seed

    tf.random.set_random_seed(seed)
    np.random.seed(seed)
    cp.random.seed(seed)

    if args.amp:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"

    if args.checkpoint_dir is not None:
        os.makedirs(args.checkpoint_dir, exist_ok=True)
        final_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.ckpt')
    else:
        final_checkpoint_path = None

    # Load converted data and get statistics
    train_df = pd.read_pickle(args.data + '/train_ratings.pickle')
    test_df = pd.read_pickle(args.data + '/test_ratings.pickle')
    nb_users, nb_items = train_df.max() + 1

    # Extract train and test feature tensors from dataframe
    pos_train_users = train_df.iloc[:, 0].values.astype(np.int32)
    pos_train_items = train_df.iloc[:, 1].values.astype(np.int32)
    pos_test_users = test_df.iloc[:, 0].values.astype(np.int32)
    pos_test_items = test_df.iloc[:, 1].values.astype(np.int32)

    # Negatives indicator for negatives generation
    neg_mat = np.ones((nb_users, nb_items), dtype=np.bool)
    neg_mat[pos_train_users, pos_train_items] = 0

    # Get the local training/test data
    train_users, train_items, train_labels = get_local_train_data(
        pos_train_users, pos_train_items, args.negative_samples
    )
    test_users, test_items = get_local_test_data(
        pos_test_users, pos_test_items
    )

    # Create and run Data Generator in a separate thread
    data_generator = DataGenerator(
        args.seed,
        hvd.rank(),
        nb_users,
        nb_items,
        neg_mat,
        train_users,
        train_items,
        train_labels,
        args.batch_size // hvd.size(),
        args.negative_samples,
        test_users,
        test_items,
        args.valid_users_per_batch,
        args.valid_negative,
    )

    # Create tensorflow session and saver
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    if args.xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    sess = tf.Session(config=config)

    # Input tensors
    users = tf.placeholder(tf.int32, shape=(None,))
    items = tf.placeholder(tf.int32, shape=(None,))
    labels = tf.placeholder(tf.int32, shape=(None,))
    is_dup = tf.placeholder(tf.float32, shape=(None,))
    dropout = tf.placeholder_with_default(args.dropout, shape=())

    # Model ops and saver
    hit_rate, ndcg, eval_op, train_op = ncf_model_ops(
        users,
        items,
        labels,
        is_dup,
        params={
            'val_batch_size': args.valid_negative + 1,
            'top_k': args.topk,
            'learning_rate': args.learning_rate,
            'beta_1': args.beta1,
            'beta_2': args.beta2,
            'epsilon': args.eps,
            'num_users': nb_users,
            'num_items': nb_items,
            'num_factors': args.factors,
            'mf_reg': 0,
            'layer_sizes': args.layers,
            'layer_regs': [0. for i in args.layers],
            'dropout': dropout,
            'sigmoid': True,
            'loss_scale': args.loss_scale
        },
        mode='TRAIN' if args.mode == 'train' else 'EVAL'
    )
    saver = tf.train.Saver()

    # Accuracy metric tensors
    hr_sum = tf.get_default_graph().get_tensor_by_name('neumf/hit_rate/total:0')
    hr_cnt = tf.get_default_graph().get_tensor_by_name('neumf/hit_rate/count:0')
    ndcg_sum = tf.get_default_graph().get_tensor_by_name('neumf/ndcg/total:0')
    ndcg_cnt = tf.get_default_graph().get_tensor_by_name('neumf/ndcg/count:0')

    # Prepare evaluation data
    data_generator.prepare_eval_data()

    if args.load_checkpoint_path:
        saver.restore(sess, args.load_checkpoint_path)
    else:
        # Manually initialize weights
        sess.run(tf.global_variables_initializer())

    # If test mode, run one eval
    if args.mode == 'test':
        sess.run(tf.local_variables_initializer())
        eval_start = time.time()
        for user_batch, item_batch, dup_batch \
                in zip(data_generator.eval_users,
                       data_generator.eval_items,
                       data_generator.dup_mask):
            sess.run(
                eval_op,
                feed_dict={
                    users: user_batch,
                    items: item_batch,
                    is_dup: dup_batch,
                    dropout: 0.0
                }
            )
        eval_duration = time.time() - eval_start

        # Report results
        hit_rate_sum = sess.run(hvd.allreduce(hr_sum, average=False))
        hit_rate_cnt = sess.run(hvd.allreduce(hr_cnt, average=False))
        ndcg_sum = sess.run(hvd.allreduce(ndcg_sum, average=False))
        ndcg_cnt = sess.run(hvd.allreduce(ndcg_cnt, average=False))

        hit_rate = hit_rate_sum / hit_rate_cnt
        ndcg = ndcg_sum / ndcg_cnt

        if hvd.rank() == 0:
            eval_throughput = pos_test_users.shape[0] * (args.valid_negative + 1) / eval_duration
            dllogger.log(step=tuple(), data={
                'eval_throughput': eval_throughput,
                'eval_time': eval_duration,
                'hr@10': float(hit_rate),
                'ndcg': float(ndcg)})
        return

    # Performance Metrics
    train_times = list()
    eval_times = list()
    # Accuracy Metrics
    first_to_target = None
    time_to_train = 0.0
    best_hr = 0
    best_epoch = 0
    # Buffers for global metrics
    global_hr_sum = np.ones(1)
    global_hr_count = np.ones(1)
    global_ndcg_sum = np.ones(1)
    global_ndcg_count = np.ones(1)
    # Buffers for local metrics
    local_hr_sum = np.ones(1)
    local_hr_count = np.ones(1)
    local_ndcg_sum = np.ones(1)
    local_ndcg_count = np.ones(1)

    # Begin training
    begin_train = time.time()
    for epoch in range(args.epochs):
        # Train for one epoch
        train_start = time.time()
        data_generator.prepare_train_data()
        for user_batch, item_batch, label_batch \
                in zip(data_generator.train_users_batches,
                       data_generator.train_items_batches,
                       data_generator.train_labels_batches):
            sess.run(
                train_op,
                feed_dict={
                    users: user_batch.get(),
                    items: item_batch.get(),
                    labels: label_batch.get()
                }
            )
        train_duration = time.time() - train_start

        # Only log "warm" epochs
        if epoch >= 1:
            train_times.append(train_duration)

        # Evaluate
        if epoch > args.eval_after:
            eval_start = time.time()
            sess.run(tf.local_variables_initializer())
            for user_batch, item_batch, dup_batch \
                    in zip(data_generator.eval_users,
                           data_generator.eval_items,
                           data_generator.dup_mask):
                sess.run(
                    eval_op,
                    feed_dict={
                        users: user_batch,
                        items: item_batch,
                        is_dup: dup_batch,
                        dropout: 0.0
                    }
                )

            # Compute local metrics
            local_hr_sum[0] = sess.run(hr_sum)
            local_hr_count[0] = sess.run(hr_cnt)
            local_ndcg_sum[0] = sess.run(ndcg_sum)
            local_ndcg_count[0] = sess.run(ndcg_cnt)
            # Reduce metrics across all workers
            mpi_comm.Reduce(local_hr_count, global_hr_count)
            mpi_comm.Reduce(local_hr_sum, global_hr_sum)
            mpi_comm.Reduce(local_ndcg_count, global_ndcg_count)
            mpi_comm.Reduce(local_ndcg_sum, global_ndcg_sum)
            # Calculate metrics
            hit_rate = global_hr_sum[0] / global_hr_count[0]
            ndcg = global_ndcg_sum[0] / global_ndcg_count[0]

            eval_duration = time.time() - eval_start

            # Only log "warm" epochs
            if epoch >= 1:
                eval_times.append(eval_duration)

            if hvd.rank() == 0:
                dllogger.log(step=(epoch,), data={
                    'train_time': train_duration,
                    'eval_time': eval_duration,
                    'hr@10': hit_rate,
                    'ndcg': ndcg})

                # Update summary metrics
                if hit_rate > args.target and first_to_target is None:
                    first_to_target = epoch
                    time_to_train = time.time() - begin_train
                if hit_rate > best_hr:
                    best_hr = hit_rate
                    best_epoch = epoch
                    time_to_best = time.time() - begin_train
                    if hit_rate > args.target and final_checkpoint_path:
                        saver.save(sess, final_checkpoint_path)

    # Final Summary
    if hvd.rank() == 0:
        train_times = np.array(train_times)
        train_throughputs = pos_train_users.shape[0] * (args.negative_samples + 1) / train_times
        eval_times = np.array(eval_times)
        eval_throughputs = pos_test_users.shape[0] * (args.valid_negative + 1) / eval_times

        dllogger.log(step=tuple(), data={
            'average_train_time_per_epoch': np.mean(train_times),
            'average_train_throughput': np.mean(train_throughputs),
            'average_eval_time_per_epoch': np.mean(eval_times),
            'average_eval_throughput': np.mean(eval_throughputs),
            'first_epoch_to_hit': first_to_target,
            'time_to_train': time_to_train,
            'time_to_best': time_to_best,
            'best_hr': best_hr,
            'best_epoch': best_epoch})
        dllogger.flush()

    sess.close()
    return
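# For reference, a minimal sketch of the two metric-aggregation patterns the
# main() above uses: summing a TF tensor across workers with
# hvd.allreduce(average=False), and summing NumPy buffers with mpi4py's
# Reduce. All variable names here are illustrative.
import numpy as np
import tensorflow as tf
import horovod.tensorflow as hvd
from mpi4py import MPI

hvd.init()

# Pattern 1: allreduce with average=False performs a global sum; the result
# is identical on every worker.
local_metric = tf.constant([float(hvd.rank())])
global_metric = hvd.allreduce(local_metric, average=False)
with tf.Session() as sess:
    print(sess.run(global_metric))  # sum of all ranks

# Pattern 2: MPI Reduce sums into a buffer on the root rank only.
send_buf = np.ones(1) * hvd.rank()
recv_buf = np.zeros(1)
MPI.COMM_WORLD.Reduce(send_buf, recv_buf, root=0)  # default op is MPI.SUM
if hvd.rank() == 0:
    print(recv_buf[0])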
def main(_):
    if FLAGS.strategy == 'horovod':
        import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
        logging.info('Use horovod with multi gpus')
        hvd.init()
        os.environ['CUDA_VISIBLE_DEVICES'] = str(hvd.local_rank())
    import tensorflow.compat.v1 as tf  # pylint: disable=g-import-not-at-top
    tf.enable_v2_tensorshape()
    tf.disable_eager_execution()

    if FLAGS.strategy == 'tpu':
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tpu_grpc_url = tpu_cluster_resolver.get_master()
        tf.Session.reset(tpu_grpc_url)
    else:
        tpu_cluster_resolver = None

    # Check data path
    if FLAGS.mode in ('train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError('You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')

    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs

    # Parse image size in case it is in string format.
    config.image_size = utils.parse_image_size(config.image_size)

    # The following is for spatial partitioning. `features` has one tensor while
    # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`; see the partition logic below.
    # In the TPUEstimator context, the meaning of `shard` and `replica` is the
    # same; following the API, both terms are used here.
    if FLAGS.use_spatial_partition:
        # Check that input_partition_dims agrees with num_cores_per_replica.
        if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
            raise RuntimeError(
                '--num_cores_per_replica must be a product of array '
                'elements in --input_partition_dims.')

        labels_partition_dims = {
            'mean_num_positives': None,
            'source_ids': None,
            'groundtruth_data': None,
            'image_scales': None,
        }
        # The Input Partition Logic: We partition only the partition-able tensors.
        # Spatial partition requires that the to-be-partitioned tensors must have
        # a dimension that is a multiple of `partition_dims`. Depending on
        # `partition_dims`, the `image_size` and the `max_level` in config, some
        # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
        # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], the
        # image size is 1536 and `max_level` is 9, `cls_targets_8` has a shape of
        # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
        # case, the level-8 and level-9 target tensors are not partition-able, and
        # the highest partition-able level is 7.
        feat_sizes = utils.get_feat_sizes(config.get('image_size'),
                                          config.get('max_level'))
        for level in range(config.get('min_level'), config.get('max_level') + 1):

            def _can_partition(spatial_dim):
                partitionable_index = np.where(
                    spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
                return len(partitionable_index[0]) == len(FLAGS.input_partition_dims)

            spatial_dim = feat_sizes[level]
            if _can_partition(spatial_dim['height']) and _can_partition(spatial_dim['width']):
                labels_partition_dims['box_targets_%d' % level] = FLAGS.input_partition_dims
                labels_partition_dims['cls_targets_%d' % level] = FLAGS.input_partition_dims
            else:
                labels_partition_dims['box_targets_%d' % level] = None
                labels_partition_dims['cls_targets_%d' % level] = None

        num_cores_per_replica = FLAGS.num_cores_per_replica
        input_partition_dims = [FLAGS.input_partition_dims, labels_partition_dims]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        num_shards = FLAGS.num_cores

    params = dict(
        config.as_dict(),
        model_name=FLAGS.model_name,
        iterations_per_loop=FLAGS.iterations_per_loop,
        model_dir=FLAGS.model_dir,
        num_shards=num_shards,
        num_examples_per_epoch=FLAGS.num_examples_per_epoch,
        strategy=FLAGS.strategy,
        backbone_ckpt=FLAGS.backbone_ckpt,
        ckpt=FLAGS.ckpt,
        val_json_file=FLAGS.val_json_file,
        testdev_dir=FLAGS.testdev_dir,
        mode=FLAGS.mode)
    config_proto = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    if FLAGS.strategy != 'tpu':
        if FLAGS.use_xla:
            config_proto.graph_options.optimizer_options.global_jit_level = (
                tf.OptimizerOptions.ON_1)
        config_proto.gpu_options.allow_growth = True

    tpu_config = tf.estimator.tpu.TPUConfig(
        FLAGS.iterations_per_loop,
        num_cores_per_replica=num_cores_per_replica,
        input_partition_dims=input_partition_dims,
        per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2)

    if FLAGS.strategy == 'horovod':
        model_dir = FLAGS.model_dir if hvd.rank() == 0 else None
    else:
        model_dir = FLAGS.model_dir

    run_config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=model_dir,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=config_proto,
        tpu_config=tpu_config,
        tf_random_seed=FLAGS.tf_random_seed,
    )

    model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)
    max_instances_per_image = config.max_instances_per_image
    eval_steps = int(FLAGS.eval_samples // FLAGS.eval_batch_size)
    use_tpu = (FLAGS.strategy == 'tpu')
    logging.info(params)

    def _train(steps):
        """Build train estimator and run training if steps > 0."""
        train_estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            use_tpu=use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            config=run_config,
            params=params)
        train_estimator.train(
            input_fn=dataloader.InputReader(
                FLAGS.training_file_pattern,
                is_training=True,
                use_fake_data=FLAGS.use_fake_data,
                max_instances_per_image=max_instances_per_image),
            max_steps=steps)

    def _eval(steps):
        """Build estimator and eval the latest checkpoint if steps > 0."""
        eval_params = dict(
            params,
            strategy=FLAGS.strategy,
            input_rand_hflip=False,
            is_training_bn=False,
        )
        eval_estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            use_tpu=use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=eval_params)
        eval_results = eval_estimator.evaluate(
            input_fn=dataloader.InputReader(
                FLAGS.validation_file_pattern,
                is_training=False,
                max_instances_per_image=max_instances_per_image),
            steps=steps,
            name=FLAGS.eval_name)
        logging.info('Evaluation results: %s', eval_results)
        return eval_results

    # Start train/eval flow.
    if FLAGS.mode == 'train':
        total_examples = int(config.num_epochs * FLAGS.num_examples_per_epoch)
        _train(total_examples // FLAGS.train_batch_size)
        if FLAGS.eval_after_training:
            _eval(eval_steps)
    elif FLAGS.mode == 'eval':
        # Run evaluation when there's a new checkpoint
        for ckpt in tf.train.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout):
            logging.info('Starting to evaluate.')
            try:
                eval_results = _eval(eval_steps)
                # Terminate eval job when final checkpoint is reached.
                try:
                    current_step = int(os.path.basename(ckpt).split('-')[1])
                except IndexError:
                    logging.info('%s has no global step info: stop!', ckpt)
                    break

                utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
                total_step = int(
                    (config.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    logging.info('Evaluation finished after training step %d',
                                 current_step)
                    break
            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long
                # after the CPU job tells it to start evaluating. In this case, the
                # checkpoint file could have been deleted already.
                logging.info('Checkpoint %s no longer exists, skipping.', ckpt)
    elif FLAGS.mode == 'train_and_eval':
        epochs_per_cycle = 1  # A higher number has less graph-construction overhead.
        for e in range(1, config.num_epochs + 1, epochs_per_cycle):
            logging.info('Starting training, epoch: %d.', e)
            _train(e * FLAGS.num_examples_per_epoch // FLAGS.train_batch_size)
            logging.info('Starting evaluation, epoch: %d.', e)
            eval_results = _eval(eval_steps)
            ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
            utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
    else:
        logging.info('Mode not found.')
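# The Horovod branch in the main() above only initializes Horovod and
# redirects model_dir; the gradient averaging itself happens inside the model
# function. A minimal sketch of that piece, assuming a model_fn along the
# lines of det_model_fn (the function name, optimizer choice, and momentum
# value below are illustrative):
def _horovod_optimizer_sketch(learning_rate):
    import horovod.tensorflow as hvd
    import tensorflow.compat.v1 as tf

    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
    # Horovod: wrap the optimizer so that every apply_gradients call first
    # allreduces the gradients across all workers.
    optimizer = hvd.DistributedOptimizer(optimizer)
    return optimizer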
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Disable TensorFlow logging.

import tensorflow as tf
import horovod.tensorflow as hvd

# Horovod: initialize Horovod.
# `dims` and `comm` are optional.
hvd.init([2, 4, 4])

# Horovod: pin GPU to be used to process local rank (one GPU per process)
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

(mnist_images, mnist_labels), _ = \
    tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % hvd.rank())

dataset = tf.data.Dataset.from_tensor_slices(
    (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
     tf.cast(mnist_labels, tf.int64))
)
dataset = dataset.repeat().shuffle(10000).batch(128)

mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    # The snippet was truncated here; the remaining layers follow the
    # standard Horovod MNIST example.
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax')
])
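# A sketch of how the standard Horovod TF2 MNIST example continues from the
# model definition above: an eager training step that averages gradients with
# hvd.DistributedGradientTape and broadcasts the initial state from rank 0 on
# the first batch. The loss and optimizer choices follow that example.
loss = tf.losses.SparseCategoricalCrossentropy()
# Horovod: scale the learning rate by the number of workers.
opt = tf.optimizers.Adam(0.001 * hvd.size())


@tf.function
def training_step(images, labels, first_batch):
    with tf.GradientTape() as tape:
        probs = mnist_model(images, training=True)
        loss_value = loss(labels, probs)

    # Horovod: wrap the tape so gradient() returns allreduced gradients.
    tape = hvd.DistributedGradientTape(tape)
    grads = tape.gradient(loss_value, mnist_model.trainable_variables)
    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))

    if first_batch:
        # Horovod: broadcast initial variable states from rank 0 to all other
        # processes; done after the first step so that variables exist.
        hvd.broadcast_variables(mnist_model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

    return loss_value


# Scale the step count down by the number of workers.
for batch, (images, labels) in enumerate(dataset.take(10000 // hvd.size())):
    loss_value = training_step(images, labels, batch == 0)
    if batch % 10 == 0 and hvd.local_rank() == 0:
        print('Step #%d\tLoss: %.6f' % (batch, loss_value))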
def minibatch(self):
    """Returns a minibatch of images and labels from the TF records file."""
    with tf.name_scope('pipeline'):
        ds = tf.data.Dataset.from_generator(self.generator, (tf.int64),
                                            (tf.TensorShape([])))
        if self.mode == 'train':
            max_num_records = self.params['num_epochs'] * \
                self.params['NUM_EXAMPLES_PER_EPOCH']
            ds = ds.take(max_num_records)
            ds = ds.prefetch(min(1, self.num_samples))
            ds = ds.batch(self.params['batch_size'], drop_remainder=True)
            # ds = ds.map(self.wrapped_decode, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            ds = ds.map(self.wrapped_decode)
            iterator = ds.make_one_shot_iterator()
            images, labels = [], []
            for _ in range(self.params['batch_size']):
                image, label = iterator.get_next()
                image = tf.reshape(image, self.data_specs['image_shape'])
                images.append(tf.reshape(image, self.data_specs['image_shape']))
                labels.append(tf.reshape(label, self.data_specs['label_shape']))
        elif self.mode == 'eval':
            ds = ds.take(self.num_samples)
            ds = ds.batch(self.params['batch_size'], drop_remainder=True)
            ds = ds.map(self.wrapped_decode)
            iterator = ds.make_one_shot_iterator()
            images, labels = [], []
            if self.params[self.mode + '_distort']:
                print('images will be distorted')
            for _ in range(self.params['batch_size']):
                image, label = iterator.get_next()
                image = tf.reshape(image, self.data_specs['image_shape'])
                images.append(tf.reshape(image, self.data_specs['image_shape']))
                labels.append(tf.reshape(label, self.data_specs['label_shape']))

        if tf.executing_eagerly():
            images = tf.stack(images)
            labels = tf.stack(labels)
        else:
            images = tf.parallel_stack(images)
            labels = tf.parallel_stack(labels)

        # Reshape them to the expected shape:
        labels_newshape = [self.params['batch_size']] + self.data_specs['label_shape']
        images_newshape = [self.params['batch_size']] + self.data_specs['image_shape']
        labels = tf.reshape(labels, labels_newshape)
        images = tf.reshape(images, images_newshape)

        # labels = self.image_scaling(labels)
        images = self.image_scaling(images)

        # data augmentation
        if self.params[self.mode + '_distort']:
            with tf.device('/gpu:%i' % hvd.local_rank()):
                if self.params.get('random_crop', False):
                    images = tf.transpose(images, perm=[0, 2, 3, 1])
                    images = self.random_crop_resize(images)
                    images = self.add_noise_image(images)
                    images = tf.transpose(images, perm=[0, 3, 1, 2])
                else:
                    images = self.add_noise_image(images)
    return images, labels
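# The pipeline above shards work implicitly through per-rank generators. For
# comparison, a minimal sketch of tf.data's built-in sharding under Horovod
# (the dataset and batch size here are illustrative):
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
ds = tf.data.Dataset.range(1000)
# Each Horovod rank reads a disjoint 1/size slice of the records.
ds = ds.shard(num_shards=hvd.size(), index=hvd.rank())
ds = ds.shuffle(256, seed=hvd.rank()).batch(32, drop_remainder=True)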
def train_ffn(model_cls, **model_kwargs):
    hvd.init()
    rank = hvd.rank()  # used below to gate rank-0-only work
    logging.info('Rank: %d, local rank: %d' % (rank, hvd.local_rank()))

    with tf.Graph().as_default():
        # The constructor might define TF ops/placeholders, so it is important
        # that the FFN is instantiated within the current context.
        model = model_cls(**model_kwargs)
        eval_shape_zyx = train_eval_size(model).tolist()[::-1]
        eval_tracker = EvalTracker(eval_shape_zyx)
        load_data_ops = h5_distributed_dataset(model, queue_batch=1)
        print(load_data_ops)
        prepare_ffn(model)
        merge_summaries_op = tf.summary.merge_all()

        if FLAGS.task == 0:
            save_flags()

        hooks = [
            # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable
            # states from rank 0 to all other processes. This is necessary to
            # ensure consistent initialization of all workers when training is
            # started with random weights or restored from a checkpoint.
            hvd.BroadcastGlobalVariablesHook(0),
            # Horovod: adjust number of steps based on number of GPUs.
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps // hvd.size()),
        ]

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())

        checkpoint_dir = FLAGS.train_dir if hvd.rank() == 0 else None
        summary_writer = None
        saver = tf.train.Saver(max_to_keep=None, keep_checkpoint_every_n_hours=24)
        scaffold = tf.train.Scaffold(saver=saver)
        with tf.train.MonitoredTrainingSession(
                master=FLAGS.master,
                is_chief=(FLAGS.task == 0),
                checkpoint_dir=checkpoint_dir,
                hooks=hooks,
                save_checkpoint_secs=30,
                save_summaries_steps=None,
                config=config,
                scaffold=scaffold) as sess:

            eval_tracker.sess = sess
            step = int(sess.run(model.global_step))

            if FLAGS.task > 0:
                # To avoid early instabilities when using multiple replicas, we
                # use a launch schedule where new replicas are brought online
                # gradually.
                logging.info('Delaying replica start.')
                while step < FLAGS.replica_step_delay * FLAGS.task:
                    time.sleep(5.0)

            if rank == 0:
                summary_writer = tf.summary.FileWriterCache.get(FLAGS.train_dir)
                summary_writer.add_session_log(
                    tf.summary.SessionLog(status=tf.summary.SessionLog.START),
                    step)

            fov_shifts = list(model.shifts)  # x, y, z
            if FLAGS.shuffle_moves:
                random.shuffle(fov_shifts)

            policy_map = {
                'fixed': partial(fixed_offsets, fov_shifts=fov_shifts),
                'max_pred_moves': max_pred_offsets,
            }
            batch_it = get_batch(
                lambda: sess.run(load_data_ops),
                eval_tracker, model, FLAGS.batch_size,
                # eval_tracker, model, 1,
                policy_map[FLAGS.fov_policy])

            t_last = time.time()

            while not sess.should_stop() and step < FLAGS.max_steps:
                # Run summaries periodically.
                t_curr = time.time()
                if t_curr - t_last > FLAGS.summary_rate_secs and FLAGS.task == 0:
                    summ_op = merge_summaries_op
                    t_last = t_curr
                else:
                    summ_op = None

                seed, patches, labels, weights = next(batch_it)

                updated_seed, step, summ = run_training_step(
                    sess, model, summ_op,
                    feed_dict={
                        model.loss_weights: weights,
                        model.labels: labels,
                        model.input_patches: patches,
                        model.input_seed: seed,
                    })

                # Save prediction results in the original seed array so that
                # they can be used in subsequent steps.
                mask.update_at(seed, (0, 0, 0), updated_seed)

                # Record summaries.
                if hvd.rank() == 0 and summ is not None:
                    logging.info('Saving summaries.')
                    summ = tf.Summary.FromString(summ)

                    # Compute a loss over the whole training patch (i.e. more
                    # than a single-step field of view of the network). This
                    # quantifies the quality of the final object mask.
                    summ.value.extend(eval_tracker.get_summaries())
                    eval_tracker.reset()

                    assert summary_writer is not None
                    summary_writer.add_summary(summ, step)

        if summary_writer is not None:
            summary_writer.flush()
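# A minimal sketch of the summary-extension pattern used above: build a
# tf.Summary proto, append scalar values, and write it out with a TF1
# FileWriter. The tag name and log directory below are illustrative.
import tensorflow as tf

summ = tf.Summary()
summ.value.add(tag='eval/patch_loss', simple_value=0.42)
writer = tf.summary.FileWriter('/tmp/ffn_train_dir')
writer.add_summary(summ, global_step=100)
writer.flush()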