def load_train_data():
    """Load the training images and masks belonging to this DDL rank.

    Both arrays are read from ``ROOT``; the file names are suffixed with the
    value of ``ddl.rank()`` so every rank reads its own pair of ``.npy`` files.

    Returns:
        tuple: ``(imgs_train, imgs_mask_train)`` as returned by ``np.load``.
    """
    images_path = os.path.join(
        ROOT, 'img_20191216_aug_' + str(ddl.rank()) + '.npy')
    imgs_train = np.load(images_path)

    masks_path = os.path.join(
        ROOT, 'labelOnehot_20191216_aug_binary_' + str(ddl.rank()) + '.npy')
    imgs_mask_train = np.load(masks_path)

    return imgs_train, imgs_mask_train
def run_mnist_eager():
    """Train and evaluate the MNIST model in eager mode.

    Each rank works with data and checkpoint directories suffixed with
    ``ddl.rank()``, trains on its shard of the training set for
    ``train_epochs`` epochs, evaluates on the test set after every epoch and
    then saves a checkpoint.
    """
    data_dir = '/tmp/tensorflow/mnist/input_data' + str(ddl.rank())
    model_dir = '/tmp/tensorflow/mnist/checkpoints/' + str(ddl.rank()) + '/'

    # Remove a pre-existing checkpoint directory.
    # NOTE(review): model_dir is keyed by the global rank, yet removal is
    # gated on local_rank() == 0 -- confirm that is the intended behaviour.
    if os.path.isdir(model_dir) and ddl.local_rank() == 0:
        shutil.rmtree(model_dir)

    data_format = 'channels_first'
    image_shape = (1, 28, 28)

    # Training data: this rank's shard, shuffled, then batched.
    raw_train_ds, _ = mnist_dataset.train(data_dir, image_shape,
                                          label_int=True)
    train_ds = raw_train_ds.shard(ddl.size(), ddl.rank())
    train_ds = train_ds.shuffle(60000).batch(batch_size)

    # Test data: batched only.
    raw_test_ds, _ = mnist_dataset.test(data_dir, image_shape, label_int=True)
    test_ds = raw_test_ds.batch(batch_size)

    model = create_model(data_format)
    optimizer = tf.train.MomentumOptimizer(0.01, 0.5)

    # Both summary writers are created with a log directory of None.
    summary_writer = tf.contrib.summary.create_file_writer(
        None, flush_millis=10000)
    test_summary_writer = tf.contrib.summary.create_file_writer(
        None, flush_millis=10000, name='test')

    # Checkpoint object; restore from the latest checkpoint in model_dir
    # when one exists.
    checkpoint_prefix = os.path.join(model_dir, 'ckpt-r' + str(ddl.rank()))
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tf.train.Checkpoint(model=model,
                                     optimizer=optimizer,
                                     step_counter=step_counter)
    checkpoint.restore(tf.train.latest_checkpoint(model_dir))

    for _ in range(train_epochs):
        epoch_start = time.time()
        with summary_writer.as_default():
            train(model, optimizer, train_ds, step_counter, 10)
        elapsed = time.time() - epoch_start

        # Only rank 0 reports the epoch timing.
        if ddl.rank() == 0:
            print('\nTrain time for epoch #%d (%d total steps): %f' %
                  (checkpoint.save_counter.numpy() + 1,
                   step_counter.numpy(),
                   elapsed))

        with test_summary_writer.as_default():
            test(model, test_ds)
        checkpoint.save(checkpoint_prefix)
def run_main(flags, default_hparams, train_fn, inference_fn,
             target_session=""):
    """Seed the RNGs, load hparams and dispatch to training or inference.

    Args:
        flags: parsed command-line flags (``num_workers``, ``random_seed``,
            ``out_dir``, ``hparams_path``, ``inference_*`` and ``ckpt`` are
            read here).
        default_hparams: hparams passed to ``create_or_load_hparams``.
        train_fn: called as ``train_fn(hparams, target_session=...)`` when no
            inference input file is given.
        inference_fn: called with the checkpoint, input file, output file,
            hparams, worker count and job id when an inference input file is
            given.
        target_session: forwarded to ``train_fn``.
    """
    # The job id is the DDL rank (the original flags.jobid is not used) so
    # that DDL enforces data partitioning.
    jobid = ddl.rank()
    num_workers = flags.num_workers  # must be 1 for ddl
    utils.print_out("# Job id %d" % jobid)

    # Seed every random generator with a per-job seed.
    random_seed = flags.random_seed
    if random_seed is not None and random_seed > 0:
        utils.print_out("# Set random seed to %d" % random_seed)
        job_seed = random_seed + jobid
        random.seed(job_seed)
        np.random.seed(job_seed)
        tf.set_random_seed(job_seed)

    # Make sure the output directory exists.
    out_dir = flags.out_dir
    if not tf.gfile.Exists(out_dir):
        tf.gfile.MakeDirs(out_dir)

    # Load hparams; only job 0 is asked to save them.
    hparams = create_or_load_hparams(out_dir,
                                     default_hparams,
                                     flags.hparams_path,
                                     save_hparams=(jobid == 0))

    if not flags.inference_input_file:
        # Training path.
        train_fn(hparams, target_session=target_session)
        return

    # Inference path: optional explicit list of sentence indices.
    hparams.inference_indices = None
    if flags.inference_list:
        hparams.inference_indices = [
            int(token) for token in flags.inference_list.split(",")
        ]

    trans_file = flags.inference_output_file
    ckpt = flags.ckpt or tf.train.latest_checkpoint(out_dir)
    inference_fn(ckpt, flags.inference_input_file, trans_file, hparams,
                 num_workers, jobid)

    # Score the translations when a reference and an output file exist.
    ref_file = flags.inference_ref_file
    if ref_file and tf.gfile.Exists(trans_file):
        for metric in hparams.metrics:
            score = evaluation_utils.evaluate(ref_file, trans_file, metric,
                                              hparams.subword_option)
            utils.print_out(" %s: %.1f" % (metric, score))
def _external_eval(model, global_step, sess, hparams, iterator,
                   iterator_feed_dict, tgt_file, label, summary_writer,
                   save_on_best, avg_ckpts=False):
    """Run external evaluation (such as BLEU and ROUGE) and track the best.

    Decoding only happens when ``global_step`` is positive. For every metric
    in ``hparams.metrics`` a summary is written; when ``save_on_best`` is set
    and the score beats the value stored in ``hparams``, the stored best is
    updated and rank 0 saves a checkpoint to the metric's best directory.

    Returns:
        The scores returned by ``nmt_utils.decode_and_evaluate``.
    """
    out_dir = hparams.out_dir
    decode = global_step > 0

    if avg_ckpts:
        label = "avg_" + label

    if decode:
        utils.print_out("# External evaluation, global step %d" % global_step)

    sess.run(iterator.initializer, feed_dict=iterator_feed_dict)

    output_path = os.path.join(out_dir, "output_%s" % label)
    scores = nmt_utils.decode_and_evaluate(
        label,
        model,
        sess,
        output_path,
        ref_file=tgt_file,
        metrics=hparams.metrics,
        subword_option=hparams.subword_option,
        beam_width=hparams.beam_width,
        tgt_eos=hparams.eos,
        decode=decode)

    if not decode:
        return scores

    # Save on best metrics
    best_prefix = "avg_best_" if avg_ckpts else "best_"
    for metric in hparams.metrics:
        best_metric_label = best_prefix + metric
        utils.add_summary(summary_writer, global_step,
                          "%s_%s" % (label, metric), scores[metric])

        # metric: larger is better
        improved = (save_on_best and
                    scores[metric] > getattr(hparams, best_metric_label))
        if improved:
            setattr(hparams, best_metric_label, scores[metric])
            # Only rank 0 writes the checkpoint.
            if ddl.rank() == 0:
                best_dir = getattr(hparams, best_metric_label + "_dir")
                model.saver.save(sess,
                                 os.path.join(best_dir, "translate.ckpt"),
                                 global_step=model.global_step)
    utils.save_hparams(out_dir, hparams)
    return scores
def test(model, dataset):
    """Evaluate `model` on every batch of `dataset` and report the results.

    Accumulates the mean loss and the accuracy over the dataset, prints them
    on rank 0, and records both as summary scalars.
    """
    mean_loss = tfe.metrics.Mean('loss', dtype=tf.float32)
    acc_metric = tfe.metrics.Accuracy('accuracy', dtype=tf.float32)

    for images, labels in dataset:
        logits = model(images, training=False)
        mean_loss(loss(logits, labels))
        predictions = tf.argmax(logits, axis=1, output_type=tf.int64)
        acc_metric(predictions, tf.cast(labels, tf.int64))

    # Only rank 0 prints the evaluation line.
    if ddl.rank() == 0:
        print('Test set: Average loss: %.4f, Accuracy: %4f%%\n' %
              (mean_loss.result(), 100 * acc_metric.result()))

    with tf.contrib.summary.always_record_summaries():
        tf.contrib.summary.scalar('loss', mean_loss.result())
        tf.contrib.summary.scalar('accuracy', acc_metric.result())
def train(model, optimizer, dataset, step_counter, log_interval=None):
    """Run one pass over `dataset`, updating `model` with `optimizer`.

    Args:
        model: callable taking ``(images, training=True)`` and returning
            logits; its ``variables`` are the ones being optimized.
        optimizer: object providing ``apply_gradients``.
        dataset: iterable of ``(images, labels)`` batches.
        step_counter: global step passed to the summary recorder and to
            ``apply_gradients``.
        log_interval: when truthy, rank 0 prints the loss and step rate every
            ``log_interval`` batches.
    """
    interval_start = time.time()
    for batch, (images, labels) in enumerate(dataset):
        with tf.contrib.summary.record_summaries_every_n_global_steps(
                10, global_step=step_counter):
            # The tape records the forward pass so the gradient of the loss
            # with respect to the model variables can be computed afterwards.
            with tf.GradientTape() as tape:
                logits = model(images, training=True)
                loss_value = loss(logits, labels)
                tf.contrib.summary.scalar('loss', loss_value)
                tf.contrib.summary.scalar('accuracy',
                                          compute_accuracy(logits, labels))
            gradients = tape.gradient(loss_value, model.variables)
            optimizer.apply_gradients(zip(gradients, model.variables),
                                      global_step=step_counter)

            # Skip logging unless it is due on this batch and we are rank 0.
            if (not log_interval or batch % log_interval != 0
                    or ddl.rank() != 0):
                continue
            rate = log_interval / (time.time() - interval_start)
            print('Step #%d\tLoss: %.6f (%d steps/sec)' %
                  (batch, loss_value, rate))
            interval_start = time.time()
def train_and_predict(postfix, bsize, eps, lrate, imgs_train, imgs_mask_train,
                      weights=None):
    """Train the U-Net on this rank's data and keep the best weights.

    All ranks train with the DDL callbacks, learning-rate reduction and early
    stopping. Rank 0 additionally trains verbosely, checkpoints the best
    model (by ``val_loss``) to a temporary ``.h5`` file and, after training,
    either renames that file to ``weights_<postfix>_<score>.h5`` or deletes
    it when the best ``val_jaccard_index`` is not above 0.25.

    Args:
        postfix: tag used in the temporary and final weight file names.
        bsize: batch size passed to ``model.fit``.
        eps: number of epochs.
        lrate: learning rate passed to ``get_unet``; also scales ``min_lr``.
        imgs_train: training inputs.
        imgs_mask_train: training targets.
        weights: optional path of weights to load before training.
    """
    is_root = ddl.rank() == 0
    if is_root:
        print('Running with postfix:', postfix, 'batch_size:', bsize,
              'epochs:', eps, 'lr:', lrate, 'weights:', weights)

    # Drawn on every rank (as before) so the random state advances the same
    # way everywhere; only rank 0 actually writes the file.
    tempH5file = postfix + '_' + str(random.randint(1, 1000000)) + '.h5'
    model = get_unet(lrate)
    model_checkpoint = ModelCheckpoint(tempH5file,
                                       monitor='val_loss',
                                       save_best_only=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.5,
                                  patience=2,
                                  min_lr=lrate * 0.0001)
    early_stopping_monitor = EarlyStopping(patience=5, min_delta=0.0001)

    # Load previous weights to continue training.
    if weights:
        model.load_weights(weights)

    # Keras expects callback *instances*; DDLCallback was previously passed
    # as the bare class, unlike DDLGlobalVariablesCallback().
    callbacks = [ddl.DDLCallback()]
    if is_root:
        # Only rank 0 checkpoints the model.
        callbacks.append(model_checkpoint)
    callbacks.extend([
        reduce_lr,
        early_stopping_monitor,
        ddl.DDLGlobalVariablesCallback(),
    ])

    train_history = model.fit(imgs_train,
                              imgs_mask_train,
                              batch_size=bsize,
                              epochs=eps,
                              verbose=1 if is_root else 0,
                              shuffle=True,
                              validation_split=0.1,
                              callbacks=callbacks)

    if not is_root:
        return

    score = np.max(train_history.history['val_jaccard_index'])
    # Throw away ridiculously low scores.
    if score > 0.25:
        score_str = str(score)[:8].replace('.', '_')
        weightsFile = 'weights_' + postfix + '_' + score_str + '.h5'
        shutil.move(tempH5file, weightsFile)
    else:
        os.remove(tempH5file)
def main(_):
    """Train the MNIST network from a sharded tf.data input pipeline.

    The training set is sharded by ``ddl.rank()``, batched, cached, shuffled
    and repeated. Loss and accuracy on the current batch are printed every
    ``display_step`` steps until ``step * batch_size`` reaches
    ``FLAGS.num_iterations``.
    """
    # Training parameters
    learning_rate = 0.001
    training_iters = FLAGS.num_iterations
    batch_size = 100
    display_step = 1

    # Network parameters
    n_input = 784  # MNIST data input (img shape: 28*28)
    n_classes = 10  # MNIST total classes (0-9 digits)
    dropout = 0.75  # Dropout, probability to keep units

    # ------------------------------------------------------------------
    # Import MNIST data
    # ------------------------------------------------------------------
    data_dir = FLAGS.data_dir + str(ddl.local_rank())
    train_set, num_of_train_imgs = dataset.train(data_dir, (28, 28, 1),
                                                 VARTYPE)
    train_set = train_set.shard(ddl.size(), ddl.rank())
    train_set = train_set.batch(batch_size)
    train_set = train_set.cache()
    train_set = train_set.shuffle(buffer_size=1000)
    train_set = train_set.repeat()
    X_train, Y_train = train_set.make_one_shot_iterator().get_next()

    # Construct model
    pred, keep_prob = deepnn(X_train)

    # Loss and optimizer
    with tf.name_scope('loss'):
        per_example_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=Y_train, logits=pred)
        cost = tf.reduce_mean(per_example_loss)
    with tf.name_scope('adam_optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                           epsilon=1e-4)
        objective = optimizer.minimize(cost)

    # Accuracy on the current batch
    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(tf.argmax(pred, 1),
                                      tf.argmax(Y_train, 1))
        correct_prediction = tf.cast(correct_prediction, VARTYPE)
        accuracy = tf.reduce_mean(correct_prediction)

    # Dump the graph into a fresh temporary directory.
    graph_location = tempfile.mkdtemp()
    print('Saving graph to: %s' % graph_location)
    train_writer = tf.summary.FileWriter(graph_location)
    train_writer.add_graph(tf.get_default_graph())

    # Launch the graph
    with tf.Session(config=tf.ConfigProto()) as sess:
        sess.run(tf.global_variables_initializer())

        # An extra variable created (and initialized) after the global init.
        my_variable = bias_variable([5, 5, 1, 32])
        sess.run(my_variable.initializer)

        step = 1
        # Keep training until reach max iterations
        while step * batch_size < training_iters:
            # Run optimization op (backprop)
            sess.run(objective)
            if step % display_step == 0:
                # Calculate batch loss and accuracy
                loss, acc = sess.run([cost, accuracy])
                print("".join([
                    "DDL ", str(ddl.rank()),
                    "] Iter ", str(step * batch_size),
                    ", Minibatch Loss= ", "{:.6f}".format(loss),
                    ", Training Accuracy= ", "{:.5f}".format(acc),
                ]))
            step += 1

        print("DDL " + str(ddl.rank()) + "] Optimization Finished!")

        # Accuracy is evaluated on the next batch from the same iterator.
        final_accuracy = sess.run(accuracy)
        print("DDL " + str(ddl.rank()) + "] Testing Accuracy:",
              final_accuracy)
import tensorflow as tf
# Eager execution is switched on (when requested on the command line)
# before ddl is imported below.
# NOTE(review): presumably ddl must see the eager setting at import time --
# confirm before reordering these statements.
if len(sys.argv) > 1 and sys.argv[1] == '--eager':
    tf.enable_eager_execution()

import ddl
import dataset

# Training configuration.
batch_size = 128
num_classes = 10
epochs = 12

# input image dimensions
img_rows, img_cols = 28, 28

# data_dir: suffixed with the DDL rank, so each rank uses its own directory.
data_dir = "/tmp/mnist_convnet_model_data" + str(ddl.rank())

# Put the channel axis where the Keras backend's image data format expects it.
input_shape = ()
if K.image_data_format() == 'channels_first':
    input_shape = (1, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 1)

# the data, split between train and test sets
(train_set, num_of_train_imgs) = dataset.train(data_dir, input_shape)
# Each rank keeps only its own shard of the training set, then caches,
# shuffles, batches and repeats it.
train_set = train_set.shard(ddl.size(), ddl.rank())
train_set = train_set.cache().shuffle(
    buffer_size=1000).batch(batch_size).repeat()

# The evaluation set is not sharded here.
(eval_set, num_of_test_imgs) = dataset.test(data_dir, input_shape)
eval_full = eval_set
def main():
    """Train the MNIST network with DDL, exporting graph and checkpoints.

    The base graph is exported as a meta graph before ``ddl`` is imported.
    Each step draws ``batch_size * ddl.size()`` examples and keeps the slice
    belonging to this rank. The process whose ``OMPI_COMM_WORLD_RANK`` is
    ``"0"`` writes the meta graph and a checkpoint every 10 steps.
    """
    # ------------------------------------------------------------------
    # Import MNIST data
    # ------------------------------------------------------------------
    mnist = input_data.read_data_sets(training_data_dir)

    # Training parameters
    learning_rate = 0.001
    training_iters = 2500
    batch_size = 100
    display_step = 1

    # Network parameters
    n_input = 784  # MNIST data input (img shape: 28*28)
    n_classes = 10  # MNIST total classes (0-9 digits)
    dropout = 0.75  # Dropout, probability to keep units

    # tf Graph input
    x = tf.placeholder(tf.float32, [None, n_input], name="x")

    # Construct model
    keep_prob = tf.placeholder_with_default(1.0, shape=(), name="keepprob")
    pred = deepnn(x, 1.0)
    pRes = tf.identity(pred, name="pRes")

    if os.getenv("OMPI_COMM_WORLD_RANK") == "0":
        print("writing checkpoint file", chkptpath + "_basegraph.meta")
        tf.train.export_meta_graph(chkptpath + "_basegraph.meta",
                                   as_text=True)

    # Import the ddl library; this creates objects for distribution so it
    # must be done after exporting the meta graph.
    import ddl

    y = tf.placeholder(tf.int64, [None], name="y")

    # Loss and optimizer
    with tf.name_scope('loss'):
        cost = tf.reduce_mean(
            tf.losses.sparse_softmax_cross_entropy(labels=y, logits=pred))
    with tf.name_scope('adam_optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        objective = optimizer.minimize(cost)

    predictor = tf.argmax(pred, 1, name="predictor")

    # Accuracy
    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(predictor, y)
        correct_prediction = tf.cast(correct_prediction, tf.float32)
        accuracy = tf.reduce_mean(correct_prediction)

    saver = tf.train.Saver()

    # Launch the graph
    with tf.Session(config=tf.ConfigProto()) as sess:
        sess.run(tf.global_variables_initializer())
        step = 1
        # Keep training until reach max iterations
        while step * batch_size < training_iters:
            # Use ddl.rank() and ddl.size() to load data: fetch one batch per
            # rank, then keep only this rank's partition.
            batch_x, batch_y = mnist.train.next_batch(batch_size * ddl.size())
            batch_x = np.split(batch_x, ddl.size())[ddl.rank()]
            batch_y = np.split(batch_y, ddl.size())[ddl.rank()]

            # Run optimization op (backprop)
            sess.run(objective, feed_dict={x: batch_x, y: batch_y})

            if step % display_step == 0:
                # Calculate batch loss and accuracy
                loss, acc = sess.run([cost, accuracy],
                                     feed_dict={x: batch_x, y: batch_y})
                print("".join([
                    "DDL ", str(ddl.rank()),
                    "] Iter ", str(step * batch_size),
                    ", Minibatch Loss= ", "{:.6f}".format(loss),
                    ", Training Accuracy= ", "{:.5f}".format(acc),
                ]))
            step += 1

            # Periodic checkpoint, written by one process only.
            is_writer = os.getenv("OMPI_COMM_WORLD_RANK") == "0"
            if is_writer and step % 10 == 0 and step != 0:
                saver.save(sess, chkptpath, global_step=step)
                print('[%d] save checkpoint' % step + " path: " + chkptpath)

        print("DDL " + str(ddl.rank()) + "] Optimization Finished!")

        # Calculate accuracy for 256 mnist test images
        test_feed = {x: mnist.test.images[:256], y: mnist.test.labels[:256]}
        print("DDL " + str(ddl.rank()) + "] Testing Accuracy:",
              sess.run(accuracy, feed_dict=test_feed))
def main(_):
    """Train the MNIST network using explicit DDL calls.

    DDL is initialized with ``FLAGS.ddl_options`` rather than through
    DDL_OPTIONS, and gradients are reduced across ranks with
    ``ddl.grads_reduce`` before being applied.
    """
    # Note: Not using DDL_OPTIONS; doing explicit DDL calls!
    # Explicit initialization call:
    ddl.init(FLAGS.ddl_options)

    # Training parameters
    learning_rate = 0.001
    training_iters = FLAGS.num_iterations
    batch_size = 100
    display_step = 1

    # Network parameters
    n_input = 784  # MNIST data input (img shape: 28*28)
    n_classes = 10  # MNIST total classes (0-9 digits)
    dropout = 0.75  # Dropout, probability to keep units

    # ------------------------------------------------------------------
    # Import MNIST data
    # ------------------------------------------------------------------
    data_dir = FLAGS.data_dir + str(ddl.local_rank())
    train_set, num_of_train_imgs = dataset.train(data_dir, (28, 28, 1))
    train_set = train_set.shard(ddl.size(), ddl.rank())
    train_set = train_set.batch(batch_size)
    train_set = train_set.cache()
    train_set = train_set.shuffle(buffer_size=1000)
    train_set = train_set.repeat()
    X_train, Y_train = train_set.make_one_shot_iterator().get_next()

    # Construct model
    pred, keep_prob = deepnn(X_train)

    # Loss and optimizer
    with tf.name_scope('loss'):
        per_example_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=Y_train, logits=pred)
        cost = tf.reduce_mean(per_example_loss)
    with tf.name_scope('adam_optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        grads_and_vars = optimizer.compute_gradients(cost)

        # Obtain learnable variables and their gradients across the cluster
        # nodes and do reduce_scatter by making an explicit DDL reduce call.
        # Note: all zipping is hidden
        grads_and_vars = ddl.grads_reduce(grads_and_vars, average=True)
        objective = optimizer.apply_gradients(grads_and_vars)

    # Accuracy on the current batch
    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(tf.argmax(pred, 1),
                                      tf.argmax(Y_train, 1))
        correct_prediction = tf.cast(correct_prediction, tf.float32)
        accuracy = tf.reduce_mean(correct_prediction)

    # Dump the graph into a fresh temporary directory.
    graph_location = tempfile.mkdtemp()
    print('Saving graph to: %s' % graph_location)
    train_writer = tf.summary.FileWriter(graph_location)
    train_writer.add_graph(tf.get_default_graph())

    # Launch the graph
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        step = 1
        # Keep training until reach max iterations
        while step * batch_size < training_iters:
            # Run optimization op (backprop)
            sess.run(objective)
            if step % display_step == 0:
                # Calculate batch loss and accuracy
                loss, acc = sess.run([cost, accuracy])
                print("".join([
                    "DDL ", str(ddl.rank()),
                    "] Iter ", str(step * batch_size),
                    ", Minibatch Loss= ", "{:.6f}".format(loss),
                    ", Training Accuracy= ", "{:.5f}".format(acc),
                ]))
            step += 1

        print("DDL " + str(ddl.rank()) + "] Optimization Finished!")

        # Accuracy is evaluated on the next batch from the same iterator.
        final_accuracy = sess.run(accuracy)
        print("DDL " + str(ddl.rank()) + "] Testing Accuracy:",
              final_accuracy)
# Register the DDL callbacks (both are instantiated before being appended).
callbacks.append(ddl.DDLCallback())
callbacks.append(ddl.DDLGlobalVariablesCallback())

# Normalize into 0~1 range
# Each set is divided in place by its own maximum value.
x_train /= x_train.max()
x_test /= x_test.max()
y_train_binary = to_categorical(
    y_train
)  # For categorical crossentropy loss, we need to binarize multi-class labels
y_test_binary = to_categorical(
    y_test
)  # For categorical crossentropy loss, we need to binarize multi-class labels

# Split the training data into ddl.size() batches for distributed training.
# array_split tolerates sizes that do not divide evenly; this rank keeps the
# piece at index ddl.rank() of the inputs, raw labels and one-hot labels.
x_train_dist = np.array_split(x_train, ddl.size())[ddl.rank()]
y_train_dist = np.array_split(y_train, ddl.size())[ddl.rank()]
y_train_dist_binary = np.array_split(y_train_binary, ddl.size())[ddl.rank()]

''' Training step one. Train for NN '''
if model_type == 'triplet' or model_type == 'contrastive':
    # Only the first element of build_nn's result is kept as the model; the
    # input size is taken from this rank's training shard.
    model = build_nn([568, 256, 100],
                     x_train_dist.shape[1],
                     l1_reg=l1_reg,
                     l2_reg=l2_reg,
                     activation_func='tanh')[0]
    # Set initial weights as DAE trained weights (skip dropout and batchnorm layers)
    # for layer,weight in zip(model.layers[1:8:3],pretrain_weights):
    #     layer.set_weights(weight)
def train(hparams, scope=None, target_session=""):
    """Train a translation model.

    Builds separate train, eval and infer models, then runs the training
    loop until ``hparams.num_train_steps`` is reached. Every rank runs the
    training steps and writes its own log file and summary directory (both
    names include ``ddl.rank()``); checkpointing and evaluation are gated on
    ``ddl.rank() == 0``.

    Args:
        hparams: hyper-parameters; ``epoch_step`` is updated in place.
        scope: forwarded to the model creation helpers.
        target_session: session target used for all three sessions.

    No value is returned.
    """
    log_device_placement = hparams.log_device_placement
    out_dir = hparams.out_dir
    num_train_steps = hparams.num_train_steps
    steps_per_stats = hparams.steps_per_stats
    steps_per_external_eval = hparams.steps_per_external_eval
    # Internal evaluation runs every 10 stats intervals.
    steps_per_eval = 10 * steps_per_stats
    avg_ckpts = hparams.avg_ckpts

    # Default: external evaluation every 5 internal evaluations.
    if not steps_per_external_eval:
        steps_per_external_eval = 5 * steps_per_eval

    # Pick the model class from the attention settings.
    if not hparams.attention:
        model_creator = nmt_model.Model
    else:  # Attention
        if (hparams.encoder_type == "gnmt" or
                hparams.attention_architecture in ["gnmt", "gnmt_v2"]):
            model_creator = gnmt_model.GNMTModel
        elif hparams.attention_architecture == "standard":
            model_creator = attention_model.AttentionModel
        else:
            raise ValueError("Unknown attention architecture %s" %
                             hparams.attention_architecture)

    utils.print_out("Detected %d ranks, the current rank is %d " %
                    (ddl.size(), ddl.rank()))
    # The train model is created with the DDL world size and rank as
    # num_workers / jobid.
    train_model = model_helper.create_train_model(model_creator,
                                                  hparams,
                                                  scope,
                                                  num_workers=ddl.size(),
                                                  jobid=ddl.rank())
    # disable_bcast() is called after the train model and before the
    # eval/infer models are created.
    # NOTE(review): presumably so only training variables are broadcast --
    # confirm against the DDL documentation.
    ddl.disable_bcast()
    eval_model = model_helper.create_eval_model(model_creator, hparams, scope)
    infer_model = model_helper.create_infer_model(model_creator, hparams,
                                                  scope)

    # Preload data for sample decoding.
    dev_src_file = "%s.%s" % (hparams.dev_prefix, hparams.src)
    dev_tgt_file = "%s.%s" % (hparams.dev_prefix, hparams.tgt)
    sample_src_data = inference.load_data(dev_src_file)
    sample_tgt_data = inference.load_data(dev_tgt_file)

    summary_name = "train_log_rank_%d" % ddl.rank()
    model_dir = hparams.out_dir

    # Log and output files
    log_file = os.path.join(out_dir,
                            "log_%d_rank_%d" % (time.time(), ddl.rank()))
    log_f = tf.gfile.GFile(log_file, mode="a")
    utils.print_out("# log_file=%s" % log_file, log_f)

    # TensorFlow model
    config_proto = utils.get_config_proto(
        log_device_placement=log_device_placement,
        num_intra_threads=hparams.num_intra_threads,
        num_inter_threads=hparams.num_inter_threads)
    # One session per graph.
    train_sess = tf.Session(
        target=target_session, config=config_proto, graph=train_model.graph)
    eval_sess = tf.Session(
        target=target_session, config=config_proto, graph=eval_model.graph)
    infer_sess = tf.Session(
        target=target_session, config=config_proto, graph=infer_model.graph)

    with train_model.graph.as_default():
        loaded_train_model, global_step = model_helper.create_or_load_model(
            train_model.model, model_dir, train_sess, "train")

    # Summary writer
    summary_writer = tf.summary.FileWriter(
        os.path.join(out_dir, summary_name), train_model.graph)

    #GJ18: do all evaluations on a single GPU!
    # First evaluation
    if ddl.rank() == 0:
        run_full_eval(
            model_dir, infer_model, infer_sess, eval_model, eval_sess,
            hparams, summary_writer, sample_src_data, sample_tgt_data,
            avg_ckpts)

    # Steps at which stats / eval / external eval were last performed.
    last_stats_step = global_step
    last_eval_step = global_step
    last_external_eval_step = global_step

    # This is the training loop.
    stats, info, start_train_time = before_train(
        loaded_train_model, train_model, train_sess, global_step, hparams,
        log_f)
    while global_step < num_train_steps:
        ### Run a step ###
        start_time = time.time()
        try:
            step_result = loaded_train_model.train(train_sess)
            hparams.epoch_step += 1
        except tf.errors.OutOfRangeError:
            # Finished going through the training dataset. Go to next epoch.
            hparams.epoch_step = 0
            if ddl.rank() == 0:
                utils.print_out(
                    "# Finished an epoch, step %d. Perform external evaluation"
                    % global_step)
                run_sample_decode(infer_model, infer_sess, model_dir, hparams,
                                  summary_writer, sample_src_data,
                                  sample_tgt_data)
                run_external_eval(infer_model, infer_sess, model_dir, hparams,
                                  summary_writer)
                if avg_ckpts:
                    run_avg_external_eval(infer_model, infer_sess, model_dir,
                                          hparams, summary_writer, global_step)
            # Restart the training iterator from the beginning (skip count 0)
            # and retry the step.
            train_sess.run(
                train_model.iterator.initializer,
                feed_dict={train_model.skip_count_placeholder: 0})
            continue

        # Process step_result, accumulate stats, and write summary
        global_step, info["learning_rate"], step_summary = update_stats(
            stats, start_time, step_result)
        summary_writer.add_summary(step_summary, global_step)

        # Once in a while, we print statistics.
        if global_step - last_stats_step >= steps_per_stats:
            last_stats_step = global_step
            is_overflow = process_stats(
                stats, info, global_step, steps_per_stats, log_f)
            print_step_info(" ", global_step, info,
                            _get_best_results(hparams), log_f)
            # Leave the training loop when process_stats reports an overflow.
            if is_overflow:
                break

            # Reset statistics
            stats = init_stats()

        if global_step - last_eval_step >= steps_per_eval:
            last_eval_step = global_step
            utils.print_out("# Save eval, global step %d" % global_step)
            utils.add_summary(summary_writer, global_step, "train_ppl",
                              info["train_ppl"])

            if ddl.rank() == 0:
                # Save checkpoint
                loaded_train_model.saver.save(
                    train_sess,
                    os.path.join(out_dir, "translate.ckpt"),
                    global_step=global_step)

                # Evaluate on dev/test
                run_sample_decode(infer_model, infer_sess, model_dir, hparams,
                                  summary_writer, sample_src_data,
                                  sample_tgt_data)
                run_internal_eval(
                    eval_model, eval_sess, model_dir, hparams, summary_writer)

        if global_step - last_external_eval_step >= steps_per_external_eval:
            last_external_eval_step = global_step

            # Save checkpoint
            if ddl.rank() == 0:
                loaded_train_model.saver.save(
                    train_sess,
                    os.path.join(out_dir, "translate.ckpt"),
                    global_step=global_step)
                run_sample_decode(infer_model, infer_sess, model_dir, hparams,
                                  summary_writer, sample_src_data,
                                  sample_tgt_data)
                run_external_eval(
                    infer_model, infer_sess, model_dir, hparams,
                    summary_writer)

                if avg_ckpts:
                    run_avg_external_eval(infer_model, infer_sess, model_dir,
                                          hparams, summary_writer, global_step)

    # Done training
    if ddl.rank() == 0:
        loaded_train_model.saver.save(
            train_sess,
            os.path.join(out_dir, "translate.ckpt"),
            global_step=global_step)

        (result_summary, _, final_eval_metrics) = (
            run_full_eval(
                model_dir, infer_model, infer_sess, eval_model, eval_sess,
                hparams, summary_writer, sample_src_data, sample_tgt_data,
                avg_ckpts))
        print_step_info("# Final, ", global_step, info, result_summary, log_f)
        utils.print_time("# Done training!", start_train_time)

    summary_writer.close()

    if ddl.rank() == 0:
        utils.print_out("# Start evaluating saved best models.")
        for metric in hparams.metrics:
            # Re-evaluate the checkpoint stored in this metric's best
            # directory, with a summary writer of its own.
            best_model_dir = getattr(hparams, "best_" + metric + "_dir")
            summary_writer = tf.summary.FileWriter(
                os.path.join(best_model_dir, summary_name), infer_model.graph)
            result_summary, best_global_step, _ = run_full_eval(
                best_model_dir, infer_model, infer_sess, eval_model,
                eval_sess, hparams, summary_writer, sample_src_data,
                sample_tgt_data)
            print_step_info("# Best %s, " % metric, best_global_step, info,
                            result_summary, log_f)
            summary_writer.close()

            if avg_ckpts:
                # Same again for the averaged-checkpoint best directory.
                best_model_dir = getattr(hparams,
                                         "avg_best_" + metric + "_dir")
                summary_writer = tf.summary.FileWriter(
                    os.path.join(best_model_dir, summary_name),
                    infer_model.graph)
                result_summary, best_global_step, _ = run_full_eval(
                    best_model_dir, infer_model, infer_sess, eval_model,
                    eval_sess, hparams, summary_writer, sample_src_data,
                    sample_tgt_data)
                print_step_info("# Averaged Best %s, " % metric,
                                best_global_step, info, result_summary, log_f)
                summary_writer.close()
def main(_):
    """Train the MNIST network with DDL and export it as a SavedModel.

    Each step draws ``batch_size * ddl.size()`` examples and keeps the slice
    belonging to this rank. After training, accuracy on the first 256 test
    images is printed and rank 0 writes a SavedModel with a classification
    signature to ``model_path``.
    """
    # ------------------------------------------------------------------
    # Import MNIST data
    # ------------------------------------------------------------------
    mnist = input_data.read_data_sets(training_data_dir)

    # Training parameters
    learning_rate = 0.001
    training_iters = 2000
    batch_size = 100
    display_step = 1

    # Network parameters
    n_input = 784  # MNIST data input (img shape: 28*28)
    n_classes = 10  # MNIST total classes (0-9 digits)
    dropout = 0.75  # Dropout, probability to keep units

    # tf Graph input
    x = tf.placeholder(tf.float32, [None, n_input])
    y = tf.placeholder(tf.int64, [None])

    # Construct model
    pred, keep_prob = deepnn(x)

    # Loss and optimizer
    with tf.name_scope('loss'):
        cost = tf.reduce_mean(
            tf.losses.sparse_softmax_cross_entropy(labels=y, logits=pred))
    with tf.name_scope('adam_optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        objective = optimizer.minimize(cost)

    predictor = tf.argmax(pred, 1, name="predictor")

    # Accuracy
    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(predictor, y)
        correct_prediction = tf.cast(correct_prediction, tf.float32)
        accuracy = tf.reduce_mean(correct_prediction)

    # Dump the graph into a fresh temporary directory.
    graph_location = tempfile.mkdtemp()
    print('Saving graph to: %s' % graph_location)
    train_writer = tf.summary.FileWriter(graph_location)
    train_writer.add_graph(tf.get_default_graph())

    # Launch the graph
    with tf.Session(config=tf.ConfigProto()) as sess:
        sess.run(tf.global_variables_initializer())
        step = 1
        # Keep training until reach max iterations
        while step * batch_size < training_iters:
            # Use ddl.rank() and ddl.size() to load data: fetch one batch per
            # rank, then keep only this rank's partition.
            batch_x, batch_y = mnist.train.next_batch(batch_size * ddl.size())
            batch_x = np.split(batch_x, ddl.size())[ddl.rank()]
            batch_y = np.split(batch_y, ddl.size())[ddl.rank()]

            # Run optimization op (backprop) with dropout enabled.
            train_feed = {x: batch_x, y: batch_y, keep_prob: dropout}
            sess.run(objective, feed_dict=train_feed)

            if step % display_step == 0:
                # Calculate batch loss and accuracy with dropout disabled.
                eval_feed = {x: batch_x, y: batch_y, keep_prob: 1.}
                loss, acc = sess.run([cost, accuracy], feed_dict=eval_feed)
                print("".join([
                    "DDL ", str(ddl.rank()),
                    "] Iter ", str(step * batch_size),
                    ", Minibatch Loss= ", "{:.6f}".format(loss),
                    ", Training Accuracy= ", "{:.5f}".format(acc),
                ]))
            step += 1

        print("DDL " + str(ddl.rank()) + "] Optimization Finished!")

        # Classification signature mapping the input placeholder to the
        # predicted class.
        classification_inputs = tf.saved_model.utils.build_tensor_info(x)
        classification_outputs_classes = (
            tf.saved_model.utils.build_tensor_info(predictor))
        signature_constants = tf.saved_model.signature_constants
        classification_signature = (
            tf.saved_model.signature_def_utils.build_signature_def(
                inputs={
                    signature_constants.CLASSIFY_INPUTS:
                        classification_inputs
                },
                outputs={
                    signature_constants.CLASSIFY_OUTPUT_CLASSES:
                        classification_outputs_classes
                },
                method_name=signature_constants.CLASSIFY_METHOD_NAME))
        print("classification_signature content:")
        print(classification_signature)

        # Calculate accuracy for 256 mnist test images
        test_feed = {
            x: mnist.test.images[:256],
            y: mnist.test.labels[:256],
            keep_prob: 1.
        }
        print("DDL " + str(ddl.rank()) + "] Testing Accuracy:",
              sess.run(accuracy, feed_dict=test_feed))

        # Only rank 0 exports the SavedModel.
        if ddl.rank() == 0:
            builder = tf.saved_model.builder.SavedModelBuilder(model_path)
            legacy_init_op = tf.group(tf.tables_initializer(),
                                      name='legacy_init_op')
            builder.add_meta_graph_and_variables(
                sess, [tf.saved_model.tag_constants.SERVING],
                signature_def_map={
                    'predict_images': classification_signature,
                },
                legacy_init_op=legacy_init_op)
            save_path = str(builder.save())
            print("Model saved in file: %s" % save_path)