예제 #1
0
def load_train_data():
    """Load the training images and masks belonging to the current DDL rank.

    Both arrays are read from ``ROOT``; the file names end with the value of
    ``ddl.rank()`` followed by ``.npy``.

    Returns:
        A ``(imgs_train, imgs_mask_train)`` tuple as returned by ``np.load``.
    """

    def _rank_file(prefix):
        # Build "<ROOT>/<prefix><rank>.npy" for the calling rank.
        return os.path.join(ROOT, prefix + str(ddl.rank()) + '.npy')

    imgs_train = np.load(_rank_file('img_20191216_aug_'))
    imgs_mask_train = np.load(_rank_file('labelOnehot_20191216_aug_binary_'))
    return imgs_train, imgs_mask_train
예제 #2
0
def run_mnist_eager():
    """Run the MNIST training and evaluation loop in eager mode.

    Every DDL rank uses its own data and checkpoint directory (the rank is
    appended to the path), trains on its shard of the training set and
    evaluates on the full, unsharded test set. A checkpoint is written after
    every epoch.

    ``batch_size`` and ``train_epochs`` are not defined here; they are read
    from the enclosing (module) scope.
    """

    # Per-rank directories: the value of ddl.rank() is part of both paths.
    data_dir = '/tmp/tensorflow/mnist/input_data' + str(ddl.rank())
    model_dir = '/tmp/tensorflow/mnist/checkpoints/' + str(ddl.rank()) + '/'

    # Delete model dir
    # NOTE(review): model_dir is keyed by ddl.rank() but the removal is guarded
    # by ddl.local_rank() == 0, so ranks with another local rank keep (and
    # later restore from) an existing checkpoint directory - confirm intended.
    if os.path.isdir(model_dir) and ddl.local_rank() == 0:
        shutil.rmtree(model_dir)

    data_format = 'channels_first'

    # Load the datasets
    # Only the training set is sharded across ranks (ddl.size() shards, this
    # rank takes shard ddl.rank()); the test set is batched as a whole.
    train_ds, _ = mnist_dataset.train(data_dir, (1, 28, 28), label_int=True)
    train_ds = train_ds.shard(ddl.size(),
                              ddl.rank()).shuffle(60000).batch(batch_size)
    test_ds, _ = mnist_dataset.test(data_dir, (1, 28, 28), label_int=True)
    test_ds = test_ds.batch(batch_size)

    # Create the model and optimizer
    model = create_model(data_format)
    optimizer = tf.train.MomentumOptimizer(0.01, 0.5)

    # Both summary directories are None.
    # NOTE(review): presumably this makes the writers no-ops so that no
    # summaries reach disk - confirm against the create_file_writer docs.
    train_dir = None
    test_dir = None
    summary_writer = tf.contrib.summary.create_file_writer(train_dir,
                                                           flush_millis=10000)
    test_summary_writer = tf.contrib.summary.create_file_writer(
        test_dir, flush_millis=10000, name='test')

    # Create and restore checkpoint (if one exists on the path)
    checkpoint_prefix = os.path.join(model_dir, 'ckpt-r' + str(ddl.rank()))
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tf.train.Checkpoint(model=model,
                                     optimizer=optimizer,
                                     step_counter=step_counter)
    # Restore variables on creation if a checkpoint exists.
    checkpoint.restore(tf.train.latest_checkpoint(model_dir))

    # Train and evaluate for a set number of epochs.
    for _ in range(train_epochs):
        start = time.time()
        with summary_writer.as_default():
            # 10 is the log_interval passed to train().
            train(model, optimizer, train_ds, step_counter, 10)
        end = time.time()
        # Only rank 0 reports the epoch timing.
        if ddl.rank() == 0:
            print('\nTrain time for epoch #%d (%d total steps): %f' %
                  (checkpoint.save_counter.numpy() + 1, step_counter.numpy(),
                   end - start))
        with test_summary_writer.as_default():
            test(model, test_ds)
        # Every rank saves into its own model_dir.
        checkpoint.save(checkpoint_prefix)
예제 #3
0
def run_main(flags,
             default_hparams,
             train_fn,
             inference_fn,
             target_session=""):
    """Seed the RNGs, load hparams, then either run inference or training.

    Args:
        flags: parsed command-line flags; the attributes read here are
            ``num_workers``, ``random_seed``, ``out_dir``, ``hparams_path``,
            ``inference_input_file``, ``inference_list``,
            ``inference_output_file``, ``ckpt`` and ``inference_ref_file``.
        default_hparams: hparams handed to ``create_or_load_hparams``.
        train_fn: called as ``train_fn(hparams, target_session=...)`` when no
            inference input file is given.
        inference_fn: called with the checkpoint, input file, output file,
            hparams, worker count and job id when an inference input file is
            given.
        target_session: forwarded unchanged to ``train_fn``.
    """
    # The DDL rank doubles as the job id so that data partitioning is
    # enforced per rank (flags.jobid is deliberately not used).
    rank = ddl.rank()
    worker_count = flags.num_workers  # must be 1 for ddl
    utils.print_out("# Job id %d" % rank)

    # Seed every RNG with a rank-dependent offset.
    seed = flags.random_seed
    if seed is not None and seed > 0:
        utils.print_out("# Set random seed to %d" % seed)
        rank_seed = seed + rank
        random.seed(rank_seed)
        np.random.seed(rank_seed)
        tf.set_random_seed(rank_seed)

    # Make sure the output directory exists.
    output_dir = flags.out_dir
    if not tf.gfile.Exists(output_dir):
        tf.gfile.MakeDirs(output_dir)

    # Only job 0 is asked to save the hparams.
    hparams = create_or_load_hparams(output_dir,
                                     default_hparams,
                                     flags.hparams_path,
                                     save_hparams=(rank == 0))

    # Training path: nothing else to do afterwards.
    if not flags.inference_input_file:
        train_fn(hparams, target_session=target_session)
        return

    # Optional subset of sentence indices to decode.
    hparams.inference_indices = None
    if flags.inference_list:
        hparams.inference_indices = [
            int(tok) for tok in flags.inference_list.split(",")
        ]

    # Inference: fall back to the latest checkpoint in the output directory.
    translation_file = flags.inference_output_file
    checkpoint_path = flags.ckpt or tf.train.latest_checkpoint(output_dir)
    inference_fn(checkpoint_path, flags.inference_input_file,
                 translation_file, hparams, worker_count, rank)

    # Evaluation needs both a reference file and the produced translations.
    reference_file = flags.inference_ref_file
    if not (reference_file and tf.gfile.Exists(translation_file)):
        return
    for metric in hparams.metrics:
        score = evaluation_utils.evaluate(reference_file, translation_file,
                                          metric, hparams.subword_option)
        utils.print_out("  %s: %.1f" % (metric, score))
예제 #4
0
def _external_eval(model, global_step, sess, hparams, iterator,
                   iterator_feed_dict, tgt_file, label, summary_writer,
                   save_on_best, avg_ckpts=False):
  """Decode with `model` and score the output with the external metrics.

  Decoding only happens when `global_step` is positive. For every metric in
  `hparams.metrics` a summary is written; when `save_on_best` is set and the
  score beats the best value stored on `hparams`, that value is updated and
  rank 0 saves the model into the matching "best" directory.

  Returns:
    The scores returned by `nmt_utils.decode_and_evaluate`.
  """
  out_dir = hparams.out_dir
  should_decode = global_step > 0

  # Results of averaged checkpoints are tracked under their own label.
  if avg_ckpts:
    label = "avg_" + label

  if should_decode:
    utils.print_out("# External evaluation, global step %d" % global_step)

  sess.run(iterator.initializer, feed_dict=iterator_feed_dict)

  output_path = os.path.join(out_dir, "output_%s" % label)
  scores = nmt_utils.decode_and_evaluate(
      label,
      model,
      sess,
      output_path,
      ref_file=tgt_file,
      metrics=hparams.metrics,
      subword_option=hparams.subword_option,
      beam_width=hparams.beam_width,
      tgt_eos=hparams.eos,
      decode=should_decode)

  # Nothing was decoded, so there is nothing to record or save.
  if not should_decode:
    return scores

  best_prefix = "avg_best_" if avg_ckpts else "best_"
  for metric in hparams.metrics:
    best_attr = best_prefix + metric
    metric_score = scores[metric]

    utils.add_summary(summary_writer, global_step, "%s_%s" % (label, metric),
                      metric_score)

    # Larger is better: remember a new best score on every rank, but only
    # rank 0 writes the checkpoint.
    if save_on_best and metric_score > getattr(hparams, best_attr):
      setattr(hparams, best_attr, metric_score)
      if ddl.rank() == 0:
        best_dir = getattr(hparams, best_attr + "_dir")
        model.saver.save(
            sess,
            os.path.join(best_dir, "translate.ckpt"),
            global_step=model.global_step)

  utils.save_hparams(out_dir, hparams)
  return scores
예제 #5
0
def test(model, dataset):
    """Evaluate `model` on every `(images, labels)` batch of `dataset`.

    The mean loss and the accuracy are accumulated over all batches, printed
    by rank 0 only, and recorded as summaries on every rank.
    """
    loss_metric = tfe.metrics.Mean('loss', dtype=tf.float32)
    accuracy_metric = tfe.metrics.Accuracy('accuracy', dtype=tf.float32)

    for images, labels in dataset:
        logits = model(images, training=False)
        loss_metric(loss(logits, labels))
        predictions = tf.argmax(logits, axis=1, output_type=tf.int64)
        accuracy_metric(predictions, tf.cast(labels, tf.int64))

    # Only one rank reports to stdout.
    if ddl.rank() == 0:
        print('Test set: Average loss: %.4f, Accuracy: %4f%%\n' %
              (loss_metric.result(), 100 * accuracy_metric.result()))

    with tf.contrib.summary.always_record_summaries():
        tf.contrib.summary.scalar('loss', loss_metric.result())
        tf.contrib.summary.scalar('accuracy', accuracy_metric.result())
예제 #6
0
def train(model, optimizer, dataset, step_counter, log_interval=None):
    """Run one pass over `dataset`, updating `model` with `optimizer`.

    Args:
        model: callable as ``model(images, training=True)``; its
            ``variables`` are the ones being optimized.
        optimizer: provides ``apply_gradients``.
        dataset: iterable of ``(images, labels)`` batches.
        step_counter: global step, incremented by ``apply_gradients`` and
            used to gate summary recording (every 10 global steps).
        log_interval: when truthy, rank 0 prints the loss and throughput
            every ``log_interval`` batches.
    """
    window_start = time.time()
    for batch_idx, (images, labels) in enumerate(dataset):
        with tf.contrib.summary.record_summaries_every_n_global_steps(
                10, global_step=step_counter):
            # The tape records the forward pass so the gradient of the loss
            # with respect to the model variables can be taken afterwards.
            with tf.GradientTape() as tape:
                logits = model(images, training=True)
                loss_value = loss(logits, labels)
                tf.contrib.summary.scalar('loss', loss_value)
                tf.contrib.summary.scalar('accuracy',
                                          compute_accuracy(logits, labels))
            gradients = tape.gradient(loss_value, model.variables)
            optimizer.apply_gradients(zip(gradients, model.variables),
                                      global_step=step_counter)

            # Progress is printed on logging steps only, and only by rank 0.
            if not log_interval or batch_idx % log_interval != 0:
                continue
            if ddl.rank() != 0:
                continue
            steps_per_sec = log_interval / (time.time() - window_start)
            print('Step #%d\tLoss: %.6f (%d steps/sec)' %
                  (batch_idx, loss_value, steps_per_sec))
            window_start = time.time()
예제 #7
0
def train_and_predict(postfix,
                      bsize,
                      eps,
                      lrate,
                      imgs_train,
                      imgs_mask_train,
                      weights=None):
    """Train the U-Net on this rank's data and keep the best weights on rank 0.

    Args:
        postfix: tag used in the temporary and final weight file names.
        bsize: batch size passed to ``model.fit``.
        eps: number of epochs passed to ``model.fit``.
        lrate: learning rate handed to ``get_unet``; also scales the
            ``min_lr`` of the plateau scheduler.
        imgs_train: training images.
        imgs_mask_train: training masks.
        weights: optional path of weights to load before training.

    Rank 0 checkpoints the best model (by ``val_loss``) to a temporary file.
    After training that file is renamed to
    ``weights_<postfix>_<score>.h5`` when the best ``val_jaccard_index``
    exceeds 0.25, and deleted otherwise. Returns ``None``.
    """
    is_root = ddl.rank() == 0

    if is_root:
        print('Running with postfix:', postfix, 'batch_size:', bsize,
              'epochs:', eps, 'lr:', lrate, 'weights:', weights)

    # Random suffix so that concurrent runs do not clobber each other's file.
    tempH5file = postfix + '_' + str(random.randint(1, 1000000)) + '.h5'

    model = get_unet(lrate)
    model_checkpoint = ModelCheckpoint(tempH5file,
                                       monitor='val_loss',
                                       save_best_only=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.5,
                                  patience=2,
                                  min_lr=lrate * 0.0001)
    early_stopping_monitor = EarlyStopping(patience=5, min_delta=0.0001)

    # Load previous weights to continue training.
    if weights:
        model.load_weights(weights)

    # Keras expects callback *instances*; the DDL callback must therefore be
    # instantiated (it was previously passed as the bare class).
    callbacks = [ddl.DDLCallback()]
    if is_root:
        # Only rank 0 writes checkpoints to disk.
        callbacks.append(model_checkpoint)
    callbacks.extend([
        reduce_lr,
        early_stopping_monitor,
        ddl.DDLGlobalVariablesCallback(),
    ])

    # Same fit call on every rank; only rank 0 is verbose.
    train_history = model.fit(imgs_train,
                              imgs_mask_train,
                              batch_size=bsize,
                              epochs=eps,
                              verbose=1 if is_root else 0,
                              shuffle=True,
                              validation_split=0.1,
                              callbacks=callbacks)

    if is_root:
        score = np.max(train_history.history['val_jaccard_index'])

        # Throw away ridiculously low scores.
        if score > 0.25:
            score_str = str(score)[:8].replace('.', '_')
            weightsFile = 'weights_' + postfix + '_' + score_str + '.h5'
            shutil.move(tempH5file, weightsFile)
        else:
            os.remove(tempH5file)
예제 #8
0
def main(_):
    """Build the MNIST graph and train it on this rank's shard of the data.

    The single positional argument is ignored. The training data directory
    comes from ``FLAGS.data_dir`` and the training length from
    ``FLAGS.num_iterations``.
    """
    # Parameters
    learning_rate = 0.001
    # Compared against step * batch_size below, i.e. counted in samples.
    training_iters = FLAGS.num_iterations
    batch_size = 100
    display_step = 1

    # Network Parameters
    # NOTE(review): n_input, n_classes and dropout are not used in this
    # function.
    n_input = 784 # MNIST data input (img shape: 28*28)
    n_classes = 10 # MNIST total classes (0-9 digits)
    dropout = 0.75 # Dropout, probability to keep units

    ############################################################################
    # Import MNIST data
    ############################################################################
    # The data directory is keyed by the local rank; the dataset itself is
    # sharded by the global rank.
    data_dir = FLAGS.data_dir + str(ddl.local_rank())
    (train_set, num_of_train_imgs) = dataset.train(data_dir, (28, 28, 1), VARTYPE)
    train_set = train_set.shard(ddl.size(), ddl.rank())
    # Batching happens before cache/shuffle, so whole batches are shuffled.
    train_set = train_set.batch(batch_size).cache().shuffle(buffer_size=1000).repeat()

    X_train, Y_train = train_set.make_one_shot_iterator().get_next()

    # Construct model
    # keep_prob is returned by deepnn but never fed in this function.
    pred, keep_prob = deepnn(X_train)

    # Define loss and optimizer
    with tf.name_scope('loss'):
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y_train, logits=pred))

    with tf.name_scope('adam_optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=1e-4)
        objective = optimizer.minimize(cost)

    # Evaluate model
    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(Y_train, 1))
        correct_prediction = tf.cast(correct_prediction, VARTYPE)
        accuracy = tf.reduce_mean(correct_prediction)

    # The graph is written to a fresh temporary directory.
    graph_location = tempfile.mkdtemp()
    print('Saving graph to: %s' % graph_location)
    train_writer = tf.summary.FileWriter(graph_location)
    train_writer.add_graph(tf.get_default_graph())

    # Launch the graph
    with tf.Session(config=tf.ConfigProto()) as sess:
        sess.run(tf.global_variables_initializer())
        # This variable is created after the global initializer ran, so it is
        # initialized on its own. It is not used again in this function.
        my_variable = bias_variable([5, 5, 1, 32])
        sess.run(my_variable.initializer)
        step = 1
        # Keep training until reach max iterations
        while step * batch_size < training_iters:
            # Run optimization op (backprop)
            sess.run(objective)
            if step % display_step == 0:
                # Calculate batch loss and accuracy
                # (this run pulls another batch from the iterator)
                loss, acc = sess.run([cost, accuracy])
                print("DDL " + str(ddl.rank()) + "] Iter " + str(step * batch_size) +
                  ", Minibatch Loss= " + "{:.6f}".format(loss) +
                  ", Training Accuracy= " + "{:.5f}".format(acc))
            step += 1

        print("DDL "+str(ddl.rank())+"] Optimization Finished!")

        # The accuracy op is wired to the training iterator, so the value
        # printed as "Testing Accuracy" is computed on one more training batch.
        print("DDL "+str(ddl.rank())+"] Testing Accuracy:", sess.run(accuracy))
예제 #9
0
import tensorflow as tf
if len(sys.argv) > 1 and sys.argv[1] == '--eager':
    tf.enable_eager_execution()
import ddl
import dataset

# Training configuration.
batch_size, num_classes = 128, 10
epochs = 12

# input image dimensions
img_rows = 28
img_cols = 28

# Every rank reads from its own data directory.
data_dir = "/tmp/mnist_convnet_model_data" + str(ddl.rank())

# Place the single channel axis where the Keras backend expects it.
input_shape = ((1, img_rows, img_cols)
               if K.image_data_format() == 'channels_first' else
               (img_rows, img_cols, 1))

# Training data: keep only this rank's shard, then cache, shuffle, batch and
# repeat it.
train_set, num_of_train_imgs = dataset.train(data_dir, input_shape)
train_set = train_set.shard(ddl.size(), ddl.rank()).cache()
train_set = train_set.shuffle(buffer_size=1000).batch(batch_size).repeat()

# Evaluation data is not sharded; eval_full aliases the complete set.
eval_set, num_of_test_imgs = dataset.test(data_dir, input_shape)
eval_full = eval_set
예제 #10
0
def main():
    """Train the MNIST model with DDL and checkpoint it from rank 0.

    The inference part of the graph is exported as a meta graph before the
    ``ddl`` module is imported. ``training_data_dir`` and ``chkptpath`` are
    read from the enclosing (module) scope.
    """
    ############################################################################
    # Import MNIST data
    ############################################################################
    mnist = input_data.read_data_sets(training_data_dir)

    # Parameters
    learning_rate = 0.001
    # Compared against step * batch_size below, i.e. counted in samples.
    training_iters = 2500
    batch_size = 100
    display_step = 1

    # Network Parameters
    # NOTE(review): n_classes and dropout are not used in this function.
    n_input = 784 # MNIST data input (img shape: 28*28)
    n_classes = 10 # MNIST total classes (0-9 digits)
    dropout = 0.75 # Dropout, probability to keep units

    # tf Graph input
    x = tf.placeholder(tf.float32, [None, n_input], name="x")
    # Construct model
    # NOTE(review): keep_prob is created but deepnn is called with the
    # constant 1.0 rather than this placeholder - confirm intended.
    keep_prob = tf.placeholder_with_default(1.0,shape=(), name="keepprob")
    pred = deepnn(x,1.0)
    pRes = tf.identity(pred,name="pRes")

    # Only the process whose OMPI_COMM_WORLD_RANK is "0" exports the graph;
    # ddl is not imported yet, so the rank is read from the environment.
    if os.getenv("OMPI_COMM_WORLD_RANK") == "0":
         print("writing checkpoint file", chkptpath+"_basegraph.meta")
         tf.train.export_meta_graph(chkptpath+"_basegraph.meta", as_text=True)

    #import the ddl library; this creates objects for distribution so 
    #it must be done after exporting meta graph
    import ddl
    y = tf.placeholder(tf.int64, [None], name="y")
    # Define loss and optimizer
    with tf.name_scope('loss'):
        cost = tf.reduce_mean(
            tf.losses.sparse_softmax_cross_entropy(labels=y, logits=pred))

    with tf.name_scope('adam_optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        objective = optimizer.minimize(cost)

    predictor = tf.argmax(pred, 1, name="predictor") 
    # Evaluate model
    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(predictor, y)
        correct_prediction = tf.cast(correct_prediction, tf.float32)
        accuracy = tf.reduce_mean(correct_prediction)


    saver = tf.train.Saver() 

    # Launch the graph
    with tf.Session(config=tf.ConfigProto()) as sess:
        sess.run(tf.global_variables_initializer())
        step = 1
        # Keep training until reach max iterations
        while step * batch_size < training_iters:

            ###################################################
            ### USE ddl.rank() and ddl.size() to load data  ###
            ###################################################
            # Fetch one batch per rank in a single call ...
            batch_x, batch_y = mnist.train.next_batch(batch_size*ddl.size())

            #select one of partitions
            # ... and keep only the slice that belongs to this rank.
            batch_x = np.split(batch_x,ddl.size())[ddl.rank()]
            batch_y = np.split(batch_y,ddl.size())[ddl.rank()]

            # Run optimization op (backprop)
            sess.run(objective, feed_dict={x: batch_x, y: batch_y})
            if step % display_step == 0:
                # Calculate batch loss and accuracy
                loss, acc = sess.run([cost, accuracy], feed_dict={x: batch_x,
                                                                  y: batch_y})
                print("DDL " + str(ddl.rank()) + "] Iter " + str(step * batch_size) +
                  ", Minibatch Loss= " + "{:.6f}".format(loss) +
                  ", Training Accuracy= " + "{:.5f}".format(acc))
            step += 1
            # Rank 0 (per the MPI environment variable) checkpoints every 10
            # steps. step starts at 1 and only grows, so `step!=0` always holds.
            if os.getenv("OMPI_COMM_WORLD_RANK") == "0" and step%10==0 and step!=0:
                saver.save(sess, chkptpath,global_step=step)
                print('[%d] save checkpoint' % step+" path: "+chkptpath)


        print("DDL "+str(ddl.rank())+"] Optimization Finished!")



        # Calculate accuracy for 256 mnist test images
        print("DDL "+str(ddl.rank())+"] Testing Accuracy:", \
            sess.run(accuracy, feed_dict={x: mnist.test.images[:256],
                                          y: mnist.test.labels[:256]}))
예제 #11
0
def main(_):
    """Train the MNIST model using explicit DDL initialization and reduction.

    The single positional argument is ignored. DDL is initialized from
    ``FLAGS.ddl_options`` and the gradients are passed through
    ``ddl.grads_reduce`` before being applied.
    """
    # Note: Not using DDL_OPTIONS; doing explicit DDL calls!
    # Explicit initialization call:
    ddl.init(FLAGS.ddl_options)

    # Parameters
    learning_rate = 0.001
    # Compared against step * batch_size below, i.e. counted in samples.
    training_iters = FLAGS.num_iterations
    batch_size = 100
    display_step = 1

    # Network Parameters
    # NOTE(review): n_input, n_classes and dropout are not used in this
    # function.
    n_input = 784  # MNIST data input (img shape: 28*28)
    n_classes = 10  # MNIST total classes (0-9 digits)
    dropout = 0.75  # Dropout, probability to keep units

    ############################################################################
    # Import MNIST data
    ############################################################################
    # The data directory is keyed by the local rank; the dataset itself is
    # sharded by the global rank.
    data_dir = FLAGS.data_dir + str(ddl.local_rank())
    (train_set, num_of_train_imgs) = dataset.train(data_dir, (28, 28, 1))
    train_set = train_set.shard(ddl.size(), ddl.rank())
    # Batching happens before cache/shuffle, so whole batches are shuffled.
    train_set = train_set.batch(batch_size).cache().shuffle(
        buffer_size=1000).repeat()

    X_train, Y_train = train_set.make_one_shot_iterator().get_next()

    # Construct model
    # keep_prob is returned by deepnn but never fed in this function.
    pred, keep_prob = deepnn(X_train)

    # Define loss and optimizer
    with tf.name_scope('loss'):
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y_train,
                                                       logits=pred))

    with tf.name_scope('adam_optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        grads_and_vars = optimizer.compute_gradients(cost)

        # obtain learnable variables and their gradients across the cluster nodes
        # and do reduce_scatter by making explicit DDL reduce call.
        # Note: all zipping is hidden
        grads_and_vars = ddl.grads_reduce(grads_and_vars, average=True)
        objective = optimizer.apply_gradients(grads_and_vars)

    # Evaluate model
    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(tf.argmax(pred, 1),
                                      tf.argmax(Y_train, 1))
        correct_prediction = tf.cast(correct_prediction, tf.float32)
        accuracy = tf.reduce_mean(correct_prediction)

    # The graph is written to a fresh temporary directory.
    graph_location = tempfile.mkdtemp()
    print('Saving graph to: %s' % graph_location)
    train_writer = tf.summary.FileWriter(graph_location)
    train_writer.add_graph(tf.get_default_graph())

    # Launch the graph
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        step = 1
        # Keep training until reach max iterations
        while step * batch_size < training_iters:
            # Run optimization op (backprop)
            sess.run(objective)
            if step % display_step == 0:
                # Calculate batch loss and accuracy
                # (this run pulls another batch from the iterator)
                loss, acc = sess.run([cost, accuracy])
                print("DDL " + str(ddl.rank()) + "] Iter " +
                      str(step * batch_size) + ", Minibatch Loss= " +
                      "{:.6f}".format(loss) + ", Training Accuracy= " +
                      "{:.5f}".format(acc))
            step += 1

        print("DDL " + str(ddl.rank()) + "] Optimization Finished!")

        # The accuracy op is wired to the training iterator, so the value
        # printed as "Testing Accuracy" is computed on one more training batch.
        print("DDL " + str(ddl.rank()) + "] Testing Accuracy:",
              sess.run(accuracy))
# Register the DDL Keras callbacks (as instances) on the existing list.
callbacks.append(ddl.DDLCallback())
callbacks.append(ddl.DDLGlobalVariablesCallback())

# Normalize into 0~1 range
# NOTE(review): train and test are each divided by their own maximum, so the
# two sets are scaled by different factors unless the maxima coincide; the
# in-place division presumably requires floating-point arrays - confirm.
x_train /= x_train.max()
x_test /= x_test.max()

y_train_binary = to_categorical(
    y_train
)  # For categorical crossentropy loss, we need to binarize multi-class labels
y_test_binary = to_categorical(
    y_test
)  # For categorical crossentropy loss, we need to binarize multi-class labels

# Split the training data into ddl.size() batches for distributed training.
# Each rank keeps only the chunk at index ddl.rank().
x_train_dist = np.array_split(x_train, ddl.size())[ddl.rank()]
y_train_dist = np.array_split(y_train, ddl.size())[ddl.rank()]
y_train_dist_binary = np.array_split(y_train_binary, ddl.size())[ddl.rank()]
'''
Training step one. Train for NN
'''
if model_type == 'triplet' or model_type == 'contrastive':
    # Only the first element of build_nn's return value is kept as the model.
    model = build_nn([568, 256, 100],
                     x_train_dist.shape[1],
                     l1_reg=l1_reg,
                     l2_reg=l2_reg,
                     activation_func='tanh')[0]

    # Set initial weights as DAE trained weights (skip dropout and batchnorm layers)
    # for layer,weight in zip(model.layers[1:8:3],pretrain_weights):
    #	layer.set_weights(weight)
예제 #13
0
def train(hparams, scope=None, target_session=""):
  """Train a translation model.

  Every DDL rank runs the training loop; checkpointing, sample decoding and
  the internal/external evaluations are performed by rank 0 only.

  Args:
    hparams: hyper-parameters. Read here, among others: `out_dir`,
      `num_train_steps`, `steps_per_stats`, `steps_per_external_eval`,
      `avg_ckpts`, `attention`, `encoder_type`, `attention_architecture`,
      `dev_prefix`, `src`, `tgt`, `metrics`. `epoch_step` is updated in place.
    scope: forwarded to the `model_helper.create_*_model` calls.
    target_session: passed as `target` to the three `tf.Session` objects.

  Raises:
    ValueError: if attention is enabled with an unknown attention
      architecture.
  """
  log_device_placement = hparams.log_device_placement
  out_dir = hparams.out_dir
  num_train_steps = hparams.num_train_steps
  steps_per_stats = hparams.steps_per_stats
  steps_per_external_eval = hparams.steps_per_external_eval
  # Internal evaluation runs every 10 stats intervals.
  steps_per_eval = 10 * steps_per_stats
  avg_ckpts = hparams.avg_ckpts

  # Default: external evaluation every 5 internal evaluations.
  if not steps_per_external_eval:
    steps_per_external_eval = 5 * steps_per_eval

  # Pick the model class from the attention settings.
  if not hparams.attention:
    model_creator = nmt_model.Model
  else:  # Attention
    if (hparams.encoder_type == "gnmt" or
        hparams.attention_architecture in ["gnmt", "gnmt_v2"]):
      model_creator = gnmt_model.GNMTModel
    elif hparams.attention_architecture == "standard":
      model_creator = attention_model.AttentionModel
    else:
      raise ValueError("Unknown attention architecture %s" %
                       hparams.attention_architecture)

  utils.print_out("Detected %d ranks, the current rank is %d " % (ddl.size(), ddl.rank()))

  # The training model is created with the DDL world size and rank as
  # num_workers / jobid.
  train_model = model_helper.create_train_model(model_creator, hparams, scope, num_workers=ddl.size(), jobid=ddl.rank())
  # NOTE(review): presumably turns off DDL's variable broadcast so it does not
  # apply to the eval/infer graphs built next - confirm against the DDL docs.
  ddl.disable_bcast()
  eval_model = model_helper.create_eval_model(model_creator, hparams, scope)
  infer_model = model_helper.create_infer_model(model_creator, hparams, scope)

  # Preload data for sample decoding.
  dev_src_file = "%s.%s" % (hparams.dev_prefix, hparams.src)
  dev_tgt_file = "%s.%s" % (hparams.dev_prefix, hparams.tgt)
  sample_src_data = inference.load_data(dev_src_file)
  sample_tgt_data = inference.load_data(dev_tgt_file)

  # Summary directory name is per rank.
  summary_name = "train_log_rank_%d" % ddl.rank()
  model_dir = hparams.out_dir

  # Log and output files
  # The log file name carries the current time and the rank.
  # NOTE(review): log_f is never closed in this function.
  log_file = os.path.join(out_dir, "log_%d_rank_%d" % (time.time(), ddl.rank()))
  log_f = tf.gfile.GFile(log_file, mode="a")
  utils.print_out("# log_file=%s" % log_file, log_f)

  # TensorFlow model
  # One session per graph (train / eval / infer), all sharing one config.
  config_proto = utils.get_config_proto(
      log_device_placement=log_device_placement,
      num_intra_threads=hparams.num_intra_threads,
      num_inter_threads=hparams.num_inter_threads)
  train_sess = tf.Session(
      target=target_session, config=config_proto, graph=train_model.graph)
  eval_sess = tf.Session(
      target=target_session, config=config_proto, graph=eval_model.graph)
  infer_sess = tf.Session(
      target=target_session, config=config_proto, graph=infer_model.graph)

  with train_model.graph.as_default():
    loaded_train_model, global_step = model_helper.create_or_load_model(
        train_model.model, model_dir, train_sess, "train")

  # Summary writer
  summary_writer = tf.summary.FileWriter(
    os.path.join(out_dir, summary_name), train_model.graph)

  #GJ18: do all evaluations on a single GPU!

  # First evaluation
  if ddl.rank() == 0:
    run_full_eval(
      model_dir, infer_model, infer_sess,
      eval_model, eval_sess, hparams,
      summary_writer, sample_src_data,
      sample_tgt_data, avg_ckpts)

  # Steps at which each periodic action last fired.
  last_stats_step = global_step
  last_eval_step = global_step
  last_external_eval_step = global_step

  # This is the training loop.
  stats, info, start_train_time = before_train(
      loaded_train_model, train_model, train_sess, global_step, hparams, log_f)
  while global_step < num_train_steps:
    ### Run a step ###
    start_time = time.time()
    try:
      step_result = loaded_train_model.train(train_sess)
      hparams.epoch_step += 1
    except tf.errors.OutOfRangeError:
      # Finished going through the training dataset.  Go to next epoch.
      hparams.epoch_step = 0
      if ddl.rank() == 0:
        utils.print_out(
          "# Finished an epoch, step %d. Perform external evaluation" %
          global_step)
        run_sample_decode(infer_model, infer_sess, model_dir, hparams,
                          summary_writer, sample_src_data, sample_tgt_data)
        run_external_eval(infer_model, infer_sess, model_dir, hparams,
                          summary_writer)

        if avg_ckpts:
          run_avg_external_eval(infer_model, infer_sess, model_dir, hparams,
                                summary_writer, global_step)

      # Every rank re-initializes its iterator (skip count 0) and retries.
      train_sess.run(
          train_model.iterator.initializer,
          feed_dict={train_model.skip_count_placeholder: 0})
      continue

    # Process step_result, accumulate stats, and write summary
    global_step, info["learning_rate"], step_summary = update_stats(
        stats, start_time, step_result)

    summary_writer.add_summary(step_summary, global_step)

    # Once in a while, we print statistics.
    if global_step - last_stats_step >= steps_per_stats:
      last_stats_step = global_step
      is_overflow = process_stats(
          stats, info, global_step, steps_per_stats, log_f)
      print_step_info("  ", global_step, info, _get_best_results(hparams),
                      log_f)
      # Leave the training loop as soon as process_stats reports an overflow.
      if is_overflow:
        break

      # Reset statistics
      stats = init_stats()

    if global_step - last_eval_step >= steps_per_eval:
      last_eval_step = global_step
      utils.print_out("# Save eval, global step %d" % global_step)

      
      # The train_ppl summary is written by every rank; the checkpoint and
      # the evaluations below are rank 0 only.
      utils.add_summary(summary_writer, global_step, "train_ppl",
                        info["train_ppl"])

      if ddl.rank() == 0:
        # Save checkpoint
        loaded_train_model.saver.save(
            train_sess,
            os.path.join(out_dir, "translate.ckpt"),
            global_step=global_step)

        # Evaluate on dev/test
        run_sample_decode(infer_model, infer_sess,
                          model_dir, hparams, summary_writer, sample_src_data,
                          sample_tgt_data)
        run_internal_eval(
          eval_model, eval_sess, model_dir, hparams, summary_writer)

    if global_step - last_external_eval_step >= steps_per_external_eval:
      last_external_eval_step = global_step

      # Save checkpoint
      if ddl.rank() == 0:
        loaded_train_model.saver.save(
          train_sess,
          os.path.join(out_dir, "translate.ckpt"),
          global_step=global_step)

        run_sample_decode(infer_model, infer_sess,
                          model_dir, hparams, summary_writer, sample_src_data,
                          sample_tgt_data)
        run_external_eval(
          infer_model, infer_sess, model_dir,
          hparams, summary_writer)

        if avg_ckpts:
          run_avg_external_eval(infer_model, infer_sess, model_dir, hparams,
                                summary_writer, global_step)

  # Done training
  # Rank 0 writes the final checkpoint and runs a last full evaluation.
  if ddl.rank() == 0:
    loaded_train_model.saver.save(
        train_sess,
        os.path.join(out_dir, "translate.ckpt"),
        global_step=global_step)

    (result_summary, _, final_eval_metrics) = (
      run_full_eval(
        model_dir, infer_model, infer_sess, eval_model, eval_sess, hparams,
        summary_writer, sample_src_data, sample_tgt_data, avg_ckpts))

    print_step_info("# Final, ", global_step, info, result_summary, log_f)
  utils.print_time("# Done training!", start_train_time)

  summary_writer.close()

  # Re-evaluate the best checkpoint saved for each metric (rank 0 only); each
  # evaluation gets its own summary writer inside the best-model directory.
  if ddl.rank() == 0:
    utils.print_out("# Start evaluating saved best models.")
    for metric in hparams.metrics:
      best_model_dir = getattr(hparams, "best_" + metric + "_dir")
      summary_writer = tf.summary.FileWriter(
        os.path.join(best_model_dir, summary_name), infer_model.graph)
      result_summary, best_global_step, _ = run_full_eval(
        best_model_dir, infer_model, infer_sess, eval_model, eval_sess, hparams,
        summary_writer, sample_src_data, sample_tgt_data)
      print_step_info("# Best %s, " % metric, best_global_step, info,
                      result_summary, log_f)
      summary_writer.close()

      # Same again for the best averaged checkpoint, when averaging is on.
      if avg_ckpts:
        best_model_dir = getattr(hparams, "avg_best_" + metric + "_dir")
        summary_writer = tf.summary.FileWriter(
          os.path.join(best_model_dir, summary_name), infer_model.graph)
        result_summary, best_global_step, _ = run_full_eval(
          best_model_dir, infer_model, infer_sess, eval_model, eval_sess,
          hparams, summary_writer, sample_src_data, sample_tgt_data)
        print_step_info("# Averaged Best %s, " % metric, best_global_step, info,
                        result_summary, log_f)
        summary_writer.close()
예제 #14
0
def main(_):
    """Train an MNIST classifier with DDL data sharding and export it.

    Builds the graph (input placeholders, the ``deepnn`` model, a sparse
    softmax cross-entropy loss, an Adam optimizer and an accuracy metric),
    dumps the graph definition to a fresh temporary directory, then runs
    the training loop. On every step each process draws
    ``batch_size * ddl.size()`` samples and keeps only the slice at index
    ``ddl.rank()``. After training, accuracy on the first 256 test images
    is printed by every rank, and rank 0 alone exports a SavedModel.

    Relies on names that are not defined in this function and must be
    provided by the enclosing module: ``training_data_dir`` (input data
    location) and ``model_path`` (SavedModel export directory).

    Args:
        _: Ignored. NOTE(review): presumably the argv list handed over by
            an app runner -- confirm against the call site.
    """
    ############################################################################
    # Import MNIST data
    ############################################################################
    mnist = input_data.read_data_sets(training_data_dir)

    # Parameters
    learning_rate = 0.001
    training_iters = 2000
    batch_size = 100
    display_step = 1

    # Network Parameters
    n_input = 784  # MNIST data input (img shape: 28*28)
    dropout = 0.75  # Dropout, probability to keep units

    # tf Graph input: flattened images and integer class labels (digits 0-9)
    x = tf.placeholder(tf.float32, [None, n_input])
    y = tf.placeholder(tf.int64, [None])

    # Construct model
    pred, keep_prob = deepnn(x)

    # Define loss and optimizer
    with tf.name_scope('loss'):
        cost = tf.reduce_mean(
            tf.losses.sparse_softmax_cross_entropy(labels=y, logits=pred))

    with tf.name_scope('adam_optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        objective = optimizer.minimize(cost)

    predictor = tf.argmax(pred, 1, name="predictor")
    # Evaluate model
    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(predictor, y)
        correct_prediction = tf.cast(correct_prediction, tf.float32)
        accuracy = tf.reduce_mean(correct_prediction)

    graph_location = tempfile.mkdtemp()
    print('Saving graph to: %s' % graph_location)
    train_writer = tf.summary.FileWriter(graph_location)
    try:
        train_writer.add_graph(tf.get_default_graph())
    finally:
        # The writer is only used for the one-off graph dump; close it so
        # the event file is flushed and its resources are released.
        train_writer.close()

    # Launch the graph
    with tf.Session(config=tf.ConfigProto()) as sess:
        sess.run(tf.global_variables_initializer())

        # Rank and world size are fixed for the lifetime of the process, so
        # query them once instead of on every iteration.
        rank = ddl.rank()
        size = ddl.size()
        log_prefix = "DDL " + str(rank) + "]"

        step = 1
        # Keep training until reach max iterations
        while step * batch_size < training_iters:

            ###################################################
            ### USE ddl.rank() and ddl.size() to load data  ###
            ###################################################
            batch_x, batch_y = mnist.train.next_batch(batch_size * size)

            # Select this rank's partition of the oversized batch.
            batch_x = np.split(batch_x, size)[rank]
            batch_y = np.split(batch_y, size)[rank]

            # Run optimization op (backprop)
            sess.run(objective,
                     feed_dict={
                         x: batch_x,
                         y: batch_y,
                         keep_prob: dropout
                     })
            if step % display_step == 0:
                # Calculate batch loss and accuracy with dropout disabled
                # (keep probability 1).
                loss, acc = sess.run([cost, accuracy],
                                     feed_dict={
                                         x: batch_x,
                                         y: batch_y,
                                         keep_prob: 1.
                                     })
                print(log_prefix + " Iter " +
                      str(step * batch_size) + ", Minibatch Loss= " +
                      "{:.6f}".format(loss) + ", Training Accuracy= " +
                      "{:.5f}".format(acc))
            step += 1

        print(log_prefix + " Optimization Finished!")

        # Describe the serving signature: image tensor in, predicted class
        # index out.
        classification_inputs = tf.saved_model.utils.build_tensor_info(x)
        classification_outputs_classes = tf.saved_model.utils.build_tensor_info(
            predictor)

        classification_signature = (
            tf.saved_model.signature_def_utils.build_signature_def(
                inputs={
                    tf.saved_model.signature_constants.CLASSIFY_INPUTS:
                    classification_inputs
                },
                outputs={
                    tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES:
                    classification_outputs_classes
                },
                method_name=tf.saved_model.signature_constants.
                CLASSIFY_METHOD_NAME))

        print("classification_signature content:")
        print(classification_signature)

        # Calculate accuracy for 256 mnist test images
        print(log_prefix + " Testing Accuracy:",
              sess.run(accuracy, feed_dict={x: mnist.test.images[:256],
                                            y: mnist.test.labels[:256],
                                            keep_prob: 1.}))
        if rank == 0:
            # Only rank 0 exports the model; `model_path` comes from the
            # enclosing module.
            builder = tf.saved_model.builder.SavedModelBuilder(model_path)
            legacy_init_op = tf.group(tf.tables_initializer(),
                                      name='legacy_init_op')
            builder.add_meta_graph_and_variables(
                sess, [tf.saved_model.tag_constants.SERVING],
                signature_def_map={
                    'predict_images': classification_signature,
                },
                legacy_init_op=legacy_init_op)

            save_path = str(builder.save())

            print("Model saved in file: %s" % save_path)