Example #1
def run_experiment(hparams):
  """Run the training and evaluate using the high level API"""

  train_input = lambda: model.input_fn(
      filename=os.path.join(hparams.data_dir, 'train.tfrecords'),
      batch_size=hparams.train_batch_size
  )

  eval_input = lambda: model.input_fn(
      filename=os.path.join(hparams.data_dir, 'test.tfrecords'),
      batch_size=hparams.eval_batch_size
  )

  train_spec = tf.estimator.TrainSpec(train_input,
                                      max_steps=hparams.train_steps)

  exporter = tf.estimator.FinalExporter('cnn', model.serving_input_fn)
  eval_spec = tf.estimator.EvalSpec(eval_input,
                                    steps=hparams.eval_steps,
                                    exporters=[exporter],
                                    name='cnn-eval')

  estimator = model.build_estimator(model_dir=hparams.job_dir)

  tf.estimator.train_and_evaluate(estimator,
                                  train_spec,
                                  eval_spec)
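
For context, a minimal sketch of what a TFRecord-reading model.input_fn could look like here, assuming a tf.data pipeline under TF 1.x. The feature spec below is hypothetical, since model.py is not shown:

import tensorflow as tf

def input_fn(filename, batch_size):
    # Hypothetical feature spec; the real one lives in model.py.
    feature_spec = {
        'image': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64),
    }

    def _parse(serialized):
        parsed = tf.parse_single_example(serialized, feature_spec)
        return {'image': parsed['image']}, parsed['label']

    # Read, parse, shuffle, repeat and batch the TFRecord file.
    dataset = tf.data.TFRecordDataset(filename)
    dataset = dataset.map(_parse).shuffle(1000).repeat().batch(batch_size)
    return dataset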
Example #2
def run_experiment(hparams):
    """Run the training and evaluate using the high level API"""

    train_input = lambda: model.input_fn(hparams.train_files,
                                         num_epochs=hparams.num_epochs,
                                         batch_size=hparams.train_batch_size)

    # Don't shuffle evaluation data
    eval_input = lambda: model.input_fn(
        hparams.eval_files, batch_size=hparams.eval_batch_size, shuffle=False)

    train_spec = tf.estimator.TrainSpec(train_input,
                                        max_steps=hparams.train_steps)

    exporter = tf.estimator.FinalExporter(
        'jimini', model.SERVING_FUNCTIONS[hparams.export_format])
    eval_spec = tf.estimator.EvalSpec(eval_input,
                                      steps=hparams.eval_steps,
                                      exporters=[exporter],
                                      name='jimini-eval')

    run_config = tf.estimator.RunConfig()
    run_config = run_config.replace(model_dir=hparams.job_dir)
    print('model dir {}'.format(run_config.model_dir))
    estimator = model.build_estimator(
        embedding_size=hparams.embedding_size,
        # Construct layer sizes with exponential decay
        hidden_units=[
            max(2, int(hparams.first_layer_size * hparams.scale_factor**i))
            for i in range(hparams.num_layers)
        ],
        config=run_config)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
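
The hidden_units comprehension shrinks each layer geometrically, flooring at 2 units. A quick worked check with hypothetical hyperparameter values (not taken from the source):

first_layer_size, scale_factor, num_layers = 100, 0.7, 4

hidden_units = [
    max(2, int(first_layer_size * scale_factor**i))
    for i in range(num_layers)
]
# int() truncates, so 100 * 0.7**2 (i.e. 48.999...) becomes 48 rather than 49.
print(hidden_units)  # [100, 70, 48, 34]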
Example #3
def train_and_evaluate(args):
    """Run the training and evaluate using the high level API."""

    train_input = lambda: model.input_fn(args.train_files,
                                         num_epochs=args.num_epochs,
                                         batch_size=args.train_batch_size)

    # Don't shuffle evaluation data
    eval_input = lambda: model.input_fn(
        args.eval_files, batch_size=args.eval_batch_size, shuffle=False)

    train_spec = tf.estimator.TrainSpec(train_input,
                                        max_steps=args.train_steps)

    exporter = tf.estimator.FinalExporter(
        'census', model.SERVING_FUNCTIONS[args.export_format])
    eval_spec = tf.estimator.EvalSpec(eval_input,
                                      steps=args.eval_steps,
                                      exporters=[exporter],
                                      name='census-eval')

    run_config = tf.estimator.RunConfig(
        session_config=_get_session_config_from_env_var())
    run_config = run_config.replace(model_dir=args.job_dir)
    print('Model dir %s' % run_config.model_dir)
    estimator = model.build_estimator(
        embedding_size=args.embedding_size,
        # Construct layer sizes with exponential decay
        hidden_units=[
            max(2, int(args.first_layer_size * args.scale_factor**i))
            for i in range(args.num_layers)
        ],
        config=run_config)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
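
_get_session_config_from_env_var is referenced but not shown. A plausible sketch, assuming it inspects the TF_CONFIG environment variable that AI Platform sets and returns a tf.ConfigProto with device filters for distributed training; the exact logic is an assumption:

import json
import os

import tensorflow as tf

def _get_session_config_from_env_var():
    tf_config = json.loads(os.environ.get('TF_CONFIG', '{}'))
    task = tf_config.get('task', {})
    if task.get('type') == 'master':
        # The master only needs to see the parameter servers and itself.
        return tf.ConfigProto(device_filters=['/job:ps', '/job:master'])
    elif task.get('type') == 'worker':
        return tf.ConfigProto(device_filters=[
            '/job:ps', '/job:worker/task:%d' % task.get('index', 0)])
    return None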
Example #4
def run_experiment(hparams):
    """Run the training and evaluate using the high level API"""

    train_input = lambda: model.input_fn(hparams.train_files,
                                         num_epochs=hparams.num_epochs,
                                         batch_size=hparams.train_batch_size)

    # Don't shuffle evaluation data
    eval_input = lambda: model.input_fn(
        hparams.eval_files, batch_size=hparams.eval_batch_size, shuffle=False)

    train_spec = tf.estimator.TrainSpec(train_input,
                                        max_steps=hparams.train_steps)

    exporter = tf.estimator.FinalExporter(
        'airline', model.SERVING_FUNCTIONS[hparams.export_format])
    eval_spec = tf.estimator.EvalSpec(eval_input,
                                      steps=hparams.eval_steps,
                                      exporters=[exporter],
                                      name='airline-eval')

    run_config = tf.estimator.RunConfig()
    run_config = run_config.replace(model_dir=hparams.job_dir)
    print('model dir {}'.format(run_config.model_dir))
    estimator = model.build_estimator(model=hparams.model, config=run_config)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
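
build_estimator here selects the architecture from hparams.model. A hedged sketch of what such a dispatcher might do, assuming canned estimators; FEATURE_COLUMNS and the model names are hypothetical, since model.py is not shown:

def build_estimator(model, config):
    # Hypothetical dispatch between two canned estimators.
    if model == 'linear':
        return tf.estimator.LinearClassifier(
            feature_columns=FEATURE_COLUMNS, config=config)
    return tf.estimator.DNNClassifier(
        feature_columns=FEATURE_COLUMNS, hidden_units=[64, 32], config=config)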
Example #5
def train_and_evaluate(hparams):
    """Run the training and evaluate using the high level API."""
    train_input = lambda: model.input_fn(hparams.train_files,
                                         num_epochs=hparams.num_epochs,
                                         batch_size=hparams.train_batch_size)

    # Don't shuffle evaluation data.
    eval_input = lambda: model.input_fn(
        hparams.eval_files, batch_size=hparams.eval_batch_size, shuffle=False)

    train_spec = tf.estimator.TrainSpec(train_input,
                                        max_steps=hparams.train_steps)

    exporter = tf.estimator.FinalExporter(
        'census', model.SERVING_FUNCTIONS[hparams.export_format])
    eval_spec = tf.estimator.EvalSpec(eval_input,
                                      steps=hparams.eval_steps,
                                      exporters=[exporter],
                                      name='census-eval')

    model_fn = model.generate_model_fn(
        embedding_size=hparams.embedding_size,
        # Construct layer sizes with exponential decay.
        hidden_units=[
            max(2, int(hparams.first_layer_size * hparams.scale_factor**i))
            for i in range(hparams.num_layers)
        ],
        learning_rate=hparams.learning_rate)

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=hparams.job_dir)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
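
generate_model_fn is a closure factory: it bakes the hyperparameters into a model_fn that tf.estimator.Estimator calls per mode. A minimal TF 1.x sketch under that assumption; the single numeric feature column and two-class head are hypothetical, and embedding_size is ignored for brevity:

def generate_model_fn(embedding_size, hidden_units, learning_rate):
    def model_fn(features, labels, mode):
        columns = [tf.feature_column.numeric_column('x')]  # hypothetical
        net = tf.feature_column.input_layer(features, columns)
        for units in hidden_units:
            net = tf.layers.dense(net, units, activation=tf.nn.relu)
        logits = tf.layers.dense(net, 2)
        loss = tf.losses.sparse_softmax_cross_entropy(labels, logits)
        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(
                loss, global_step=tf.train.get_global_step())
            return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
        return tf.estimator.EstimatorSpec(mode, loss=loss)
    return model_fn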
Example #6
def train_and_evaluate(args):
    """Trains and evaluates the Keras model.

  Uses the Keras model defined in model.py and trains on data loaded and
  preprocessed in util.py. Saves the trained model in TensorFlow SavedModel
  format to the path defined in part by the --job-dir argument.

  Args:
    args: dictionary of arguments - see get_args() for details
  """

    train_x, train_y, eval_x, eval_y = util.load_data(BUCKET_NAME)

    # dimensions
    num_train_examples, input_dim = train_x.shape
    num_eval_examples = eval_x.shape[0]

    # Create the Keras Model
    keras_model = model.create_keras_model(input_dim=input_dim,
                                           learning_rate=args.learning_rate)

    # Pass a numpy array by passing DataFrame.values
    training_dataset = model.input_fn(features=train_x.values,
                                      labels=train_y,
                                      shuffle=True,
                                      num_epochs=args.num_epochs,
                                      batch_size=args.batch_size)

    # Pass a numpy array by passing DataFrame.values
    validation_dataset = model.input_fn(features=eval_x.values,
                                        labels=eval_y,
                                        shuffle=False,
                                        num_epochs=args.num_epochs,
                                        batch_size=num_eval_examples)

    # Setup Learning Rate decay.
    lr_decay_cb = tf.keras.callbacks.LearningRateScheduler(
        lambda epoch: args.learning_rate + 0.02 * (0.5**(1 + epoch)),
        verbose=True)

    # Setup TensorBoard callback.
    tensorboard_cb = tf.keras.callbacks.TensorBoard(
        os.path.join(args.job_dir, 'keras_tensorboard'), histogram_freq=1)

    # Train model
    keras_model.fit(training_dataset,
                    steps_per_epoch=int(num_train_examples / args.batch_size),
                    epochs=args.num_epochs,
                    validation_data=validation_dataset,
                    validation_steps=1,
                    verbose=1,
                    callbacks=[lr_decay_cb, tensorboard_cb])

    export_path = os.path.join(args.job_dir, 'keras_export')
    tf.contrib.saved_model.save_keras_model(keras_model, export_path)
    print('Model exported to: {}'.format(export_path))
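
The LearningRateScheduler lambda adds a term that halves every epoch, so the rate decays from above toward the base learning rate. A quick check with a hypothetical base rate of 0.01:

learning_rate = 0.01
for epoch in range(4):
    print(epoch, learning_rate + 0.02 * (0.5 ** (1 + epoch)))
# 0 0.02
# 1 0.015
# 2 0.0125
# 3 0.01125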
Example #7
def train_and_maybe_evaluate(hparams):
  """Run the training and evaluate using the high level API.

  Args:
    hparams: Holds hyperparameters used to train the model as name/value pairs.

  Returns:
    The estimator that was used for training (and maybe eval)
  """
  schema = taxi.read_schema(hparams.schema_file)
  tf_transform_output = tft.TFTransformOutput(hparams.tf_transform_dir)

  train_input = lambda: model.input_fn(
      hparams.train_files,
      tf_transform_output,
      batch_size=TRAIN_BATCH_SIZE
  )

  eval_input = lambda: model.input_fn(
      hparams.eval_files,
      tf_transform_output,
      batch_size=EVAL_BATCH_SIZE
  )

  train_spec = tf.estimator.TrainSpec(
      train_input, max_steps=hparams.train_steps)

  serving_receiver_fn = lambda: model.example_serving_receiver_fn(
      tf_transform_output, schema)

  exporter = tf.estimator.FinalExporter('chicago-taxi', serving_receiver_fn)
  eval_spec = tf.estimator.EvalSpec(
      eval_input,
      steps=hparams.eval_steps,
      exporters=[exporter],
      name='chicago-taxi-eval')

  run_config = tf.estimator.RunConfig(
      save_checkpoints_steps=999, keep_checkpoint_max=1)

  serving_model_dir = os.path.join(hparams.output_dir, SERVING_MODEL_DIR)
  run_config = run_config.replace(model_dir=serving_model_dir)

  estimator = model.build_estimator(
      tf_transform_output,

      # Construct layer sizes with exponential decay
      hidden_units=[
          max(2, int(FIRST_DNN_LAYER_SIZE * DNN_DECAY_FACTOR**i))
          for i in range(NUM_DNN_LAYERS)
      ],
      config=run_config)

  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

  return estimator
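
example_serving_receiver_fn lives in model.py, which is not shown. A hedged sketch of what it plausibly does with the tf.Transform output: parse serialized tf.Example protos against the raw schema, then apply the saved transform graph. taxi.get_raw_feature_spec is a hypothetical helper name:

def example_serving_receiver_fn(tf_transform_output, schema):
    # Hypothetical helper: derive the raw (pre-transform) feature spec.
    raw_feature_spec = taxi.get_raw_feature_spec(schema)
    serialized_examples = tf.placeholder(dtype=tf.string, shape=[None])
    raw_features = tf.parse_example(serialized_examples, raw_feature_spec)
    # Apply the tf.Transform graph saved during preprocessing.
    transformed_features = tf_transform_output.transform_raw_features(
        raw_features)
    return tf.estimator.export.ServingInputReceiver(
        transformed_features, {'examples': serialized_examples})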
Example #8
def train_and_maybe_evaluate(hparams):
  """Run the training and evaluate using the high level API.

  Args:
    hparams: Holds hyperparameters used to train the model as name/value pairs.

  Returns:
    The estimator that was used for training (and maybe eval)
  """
  schema = taxi.read_schema(hparams.schema_file)

  train_input = lambda: model.input_fn(
      hparams.train_files,
      hparams.tf_transform_dir,
      batch_size=TRAIN_BATCH_SIZE
  )

  eval_input = lambda: model.input_fn(
      hparams.eval_files,
      hparams.tf_transform_dir,
      batch_size=EVAL_BATCH_SIZE
  )

  train_spec = tf.estimator.TrainSpec(
      train_input, max_steps=hparams.train_steps)

  serving_receiver_fn = lambda: model.example_serving_receiver_fn(
      hparams.tf_transform_dir, schema)

  exporter = tf.estimator.FinalExporter('chicago-taxi', serving_receiver_fn)
  eval_spec = tf.estimator.EvalSpec(
      eval_input,
      steps=hparams.eval_steps,
      exporters=[exporter],
      name='chicago-taxi-eval')

  run_config = tf.estimator.RunConfig(
      save_checkpoints_steps=999, keep_checkpoint_max=1)

  serving_model_dir = os.path.join(hparams.output_dir, SERVING_MODEL_DIR)
  run_config = run_config.replace(model_dir=serving_model_dir)

  estimator = model.build_estimator(
      hparams.tf_transform_dir,

      # Construct layer sizes with exponential decay
      hidden_units=[
          max(2, int(FIRST_DNN_LAYER_SIZE * DNN_DECAY_FACTOR**i))
          for i in range(NUM_DNN_LAYERS)
      ],
      config=run_config)

  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

  return estimator
Example #9
def run_experiment(hparams):
    """Run the training and evaluate using the high level API"""

    train_input = lambda: model.input_fn(hparams.train_files,
                                         num_epochs=hparams.num_epochs,
                                         batch_size=hparams.train_batch_size)

    # Don't shuffle evaluation data
    eval_input = lambda: model.input_fn(
        hparams.eval_files, batch_size=hparams.eval_batch_size, shuffle=False)

    # TODO: How should these train/eval specs be further adjusted for use
    # with tf.estimator.train_and_evaluate()?
    train_spec = tf.estimator.TrainSpec(train_input,
                                        max_steps=hparams.train_steps)

    exporter = tf.estimator.FinalExporter(
        'census', model.SERVING_FUNCTIONS[hparams.export_format])
    eval_spec = tf.estimator.EvalSpec(eval_input,
                                      steps=hparams.eval_steps,
                                      exporters=[exporter],
                                      name='census-eval')

    model_fn = model.generate_model_fn(
        embedding_size=hparams.embedding_size,
        # Construct layer sizes with exponential decay
        hidden_units=[
            max(2, int(hparams.first_layer_size * hparams.scale_factor**i))
            for i in range(hparams.num_layers)
        ],
        learning_rate=hparams.learning_rate)

    # TODO: unclear what config settings are needed for my model
    run_config = tpu_config.RunConfig(
        master=tpu_grpc_url,
        evaluation_master=tpu_grpc_url,
        model_dir=FLAGS.model_dir,
        cluster=tpu_cluster_resolver,
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_cores))

    estimator = tpu_estimator.TPUEstimator(
        use_tpu=True,
        model_fn=model_fn,
        model_dir=hparams.job_dir,
        config=run_config,
        # train_batch_size --> not being passed as I believe this should be handled in the train_spec...
    )
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
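
Regarding the TODOs above, a speculative sketch of the missing wiring under TF 1.x tf.contrib.tpu; the TPU name, paths and batch sizes are hypothetical. Note that the class is usually TPUEstimator, and it takes train_batch_size itself when use_tpu=True rather than receiving it through the TrainSpec:

import tensorflow as tf

tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver('my-tpu')
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    model_dir='gs://my-bucket/model',  # hypothetical
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=100, num_shards=8))

estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=True,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=1024,  # required by TPUEstimator when use_tpu=True
    eval_batch_size=1024)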
Example #10
def train_input():
    """Input function returning batches from the training dataset."""
    return model.input_fn(args.train_files,
                          num_epochs=args.num_epochs,
                          batch_size=args.train_batch_size)
Example #11
def eval_input():
    """Input function returning the entire validation data set for
    evaluation. Shuffling is not required.
    """
    return model.input_fn(args.eval_files,
                          batch_size=args.eval_batch_size,
                          shuffle=False)
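
Examples #10 and #11 are the named-function form of the lambdas used elsewhere; they plug into the same spec wiring. A sketch of the surrounding code, assuming an estimator built as in Example #3:

train_spec = tf.estimator.TrainSpec(train_input, max_steps=args.train_steps)
eval_spec = tf.estimator.EvalSpec(eval_input, steps=args.eval_steps)
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)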
Example #12
def run_experiment(hparams):
  """Run the training and evaluate using the high level API"""

  train_input = lambda: model.input_fn(
      hparams.train_files,
      num_epochs=hparams.num_epochs,
      batch_size=hparams.train_batch_size
  )

  # Don't shuffle evaluation data
  eval_input = lambda: model.input_fn(
      hparams.eval_files,
      batch_size=hparams.eval_batch_size,
      shuffle=False
  )

  train_spec = tf.estimator.TrainSpec(train_input,
                                      max_steps=hparams.train_steps)

  exporter = tf.estimator.FinalExporter(
      'census', model.SERVING_FUNCTIONS[hparams.export_format])
  eval_spec = tf.estimator.EvalSpec(eval_input,
                                    steps=hparams.eval_steps,
                                    exporters=[exporter],
                                    name='census-eval')

  run_config = tf.estimator.RunConfig()
  run_config = run_config.replace(model_dir=hparams.job_dir)
  print('model dir {}'.format(run_config.model_dir))
  estimator = model.build_estimator(
      embedding_size=hparams.embedding_size,
      # Construct layer sizes with exponential decay
      hidden_units=[
          max(2, int(hparams.first_layer_size *
                     hparams.scale_factor**i))
          for i in range(hparams.num_layers)
      ],
      config=run_config
  )

  tf.estimator.train_and_evaluate(estimator,
                                  train_spec,
                                  eval_spec)
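
SERVING_FUNCTIONS is a dict in model.py keyed by export format; Example #19's docstring names the keys 'JSON', 'CSV' and 'EXAMPLE'. A plausible sketch, with the individual serving input functions as hypothetical names:

SERVING_FUNCTIONS = {
    'JSON': json_serving_input_fn,
    'CSV': csv_serving_input_fn,
    'EXAMPLE': example_serving_input_fn,
}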
Example #13
def train_and_evaluate(args):
  """Run the training and evaluate using the high level API."""
  train_input = lambda: model.input_fn(
      args.train_files,
      num_epochs=args.num_epochs,
      batch_size=args.train_batch_size
  )

  # Don't shuffle evaluation data.
  eval_input = lambda: model.input_fn(
      args.eval_files,
      batch_size=args.eval_batch_size,
      shuffle=False
  )

  train_spec = tf.estimator.TrainSpec(
      train_input, max_steps=args.train_steps)

  exporter = tf.estimator.FinalExporter(
      'census', model.SERVING_FUNCTIONS[args.export_format])
  eval_spec = tf.estimator.EvalSpec(
      eval_input,
      steps=args.eval_steps,
      exporters=[exporter],
      name='census-eval')

  model_fn = model.generate_model_fn(
      embedding_size=args.embedding_size,
      # Construct layer sizes with exponential decay.
      hidden_units=[
          max(2, int(args.first_layer_size * args.scale_factor**i))
          for i in range(args.num_layers)
      ],
      learning_rate=args.learning_rate)

  estimator = tf.estimator.Estimator(
      model_fn=model_fn, model_dir=args.job_dir)
  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Example #14
def main(hparams):
    """Run the training and evaluate using the high level API."""

    trn_input = lambda: model.input_fn(hparams.train_files,
                                       batch_size=hparams.train_batch_size)
    train_spec = tf.estimator.TrainSpec(trn_input,
                                        max_steps=hparams.train_steps)

    eval_input = lambda: model.input_fn(
        hparams.eval_files,
        batch_size=hparams.eval_batch_size,
    )

    # Construct our JSON serving function for Online Predictions using GCP.
    exporter = tf.estimator.FinalExporter('model', model.build_serving_fn())
    eval_spec = tf.estimator.EvalSpec(
        eval_input,
        throttle_secs=hparams.eval_secs,
        steps=hparams.eval_steps,
        exporters=[exporter],
    )

    run_config = tf.estimator.RunConfig()
    run_config = run_config.replace(model_dir=hparams.job_dir)
    # Construct layer sizes with exponential decay
    hidden_units = [
        max(2, int(hparams.first_layer_size * hparams.scale_factor**i))
        for i in range(hparams.num_layers)
    ]
    estimator = model.build_estimator(
        config=run_config,
        hidden_units=hidden_units,
        learning_rate=hparams.learning_rate,
        dropout=hparams.dropout,
        embedding_vocab_file=hparams.cpc_embedding_vocab_file,
        embedding_dim=hparams.cpc_embedding_dim,
    )
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
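
build_serving_fn is not shown. For JSON online prediction on GCP, a serving input function typically exposes raw placeholders matching the model inputs; a hedged sketch with a hypothetical feature name:

def build_serving_fn():
    def serving_input_fn():
        # Hypothetical raw input; JSON request fields are fed into this tensor.
        inputs = {'cpc_code': tf.placeholder(tf.string, shape=[None])}
        return tf.estimator.export.ServingInputReceiver(inputs, inputs)
    return serving_input_fn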
Example #15
def run(target, cluster_spec, is_chief, hparams):
    """Runs the training and evaluation graph.

  Args:
    target (str): Tensorflow server target.
    cluster_spec: (cluster spec) Cluster specification.
    is_chief (bool): Boolean flag to specify a chief server.
    hparams (tf.hparams): Input Arguments.
  """

    # Calculate the number of hidden units
    hidden_units = [
        max(2, int(hparams.first_layer_size * hparams.scale_factor**i))
        for i in range(hparams.num_layers)
    ]

    # The chief is the `master` server. In between-graph replication, the
    # chief is the one node in the cluster with extra responsibility and is
    # by default worker task zero; here the master is assigned as the chief.
    #
    # See https://youtu.be/la_M6bCV91M?t=1203 for details on distributed
    # TensorFlow and the motivation for having a chief.
    if is_chief:
        tf.logging.info('Created DNN hidden units {}'.format(hidden_units))
        evaluation_graph = tf.Graph()
        with evaluation_graph.as_default():

            # Features and label tensors
            features, labels = model.input_fn(
                hparams.eval_files,
                num_epochs=None if hparams.eval_steps else 1,
                batch_size=hparams.eval_batch_size,
                shuffle=False)
            # Accuracy and AUROC metrics
            # model.model_fn returns the dict when EVAL mode
            metric_dict = model.model_fn(model.EVAL,
                                         features.copy(),
                                         labels,
                                         hidden_units=hidden_units,
                                         learning_rate=hparams.learning_rate)

        hooks = [
            EvaluationRunHook(
                hparams.job_dir,
                metric_dict,
                evaluation_graph,
                hparams.eval_frequency,
                eval_steps=hparams.eval_steps,
            )
        ]
    else:
        hooks = []

    # Create a new graph and specify that as default.
    with tf.Graph().as_default():
        # Placement of ops on devices using replica device setter
        # which automatically places the parameters on the `ps` server
        # and the `ops` on the workers.
        #
        # See:
        # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
        with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):

            # Features and label tensors as read using filename queue.
            features, labels = model.input_fn(
                hparams.train_files,
                num_epochs=hparams.num_epochs,
                batch_size=hparams.train_batch_size)

            # Returns the training graph and global step tensor.
            train_op, global_step_tensor = model.model_fn(
                model.TRAIN,
                features.copy(),
                labels,
                hidden_units=hidden_units,
                learning_rate=hparams.learning_rate)

        # Creates a MonitoredSession for training.
        # MonitoredSession is a Session-like object that handles
        # initialization, recovery and hooks
        # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=is_chief,
                checkpoint_dir=hparams.job_dir,
                hooks=hooks,
                save_checkpoint_secs=20,
                save_summaries_steps=50) as session:
            # Global step to keep track of global number of steps particularly in
            # distributed setting
            step = global_step_tensor.eval(session=session)

            # Run the training graph which returns the step number as tracked by
            # the global step tensor.
            # When train epochs is reached, session.should_stop() will be true.
            while (hparams.train_steps is None or
                   step < hparams.train_steps) and not session.should_stop():
                step, _ = session.run([global_step_tensor, train_op])

        # Find the filename of the latest saved checkpoint file
        latest_checkpoint = tf.train.latest_checkpoint(hparams.job_dir)

        # Only perform this if chief
        if is_chief:
            build_and_run_exports(
                latest_checkpoint, hparams.job_dir,
                model.SERVING_INPUT_FUNCTIONS[hparams.export_format],
                hidden_units)
Example #16
def run(target, cluster_spec, is_chief, args):
  """Runs the training and evaluation graph.

  Args:
    target (str): TensorFlow server target.
    cluster_spec: (cluster spec) Cluster specification.
    is_chief (bool): Boolean flag to specify a chief server.
    args (args): Input Arguments.
  """

  # Calculate the number of hidden units
  hidden_units = [
      max(2, int(args.first_layer_size * args.scale_factor**i))
      for i in range(args.num_layers)
  ]

  # The chief is the `master` server. In between-graph replication, the
  # chief is the one node in the cluster with extra responsibility and is
  # by default worker task zero; here the master is assigned as the chief.
  #
  # See https://youtu.be/la_M6bCV91M?t=1203 for details on distributed
  # TensorFlow and the motivation for having a chief.
  if is_chief:
    tf.logging.info('Created DNN hidden units {}'.format(hidden_units))
    evaluation_graph = tf.Graph()
    with evaluation_graph.as_default():

      # Features and label tensors
      features, labels = model.input_fn(
          args.eval_files,
          num_epochs=None if args.eval_steps else 1,
          batch_size=args.eval_batch_size,
          shuffle=False
      )
      # Accuracy and AUROC metrics
      # model.model_fn returns the dict when EVAL mode
      metric_dict = model.model_fn(
          model.EVAL,
          features.copy(),
          labels,
          hidden_units=hidden_units,
          learning_rate=args.learning_rate
      )

    hooks = [EvaluationRunHook(
        args.job_dir,
        metric_dict,
        evaluation_graph,
        args.eval_frequency,
        eval_steps=args.eval_steps,
    )]
  else:
    hooks = []

  # Create a new graph and specify that as default.
  with tf.Graph().as_default():
    # Placement of ops on devices using replica device setter
    # which automatically places the parameters on the `ps` server
    # and the `ops` on the workers.
    #
    # See:
    # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
    with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):

      # Features and label tensors as read using filename queue.
      features, labels = model.input_fn(
          args.train_files,
          num_epochs=args.num_epochs,
          batch_size=args.train_batch_size
      )

      # Returns the training graph and global step tensor.
      train_op, global_step_tensor = model.model_fn(
          model.TRAIN,
          features.copy(),
          labels,
          hidden_units=hidden_units,
          learning_rate=args.learning_rate
      )

    # Creates a MonitoredSession for training.
    # MonitoredSession is a Session-like object that handles
    # initialization, recovery and hooks
    # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
    with tf.train.MonitoredTrainingSession(master=target,
                                           is_chief=is_chief,
                                           checkpoint_dir=args.job_dir,
                                           hooks=hooks,
                                           save_checkpoint_secs=20,
                                           save_summaries_steps=50) as session:
      # Global step to keep track of global number of steps particularly in
      # distributed setting
      step = global_step_tensor.eval(session=session)

      # Run the training graph which returns the step number as tracked by
      # the global step tensor.
      # When train epochs is reached, session.should_stop() will be true.
      while (args.train_steps is None or
             step < args.train_steps) and not session.should_stop():
        step, _ = session.run([global_step_tensor, train_op])

    # Find the filename of the latest saved checkpoint file
    latest_checkpoint = tf.train.latest_checkpoint(args.job_dir)

    # Only perform this if chief
    if is_chief:
      build_and_run_exports(latest_checkpoint,
                            args.job_dir,
                            model.SERVING_INPUT_FUNCTIONS[args.export_format],
                            hidden_units)
Example #17
def eval_input():
    return model.input_fn(hparams.eval_files,
                          tf_transform_output,
                          batch_size=EVAL_BATCH_SIZE)
Example #18
def train_input():
    return model.input_fn(hparams.train_files,
                          tf_transform_output,
                          batch_size=TRAIN_BATCH_SIZE)
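
Both closures above capture a tf_transform_output from the enclosing scope; as in Example #7, that would be the tf.Transform wrapper loaded from the transform directory:

tf_transform_output = tft.TFTransformOutput(hparams.tf_transform_dir)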
Example #19
def run(target, cluster_spec, is_chief, train_steps, eval_steps, job_dir,
        train_files, eval_files, train_batch_size, eval_batch_size,
        learning_rate, eval_frequency, first_layer_size, num_layers,
        scale_factor, num_epochs, export_format):
    """Run the training and evaluation graph.
  Args:
    target (string): Tensorflow server target
    is_chief (bool): Boolean flag to specify a chief server
    train_steps (int): Maximum number of training steps
    eval_steps (int): Number of steps to run evaluation for at each checkpoint.
      if eval_steps is None, evaluation will run for 1 epoch.
    job_dir (string): Output dir for checkpoint and summary
    train_files (string): List of CSV files to read train data
    eval_files (string): List of CSV files to read eval data
    train_batch_size (int): Batch size for training
    eval_batch_size (int): Batch size for evaluation
    learning_rate (float): Learning rate for Gradient Descent
    eval_frequency (int): Run evaluation frequency every n training steps.
      Do not evaluate too frequently otherwise you will
      pay for performance and do not evaluate too in-frequently
      otherwise you will not know how soon to stop training.
      Use default values to start with
    first_layer_size (int): Size of the first DNN layer
    num_layers (int): Number of hidden layers in the DNN
    scale_factor (float): Decay rate for the size of hidden layers
    num_epochs (int): Maximum number of training data epochs on which to train
    export_format (str): One of 'JSON', 'CSV' or 'EXAMPLE'. The input format
      for the outputed saved_model binary.
  """

    # Calculate the number of hidden units
    hidden_units = [
        max(2, int(first_layer_size * scale_factor**i))
        for i in range(num_layers)
    ]

    # The chief is the `master` server. In between-graph replication, the
    # chief is the one node in the cluster with extra responsibility and is
    # by default worker task zero; here the master is assigned as the chief.
    #
    # See https://youtu.be/la_M6bCV91M?t=1203 for details on distributed
    # TensorFlow and the motivation for having a chief.
    if is_chief:
        tf.logging.info("Created DNN hidden units {}".format(hidden_units))
        evaluation_graph = tf.Graph()
        with evaluation_graph.as_default():

            # Features and label tensors
            features, labels = model.input_fn(
                eval_files,
                num_epochs=None if eval_steps else 1,
                batch_size=eval_batch_size,
                shuffle=False)
            # Accuracy and AUROC metrics
            # model.model_fn returns the dict when EVAL mode
            metric_dict = model.model_fn(model.EVAL,
                                         features.copy(),
                                         labels,
                                         hidden_units=hidden_units,
                                         learning_rate=learning_rate)

        hooks = [
            EvaluationRunHook(
                job_dir,
                metric_dict,
                evaluation_graph,
                eval_frequency,
                eval_steps=eval_steps,
            )
        ]
    else:
        hooks = []

    # Create a new graph and specify that as default
    with tf.Graph().as_default():
        # Placement of ops on devices using replica device setter
        # which automatically places the parameters on the `ps` server
        # and the `ops` on the workers
        #
        # See:
        # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
        with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):

            # Features and label tensors as read using filename queue
            features, labels = model.input_fn(train_files,
                                              num_epochs=num_epochs,
                                              batch_size=train_batch_size)

            # Returns the training graph and global step tensor
            train_op, global_step_tensor = model.model_fn(
                model.TRAIN,
                features.copy(),
                labels,
                hidden_units=hidden_units,
                learning_rate=learning_rate)

        # Creates a MonitoredSession for training
        # MonitoredSession is a Session-like object that handles
        # initialization, recovery and hooks
        # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=is_chief,
                checkpoint_dir=job_dir,
                hooks=hooks,
                save_checkpoint_secs=20,
                save_summaries_steps=50) as session:
            # Global step to keep track of global number of steps particularly in
            # distributed setting
            step = global_step_tensor.eval(session=session)

            # Run the training graph which returns the step number as tracked by
            # the global step tensor.
            # When train epochs is reached, session.should_stop() will be true.
            while (train_steps is None
                   or step < train_steps) and not session.should_stop():
                step, _ = session.run([global_step_tensor, train_op])

        # Find the filename of the latest saved checkpoint file
        latest_checkpoint = tf.train.latest_checkpoint(job_dir)

        # Only perform this if chief
        if is_chief:
            build_and_run_exports(latest_checkpoint, job_dir,
                                  model.SERVING_INPUT_FUNCTIONS[export_format],
                                  hidden_units)
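
build_and_run_exports is called by Examples #15, #16 and #19 but defined elsewhere. A speculative TF 1.x sketch of what it plausibly does: rebuild the prediction graph from the chosen serving input function, restore the latest checkpoint, and write a SavedModel. The (features, inputs_dict) return shape, model.PREDICT mode, and the prediction dict are assumptions:

import os

import tensorflow as tf

def build_and_run_exports(latest_checkpoint, job_dir, serving_input_fn,
                          hidden_units):
    prediction_graph = tf.Graph()
    exporter = tf.saved_model.builder.SavedModelBuilder(
        os.path.join(job_dir, 'export'))
    with prediction_graph.as_default():
        # Assumption: the serving fn returns (features, receiver_tensors).
        features, inputs_dict = serving_input_fn()
        prediction_dict = model.model_fn(model.PREDICT, features.copy(), None,
                                         hidden_units=hidden_units,
                                         learning_rate=None)
        saver = tf.train.Saver()
        signature = tf.saved_model.signature_def_utils.predict_signature_def(
            inputs=inputs_dict, outputs=prediction_dict)

    with tf.Session(graph=prediction_graph) as session:
        saver.restore(session, latest_checkpoint)
        exporter.add_meta_graph_and_variables(
            session,
            tags=[tf.saved_model.tag_constants.SERVING],
            signature_def_map={'serving_default': signature})
    exporter.save()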