def run_experiment(hparams):
    """Train and evaluate the model via ``tf.estimator.train_and_evaluate``.

    Args:
        hparams: Hyperparameter object; the attributes read here are
            ``data_dir``, ``train_batch_size``, ``eval_batch_size``,
            ``train_steps``, ``eval_steps`` and ``job_dir``.
    """

    def _train_input():
        # Training records are read from <data_dir>/train.tfrecords.
        return model.input_fn(
            filename=os.path.join(hparams.data_dir, 'train.tfrecords'),
            batch_size=hparams.train_batch_size)

    def _eval_input():
        # Evaluation records are read from <data_dir>/test.tfrecords.
        return model.input_fn(
            filename=os.path.join(hparams.data_dir, 'test.tfrecords'),
            batch_size=hparams.eval_batch_size)

    training_spec = tf.estimator.TrainSpec(
        _train_input, max_steps=hparams.train_steps)

    final_exporter = tf.estimator.FinalExporter(
        'cnn', model.serving_input_fn)
    evaluation_spec = tf.estimator.EvalSpec(
        _eval_input,
        steps=hparams.eval_steps,
        exporters=[final_exporter],
        name='cnn-eval')

    cnn_estimator = model.build_estimator(model_dir=hparams.job_dir)
    tf.estimator.train_and_evaluate(
        cnn_estimator, training_spec, evaluation_spec)
def run_experiment(hparams):
    """Train and evaluate the estimator via ``tf.estimator.train_and_evaluate``.

    Args:
        hparams: Hyperparameter object; the attributes read here are
            ``train_files``, ``eval_files``, ``num_epochs``,
            ``train_batch_size``, ``eval_batch_size``, ``train_steps``,
            ``eval_steps``, ``export_format``, ``job_dir``,
            ``embedding_size``, ``first_layer_size``, ``scale_factor`` and
            ``num_layers``.
    """

    def _train_input():
        return model.input_fn(
            hparams.train_files,
            num_epochs=hparams.num_epochs,
            batch_size=hparams.train_batch_size)

    def _eval_input():
        # Evaluation data is requested without shuffling.
        return model.input_fn(
            hparams.eval_files,
            batch_size=hparams.eval_batch_size,
            shuffle=False)

    training_spec = tf.estimator.TrainSpec(
        _train_input, max_steps=hparams.train_steps)

    serving_fn = model.SERVING_FUNCTIONS[hparams.export_format]
    final_exporter = tf.estimator.FinalExporter('jimini', serving_fn)
    evaluation_spec = tf.estimator.EvalSpec(
        _eval_input,
        steps=hparams.eval_steps,
        exporters=[final_exporter],
        name='jimini-eval')

    run_config = tf.estimator.RunConfig().replace(model_dir=hparams.job_dir)
    print('model dir {}'.format(run_config.model_dir))

    # Layer i has first_layer_size * scale_factor**i units, never fewer
    # than 2.
    layer_sizes = [
        max(2, int(hparams.first_layer_size * hparams.scale_factor**depth))
        for depth in range(hparams.num_layers)
    ]
    estimator = model.build_estimator(
        embedding_size=hparams.embedding_size,
        hidden_units=layer_sizes,
        config=run_config)

    tf.estimator.train_and_evaluate(
        estimator, training_spec, evaluation_spec)
def train_and_evaluate(args):
    """Train and evaluate the estimator via ``tf.estimator.train_and_evaluate``.

    Args:
        args: Argument object; the attributes read here are ``train_files``,
            ``eval_files``, ``num_epochs``, ``train_batch_size``,
            ``eval_batch_size``, ``train_steps``, ``eval_steps``,
            ``export_format``, ``job_dir``, ``embedding_size``,
            ``first_layer_size``, ``scale_factor`` and ``num_layers``.
    """

    def _train_input():
        return model.input_fn(
            args.train_files,
            num_epochs=args.num_epochs,
            batch_size=args.train_batch_size)

    def _eval_input():
        # Evaluation data is requested without shuffling.
        return model.input_fn(
            args.eval_files,
            batch_size=args.eval_batch_size,
            shuffle=False)

    training_spec = tf.estimator.TrainSpec(
        _train_input, max_steps=args.train_steps)

    serving_fn = model.SERVING_FUNCTIONS[args.export_format]
    final_exporter = tf.estimator.FinalExporter('census', serving_fn)
    evaluation_spec = tf.estimator.EvalSpec(
        _eval_input,
        steps=args.eval_steps,
        exporters=[final_exporter],
        name='census-eval')

    # The session config comes from a helper defined elsewhere in the file.
    session_config = _get_session_config_from_env_var()
    run_config = tf.estimator.RunConfig(
        session_config=session_config).replace(model_dir=args.job_dir)
    print('Model dir %s' % run_config.model_dir)

    # Layer i has first_layer_size * scale_factor**i units, never fewer
    # than 2.
    layer_sizes = [
        max(2, int(args.first_layer_size * args.scale_factor**depth))
        for depth in range(args.num_layers)
    ]
    estimator = model.build_estimator(
        embedding_size=args.embedding_size,
        hidden_units=layer_sizes,
        config=run_config)

    tf.estimator.train_and_evaluate(
        estimator, training_spec, evaluation_spec)
def run_experiment(hparams):
    """Train and evaluate the estimator via ``tf.estimator.train_and_evaluate``.

    Args:
        hparams: Hyperparameter object; the attributes read here are
            ``train_files``, ``eval_files``, ``num_epochs``,
            ``train_batch_size``, ``eval_batch_size``, ``train_steps``,
            ``eval_steps``, ``export_format``, ``job_dir`` and ``model``.
    """

    def _train_input():
        return model.input_fn(
            hparams.train_files,
            num_epochs=hparams.num_epochs,
            batch_size=hparams.train_batch_size)

    def _eval_input():
        # Evaluation data is requested without shuffling.
        return model.input_fn(
            hparams.eval_files,
            batch_size=hparams.eval_batch_size,
            shuffle=False)

    training_spec = tf.estimator.TrainSpec(
        _train_input, max_steps=hparams.train_steps)

    serving_fn = model.SERVING_FUNCTIONS[hparams.export_format]
    final_exporter = tf.estimator.FinalExporter('airline', serving_fn)
    evaluation_spec = tf.estimator.EvalSpec(
        _eval_input,
        steps=hparams.eval_steps,
        exporters=[final_exporter],
        name='airline-eval')

    run_config = tf.estimator.RunConfig().replace(model_dir=hparams.job_dir)
    print('model dir {}'.format(run_config.model_dir))

    estimator = model.build_estimator(
        model=hparams.model, config=run_config)
    tf.estimator.train_and_evaluate(
        estimator, training_spec, evaluation_spec)
def train_and_evaluate(hparams):
    """Train and evaluate a custom estimator via the high-level API.

    The model function comes from ``model.generate_model_fn`` and is wrapped
    in a ``tf.estimator.Estimator`` whose model directory is
    ``hparams.job_dir``.

    Args:
        hparams: Hyperparameter object; the attributes read here are
            ``train_files``, ``eval_files``, ``num_epochs``,
            ``train_batch_size``, ``eval_batch_size``, ``train_steps``,
            ``eval_steps``, ``export_format``, ``job_dir``,
            ``embedding_size``, ``first_layer_size``, ``scale_factor``,
            ``num_layers`` and ``learning_rate``.
    """

    def _train_input():
        return model.input_fn(
            hparams.train_files,
            num_epochs=hparams.num_epochs,
            batch_size=hparams.train_batch_size)

    def _eval_input():
        # Evaluation data is requested without shuffling.
        return model.input_fn(
            hparams.eval_files,
            batch_size=hparams.eval_batch_size,
            shuffle=False)

    training_spec = tf.estimator.TrainSpec(
        _train_input, max_steps=hparams.train_steps)

    serving_fn = model.SERVING_FUNCTIONS[hparams.export_format]
    final_exporter = tf.estimator.FinalExporter('census', serving_fn)
    evaluation_spec = tf.estimator.EvalSpec(
        _eval_input,
        steps=hparams.eval_steps,
        exporters=[final_exporter],
        name='census-eval')

    # Layer i has first_layer_size * scale_factor**i units, never fewer
    # than 2.
    layer_sizes = [
        max(2, int(hparams.first_layer_size * hparams.scale_factor**depth))
        for depth in range(hparams.num_layers)
    ]
    model_fn = model.generate_model_fn(
        embedding_size=hparams.embedding_size,
        hidden_units=layer_sizes,
        learning_rate=hparams.learning_rate)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn, model_dir=hparams.job_dir)
    tf.estimator.train_and_evaluate(
        estimator, training_spec, evaluation_spec)
def train_and_evaluate(args):
    """Fit the Keras model, then export it under the job directory.

    Data comes from ``util.load_data(BUCKET_NAME)`` and the model from
    ``model.create_keras_model``. After fitting, the model is written with
    ``tf.contrib.saved_model.save_keras_model`` to
    ``<args.job_dir>/keras_export``; TensorBoard logs go to
    ``<args.job_dir>/keras_tensorboard``.

    Args:
        args: Argument object; the attributes read here are
            ``learning_rate``, ``num_epochs``, ``batch_size`` and
            ``job_dir``.
    """
    train_x, train_y, eval_x, eval_y = util.load_data(BUCKET_NAME)

    # Row count drives steps per epoch; column count sizes the model input.
    num_train_examples, input_dim = train_x.shape
    num_eval_examples = eval_x.shape[0]

    keras_model = model.create_keras_model(
        input_dim=input_dim, learning_rate=args.learning_rate)

    # ``.values`` is passed for the features rather than the frame itself.
    training_dataset = model.input_fn(
        features=train_x.values,
        labels=train_y,
        shuffle=True,
        num_epochs=args.num_epochs,
        batch_size=args.batch_size)

    # The validation batch size equals the number of evaluation rows, which
    # is why ``validation_steps`` below is 1.
    validation_dataset = model.input_fn(
        features=eval_x.values,
        labels=eval_y,
        shuffle=False,
        num_epochs=args.num_epochs,
        batch_size=num_eval_examples)

    def _scheduled_learning_rate(epoch):
        # Base rate plus a term that halves with every epoch.
        return args.learning_rate + 0.02 * (0.5**(1 + epoch))

    lr_decay_cb = tf.keras.callbacks.LearningRateScheduler(
        _scheduled_learning_rate, verbose=True)

    tensorboard_dir = os.path.join(args.job_dir, 'keras_tensorboard')
    tensorboard_cb = tf.keras.callbacks.TensorBoard(
        tensorboard_dir, histogram_freq=1)

    steps_per_epoch = int(num_train_examples / args.batch_size)
    keras_model.fit(
        training_dataset,
        steps_per_epoch=steps_per_epoch,
        epochs=args.num_epochs,
        validation_data=validation_dataset,
        validation_steps=1,
        verbose=1,
        callbacks=[lr_decay_cb, tensorboard_cb])

    export_path = os.path.join(args.job_dir, 'keras_export')
    tf.contrib.saved_model.save_keras_model(keras_model, export_path)
    print('Model exported to: {}'.format(export_path))
def train_and_maybe_evaluate(hparams):
    """Train and evaluate the taxi estimator via the high-level API.

    Args:
        hparams: Hyperparameter object; the attributes read here are
            ``schema_file``, ``tf_transform_dir``, ``train_files``,
            ``eval_files``, ``train_steps``, ``eval_steps`` and
            ``output_dir``.

    Returns:
        The estimator passed to ``tf.estimator.train_and_evaluate``.
    """
    schema = taxi.read_schema(hparams.schema_file)
    tf_transform_output = tft.TFTransformOutput(hparams.tf_transform_dir)

    def _train_input():
        return model.input_fn(
            hparams.train_files,
            tf_transform_output,
            batch_size=TRAIN_BATCH_SIZE)

    def _eval_input():
        return model.input_fn(
            hparams.eval_files,
            tf_transform_output,
            batch_size=EVAL_BATCH_SIZE)

    def _serving_receiver():
        return model.example_serving_receiver_fn(tf_transform_output, schema)

    training_spec = tf.estimator.TrainSpec(
        _train_input, max_steps=hparams.train_steps)

    final_exporter = tf.estimator.FinalExporter(
        'chicago-taxi', _serving_receiver)
    evaluation_spec = tf.estimator.EvalSpec(
        _eval_input,
        steps=hparams.eval_steps,
        exporters=[final_exporter],
        name='chicago-taxi-eval')

    # Checkpoints and the exported model share <output_dir>/SERVING_MODEL_DIR.
    serving_model_dir = os.path.join(hparams.output_dir, SERVING_MODEL_DIR)
    run_config = tf.estimator.RunConfig(
        save_checkpoints_steps=999,
        keep_checkpoint_max=1).replace(model_dir=serving_model_dir)

    # Layer i has FIRST_DNN_LAYER_SIZE * DNN_DECAY_FACTOR**i units, never
    # fewer than 2.
    layer_sizes = [
        max(2, int(FIRST_DNN_LAYER_SIZE * DNN_DECAY_FACTOR**depth))
        for depth in range(NUM_DNN_LAYERS)
    ]
    estimator = model.build_estimator(
        tf_transform_output,
        hidden_units=layer_sizes,
        config=run_config)

    tf.estimator.train_and_evaluate(
        estimator, training_spec, evaluation_spec)
    return estimator
def train_and_maybe_evaluate(hparams):
    """Train and evaluate the taxi estimator via the high-level API.

    In this variant the transform directory path itself
    (``hparams.tf_transform_dir``) is handed to the ``model`` helpers.

    Args:
        hparams: Hyperparameter object; the attributes read here are
            ``schema_file``, ``tf_transform_dir``, ``train_files``,
            ``eval_files``, ``train_steps``, ``eval_steps`` and
            ``output_dir``.

    Returns:
        The estimator passed to ``tf.estimator.train_and_evaluate``.
    """
    schema = taxi.read_schema(hparams.schema_file)

    def _train_input():
        return model.input_fn(
            hparams.train_files,
            hparams.tf_transform_dir,
            batch_size=TRAIN_BATCH_SIZE)

    def _eval_input():
        return model.input_fn(
            hparams.eval_files,
            hparams.tf_transform_dir,
            batch_size=EVAL_BATCH_SIZE)

    def _serving_receiver():
        return model.example_serving_receiver_fn(
            hparams.tf_transform_dir, schema)

    training_spec = tf.estimator.TrainSpec(
        _train_input, max_steps=hparams.train_steps)

    final_exporter = tf.estimator.FinalExporter(
        'chicago-taxi', _serving_receiver)
    evaluation_spec = tf.estimator.EvalSpec(
        _eval_input,
        steps=hparams.eval_steps,
        exporters=[final_exporter],
        name='chicago-taxi-eval')

    # Checkpoints and the exported model share <output_dir>/SERVING_MODEL_DIR.
    serving_model_dir = os.path.join(hparams.output_dir, SERVING_MODEL_DIR)
    run_config = tf.estimator.RunConfig(
        save_checkpoints_steps=999,
        keep_checkpoint_max=1).replace(model_dir=serving_model_dir)

    # Layer i has FIRST_DNN_LAYER_SIZE * DNN_DECAY_FACTOR**i units, never
    # fewer than 2.
    layer_sizes = [
        max(2, int(FIRST_DNN_LAYER_SIZE * DNN_DECAY_FACTOR**depth))
        for depth in range(NUM_DNN_LAYERS)
    ]
    estimator = model.build_estimator(
        hparams.tf_transform_dir,
        hidden_units=layer_sizes,
        config=run_config)

    tf.estimator.train_and_evaluate(
        estimator, training_spec, evaluation_spec)
    return estimator
def run_experiment(hparams):
    """Train and evaluate on TPU using the high-level estimator API.

    Builds the train/eval specs and a model function from ``hparams``, wraps
    the model function in a TPU estimator, and hands everything to
    ``tf.estimator.train_and_evaluate``.

    Args:
        hparams: Hyperparameter object; the attributes read here are
            ``train_files``, ``eval_files``, ``num_epochs``,
            ``train_batch_size``, ``eval_batch_size``, ``train_steps``,
            ``eval_steps``, ``export_format``, ``job_dir``,
            ``embedding_size``, ``first_layer_size``, ``scale_factor``,
            ``num_layers`` and ``learning_rate``.
    """
    train_input = lambda: model.input_fn(
        hparams.train_files,
        num_epochs=hparams.num_epochs,
        batch_size=hparams.train_batch_size)

    # Don't shuffle evaluation data
    eval_input = lambda: model.input_fn(
        hparams.eval_files,
        batch_size=hparams.eval_batch_size,
        shuffle=False)

    # TODO: How should these train/eval spec's be further adjusted for using
    # tf.estimator.train_and_evaluate()?
    train_spec = tf.estimator.TrainSpec(
        train_input, max_steps=hparams.train_steps)

    exporter = tf.estimator.FinalExporter(
        'census', model.SERVING_FUNCTIONS[hparams.export_format])

    eval_spec = tf.estimator.EvalSpec(
        eval_input,
        steps=hparams.eval_steps,
        exporters=[exporter],
        name='census-eval')

    model_fn = model.generate_model_fn(
        embedding_size=hparams.embedding_size,
        # Construct layers sizes with exponential decay
        hidden_units=[
            max(2, int(hparams.first_layer_size * hparams.scale_factor**i))
            for i in range(hparams.num_layers)
        ],
        learning_rate=hparams.learning_rate)

    # TODO: unclear what config settings are needed for my model
    # The config must be bound to ``run_config`` because that is the name
    # handed to the estimator below; it was previously bound to ``config``,
    # leaving ``run_config`` undefined.
    # NOTE(review): ``tpu_grpc_url``, ``tpu_cluster_resolver`` and ``FLAGS``
    # are not defined in this function; presumably module-level -- confirm.
    # NOTE(review): ``master`` and ``cluster`` are both supplied here, and
    # ``model_dir`` is given both in this config (FLAGS.model_dir) and to the
    # estimator (hparams.job_dir); verify the TPU API accepts this
    # combination.
    run_config = tpu_config.RunConfig(
        master=tpu_grpc_url,
        evaluation_master=tpu_grpc_url,
        model_dir=FLAGS.model_dir,
        cluster=tpu_cluster_resolver,
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_cores))

    # NOTE(review): confirm that ``tpu_estimator`` really exposes a class
    # named ``Estimator`` (the TPU estimator class may be ``TPUEstimator``).
    estimator = tpu_estimator.Estimator(
        use_tpu=True,
        model_fn=model_fn,
        model_dir=hparams.job_dir,
        config=run_config,
        # train_batch_size --> not being passed as I believe this should be
        # handled in the train_spec...
    )

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
def train_input():
    """Return the training input produced by ``model.input_fn``.

    Reads ``train_files``, ``num_epochs`` and ``train_batch_size`` from the
    free variable ``args``.
    """
    training_batches = model.input_fn(
        args.train_files,
        num_epochs=args.num_epochs,
        batch_size=args.train_batch_size)
    return training_batches
def eval_input():
    """Return the evaluation input produced by ``model.input_fn``.

    Reads ``eval_files`` and ``eval_batch_size`` from the free variable
    ``args``; shuffling is switched off for evaluation.
    """
    evaluation_batches = model.input_fn(
        args.eval_files,
        batch_size=args.eval_batch_size,
        shuffle=False)
    return evaluation_batches
def run_experiment(hparams):
    """Train and evaluate the estimator via ``tf.estimator.train_and_evaluate``.

    Args:
        hparams: Hyperparameter object; the attributes read here are
            ``train_files``, ``eval_files``, ``num_epochs``,
            ``train_batch_size``, ``eval_batch_size``, ``train_steps``,
            ``eval_steps``, ``export_format``, ``job_dir``,
            ``embedding_size``, ``first_layer_size``, ``scale_factor`` and
            ``num_layers``.
    """

    def _train_input():
        return model.input_fn(
            hparams.train_files,
            num_epochs=hparams.num_epochs,
            batch_size=hparams.train_batch_size)

    def _eval_input():
        # Evaluation data is requested without shuffling.
        return model.input_fn(
            hparams.eval_files,
            batch_size=hparams.eval_batch_size,
            shuffle=False)

    training_spec = tf.estimator.TrainSpec(
        _train_input, max_steps=hparams.train_steps)

    serving_fn = model.SERVING_FUNCTIONS[hparams.export_format]
    final_exporter = tf.estimator.FinalExporter('census', serving_fn)
    evaluation_spec = tf.estimator.EvalSpec(
        _eval_input,
        steps=hparams.eval_steps,
        exporters=[final_exporter],
        name='census-eval')

    run_config = tf.estimator.RunConfig().replace(model_dir=hparams.job_dir)
    print('model dir {}'.format(run_config.model_dir))

    # Layer i has first_layer_size * scale_factor**i units, never fewer
    # than 2.
    layer_sizes = [
        max(2, int(hparams.first_layer_size * hparams.scale_factor**depth))
        for depth in range(hparams.num_layers)
    ]
    estimator = model.build_estimator(
        embedding_size=hparams.embedding_size,
        hidden_units=layer_sizes,
        config=run_config)

    tf.estimator.train_and_evaluate(
        estimator, training_spec, evaluation_spec)
def train_and_evaluate(args):
    """Train and evaluate a custom estimator via the high-level API.

    The model function comes from ``model.generate_model_fn`` and is wrapped
    in a ``tf.estimator.Estimator`` whose model directory is
    ``args.job_dir``.

    Args:
        args: Argument object; the attributes read here are ``train_files``,
            ``eval_files``, ``num_epochs``, ``train_batch_size``,
            ``eval_batch_size``, ``train_steps``, ``eval_steps``,
            ``export_format``, ``job_dir``, ``embedding_size``,
            ``first_layer_size``, ``scale_factor``, ``num_layers`` and
            ``learning_rate``.
    """

    def _train_input():
        return model.input_fn(
            args.train_files,
            num_epochs=args.num_epochs,
            batch_size=args.train_batch_size)

    def _eval_input():
        # Evaluation data is requested without shuffling.
        return model.input_fn(
            args.eval_files,
            batch_size=args.eval_batch_size,
            shuffle=False)

    training_spec = tf.estimator.TrainSpec(
        _train_input, max_steps=args.train_steps)

    serving_fn = model.SERVING_FUNCTIONS[args.export_format]
    final_exporter = tf.estimator.FinalExporter('census', serving_fn)
    evaluation_spec = tf.estimator.EvalSpec(
        _eval_input,
        steps=args.eval_steps,
        exporters=[final_exporter],
        name='census-eval')

    # Layer i has first_layer_size * scale_factor**i units, never fewer
    # than 2.
    layer_sizes = [
        max(2, int(args.first_layer_size * args.scale_factor**depth))
        for depth in range(args.num_layers)
    ]
    model_fn = model.generate_model_fn(
        embedding_size=args.embedding_size,
        hidden_units=layer_sizes,
        learning_rate=args.learning_rate)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn, model_dir=args.job_dir)
    tf.estimator.train_and_evaluate(
        estimator, training_spec, evaluation_spec)
def main(hparams):
    """Train and evaluate the estimator via ``tf.estimator.train_and_evaluate``.

    Args:
        hparams: Hyperparameter object; the attributes read here are
            ``train_files``, ``eval_files``, ``train_batch_size``,
            ``eval_batch_size``, ``train_steps``, ``eval_steps``,
            ``eval_secs``, ``job_dir``, ``first_layer_size``,
            ``scale_factor``, ``num_layers``, ``learning_rate``,
            ``dropout``, ``cpc_embedding_vocab_file`` and
            ``cpc_embedding_dim``.
    """

    def _train_input():
        return model.input_fn(
            hparams.train_files,
            batch_size=hparams.train_batch_size)

    def _eval_input():
        return model.input_fn(
            hparams.eval_files,
            batch_size=hparams.eval_batch_size)

    training_spec = tf.estimator.TrainSpec(
        _train_input, max_steps=hparams.train_steps)

    # The final export uses the serving function built by the model module.
    final_exporter = tf.estimator.FinalExporter(
        'model', model.build_serving_fn())
    evaluation_spec = tf.estimator.EvalSpec(
        _eval_input,
        throttle_secs=hparams.eval_secs,
        steps=hparams.eval_steps,
        exporters=[final_exporter])

    run_config = tf.estimator.RunConfig().replace(model_dir=hparams.job_dir)

    # Layer i has first_layer_size * scale_factor**i units, never fewer
    # than 2.
    layer_sizes = [
        max(2, int(hparams.first_layer_size * hparams.scale_factor**depth))
        for depth in range(hparams.num_layers)
    ]

    estimator = model.build_estimator(
        config=run_config,
        hidden_units=layer_sizes,
        learning_rate=hparams.learning_rate,
        dropout=hparams.dropout,
        embedding_vocab_file=hparams.cpc_embedding_vocab_file,
        embedding_dim=hparams.cpc_embedding_dim)

    tf.estimator.train_and_evaluate(
        estimator, training_spec, evaluation_spec)
def run(target, cluster_spec, is_chief, hparams):
    """Build the evaluation and training graphs and run the training loop.

    Args:
        target (str): TensorFlow server target, passed as ``master`` to the
            monitored training session.
        cluster_spec: Cluster specification handed to
            ``tf.train.replica_device_setter``.
        is_chief (bool): Whether this process is the chief. Only the chief
            builds the evaluation graph and hook and runs the final export.
        hparams: Hyperparameter object; the attributes read here are
            ``first_layer_size``, ``scale_factor``, ``num_layers``,
            ``eval_files``, ``eval_steps``, ``eval_batch_size``,
            ``learning_rate``, ``job_dir``, ``eval_frequency``,
            ``train_files``, ``num_epochs``, ``train_batch_size``,
            ``train_steps`` and ``export_format``.
    """
    # Layer i has first_layer_size * scale_factor**i units, never fewer
    # than 2.
    hidden_units = [
        max(2, int(hparams.first_layer_size * hparams.scale_factor**depth))
        for depth in range(hparams.num_layers)
    ]

    # Non-chief processes train without extra hooks. The chief (assigned to
    # `master` here) additionally owns a separate evaluation graph plus the
    # hook that runs it.
    #
    # See https://youtu.be/la_M6bCV91M?t=1203 for background on distributed
    # TensorFlow and the role of the chief.
    hooks = []
    if is_chief:
        tf.logging.info('Created DNN hidden units {}'.format(hidden_units))
        evaluation_graph = tf.Graph()
        with evaluation_graph.as_default():
            # With eval_steps set the eval input repeats indefinitely
            # (num_epochs=None); otherwise it makes a single pass.
            eval_features, eval_labels = model.input_fn(
                hparams.eval_files,
                num_epochs=None if hparams.eval_steps else 1,
                batch_size=hparams.eval_batch_size,
                shuffle=False)

            # In EVAL mode model_fn hands back the metric dictionary.
            metric_dict = model.model_fn(
                model.EVAL,
                eval_features.copy(),
                eval_labels,
                hidden_units=hidden_units,
                learning_rate=hparams.learning_rate)

        hooks.append(
            EvaluationRunHook(
                hparams.job_dir,
                metric_dict,
                evaluation_graph,
                hparams.eval_frequency,
                eval_steps=hparams.eval_steps,
            ))

    # Training happens in its own fresh default graph.
    with tf.Graph().as_default():
        # Device placement is delegated to the replica device setter.
        #
        # See:
        # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
        with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
            train_features, train_labels = model.input_fn(
                hparams.train_files,
                num_epochs=hparams.num_epochs,
                batch_size=hparams.train_batch_size)

            # In TRAIN mode model_fn returns the train op and the global
            # step tensor.
            train_op, global_step_tensor = model.model_fn(
                model.TRAIN,
                train_features.copy(),
                train_labels,
                hidden_units=hidden_units,
                learning_rate=hparams.learning_rate)

        # The monitored session handles initialization, recovery and hooks.
        # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=is_chief,
                checkpoint_dir=hparams.job_dir,
                hooks=hooks,
                save_checkpoint_secs=20,
                save_summaries_steps=50) as sess:
            # Start from the current value of the global step tensor.
            current_step = global_step_tensor.eval(session=sess)

            # Keep stepping until train_steps is reached (when set) or the
            # session asks to stop.
            while (hparams.train_steps is None or
                   current_step < hparams.train_steps
                  ) and not sess.should_stop():
                current_step, _ = sess.run([global_step_tensor, train_op])

        # Look up the most recent checkpoint written to the job directory.
        latest_checkpoint = tf.train.latest_checkpoint(hparams.job_dir)

        # Exporting is the chief's job only.
        if is_chief:
            build_and_run_exports(
                latest_checkpoint,
                hparams.job_dir,
                model.SERVING_INPUT_FUNCTIONS[hparams.export_format],
                hidden_units)
def run(target, cluster_spec, is_chief, args):
    """Build the evaluation and training graphs and run the training loop.

    Args:
        target (str): TensorFlow server target, passed as ``master`` to the
            monitored training session.
        cluster_spec: Cluster specification handed to
            ``tf.train.replica_device_setter``.
        is_chief (bool): Whether this process is the chief. Only the chief
            builds the evaluation graph and hook and runs the final export.
        args: Argument object; the attributes read here are
            ``first_layer_size``, ``scale_factor``, ``num_layers``,
            ``eval_files``, ``eval_steps``, ``eval_batch_size``,
            ``learning_rate``, ``job_dir``, ``eval_frequency``,
            ``train_files``, ``num_epochs``, ``train_batch_size``,
            ``train_steps`` and ``export_format``.
    """
    # Layer i has first_layer_size * scale_factor**i units, never fewer
    # than 2.
    hidden_units = [
        max(2, int(args.first_layer_size * args.scale_factor**depth))
        for depth in range(args.num_layers)
    ]

    # Non-chief processes train without extra hooks. The chief (assigned to
    # `master` here) additionally owns a separate evaluation graph plus the
    # hook that runs it.
    #
    # See https://youtu.be/la_M6bCV91M?t=1203 for background on distributed
    # TensorFlow and the role of the chief.
    hooks = []
    if is_chief:
        tf.logging.info('Created DNN hidden units {}'.format(hidden_units))
        evaluation_graph = tf.Graph()
        with evaluation_graph.as_default():
            # With eval_steps set the eval input repeats indefinitely
            # (num_epochs=None); otherwise it makes a single pass.
            eval_features, eval_labels = model.input_fn(
                args.eval_files,
                num_epochs=None if args.eval_steps else 1,
                batch_size=args.eval_batch_size,
                shuffle=False)

            # In EVAL mode model_fn hands back the metric dictionary.
            metric_dict = model.model_fn(
                model.EVAL,
                eval_features.copy(),
                eval_labels,
                hidden_units=hidden_units,
                learning_rate=args.learning_rate)

        hooks.append(
            EvaluationRunHook(
                args.job_dir,
                metric_dict,
                evaluation_graph,
                args.eval_frequency,
                eval_steps=args.eval_steps,
            ))

    # Training happens in its own fresh default graph.
    with tf.Graph().as_default():
        # Device placement is delegated to the replica device setter.
        #
        # See:
        # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
        with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
            train_features, train_labels = model.input_fn(
                args.train_files,
                num_epochs=args.num_epochs,
                batch_size=args.train_batch_size)

            # In TRAIN mode model_fn returns the train op and the global
            # step tensor.
            train_op, global_step_tensor = model.model_fn(
                model.TRAIN,
                train_features.copy(),
                train_labels,
                hidden_units=hidden_units,
                learning_rate=args.learning_rate)

        # The monitored session handles initialization, recovery and hooks.
        # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=is_chief,
                checkpoint_dir=args.job_dir,
                hooks=hooks,
                save_checkpoint_secs=20,
                save_summaries_steps=50) as sess:
            # Start from the current value of the global step tensor.
            current_step = global_step_tensor.eval(session=sess)

            # Keep stepping until train_steps is reached (when set) or the
            # session asks to stop.
            while (args.train_steps is None or
                   current_step < args.train_steps
                  ) and not sess.should_stop():
                current_step, _ = sess.run([global_step_tensor, train_op])

        # Look up the most recent checkpoint written to the job directory.
        latest_checkpoint = tf.train.latest_checkpoint(args.job_dir)

        # Exporting is the chief's job only.
        if is_chief:
            build_and_run_exports(
                latest_checkpoint,
                args.job_dir,
                model.SERVING_INPUT_FUNCTIONS[args.export_format],
                hidden_units)
def eval_input():
    """Return the evaluation input produced by ``model.input_fn``.

    Uses the free variables ``hparams`` (for ``eval_files``) and
    ``tf_transform_output``, with a batch size of ``EVAL_BATCH_SIZE``.
    """
    evaluation_batches = model.input_fn(
        hparams.eval_files,
        tf_transform_output,
        batch_size=EVAL_BATCH_SIZE)
    return evaluation_batches
def train_input():
    """Return the training input produced by ``model.input_fn``.

    Uses the free variables ``hparams`` (for ``train_files``) and
    ``tf_transform_output``, with a batch size of ``TRAIN_BATCH_SIZE``.
    """
    training_batches = model.input_fn(
        hparams.train_files,
        tf_transform_output,
        batch_size=TRAIN_BATCH_SIZE)
    return training_batches
def run(target, cluster_spec, is_chief, train_steps, eval_steps, job_dir,
        train_files, eval_files, train_batch_size, eval_batch_size,
        learning_rate, eval_frequency, first_layer_size, num_layers,
        scale_factor, num_epochs, export_format):
    """Build the evaluation and training graphs and run the training loop.

    Args:
        target (string): TensorFlow server target, passed as ``master`` to
            the monitored training session.
        cluster_spec: Cluster specification handed to
            ``tf.train.replica_device_setter``.
        is_chief (bool): Whether this process is the chief. Only the chief
            builds the evaluation graph and hook and runs the final export.
        train_steps (int): Maximum number of training steps; ``None`` removes
            the step limit from the loop condition.
        eval_steps (int): Number of steps to run evaluation for at each
            checkpoint. When it is falsy the evaluation input is built for a
            single epoch.
        job_dir (string): Output directory for checkpoints and summaries.
        train_files (string): List of CSV files to read training data from.
        eval_files (string): List of CSV files to read evaluation data from.
        train_batch_size (int): Batch size for training.
        eval_batch_size (int): Batch size for evaluation.
        learning_rate (float): Learning rate forwarded to ``model.model_fn``.
        eval_frequency (int): Evaluate every n training steps. Evaluating
            too often costs performance; evaluating too rarely delays
            knowing when to stop training.
        first_layer_size (int): Size of the first DNN layer.
        num_layers (int): Number of hidden layers in the DNN.
        scale_factor (float): Decay rate for the size of hidden layers.
        num_epochs (int): Maximum number of training data epochs.
        export_format (str): Key into ``model.SERVING_INPUT_FUNCTIONS``; one
            of 'JSON', 'CSV' or 'EXAMPLE'.
    """
    # Layer i has first_layer_size * scale_factor**i units, never fewer
    # than 2.
    hidden_units = [
        max(2, int(first_layer_size * scale_factor**depth))
        for depth in range(num_layers)
    ]

    # Non-chief processes train without extra hooks. The chief (assigned to
    # `master` here) additionally owns a separate evaluation graph plus the
    # hook that runs it.
    #
    # See https://youtu.be/la_M6bCV91M?t=1203 for background on distributed
    # TensorFlow and the role of the chief.
    hooks = []
    if is_chief:
        tf.logging.info("Created DNN hidden units {}".format(hidden_units))
        evaluation_graph = tf.Graph()
        with evaluation_graph.as_default():
            # With eval_steps set the eval input repeats indefinitely
            # (num_epochs=None); otherwise it makes a single pass.
            eval_features, eval_labels = model.input_fn(
                eval_files,
                num_epochs=None if eval_steps else 1,
                batch_size=eval_batch_size,
                shuffle=False)

            # In EVAL mode model_fn hands back the metric dictionary.
            metric_dict = model.model_fn(
                model.EVAL,
                eval_features.copy(),
                eval_labels,
                hidden_units=hidden_units,
                learning_rate=learning_rate)

        hooks.append(
            EvaluationRunHook(
                job_dir,
                metric_dict,
                evaluation_graph,
                eval_frequency,
                eval_steps=eval_steps,
            ))

    # Training happens in its own fresh default graph.
    with tf.Graph().as_default():
        # Device placement is delegated to the replica device setter.
        #
        # See:
        # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
        with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
            train_features, train_labels = model.input_fn(
                train_files,
                num_epochs=num_epochs,
                batch_size=train_batch_size)

            # In TRAIN mode model_fn returns the train op and the global
            # step tensor.
            train_op, global_step_tensor = model.model_fn(
                model.TRAIN,
                train_features.copy(),
                train_labels,
                hidden_units=hidden_units,
                learning_rate=learning_rate)

        # The monitored session handles initialization, recovery and hooks.
        # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=is_chief,
                checkpoint_dir=job_dir,
                hooks=hooks,
                save_checkpoint_secs=20,
                save_summaries_steps=50) as sess:
            # Start from the current value of the global step tensor.
            current_step = global_step_tensor.eval(session=sess)

            # Keep stepping until train_steps is reached (when set) or the
            # session asks to stop.
            while (train_steps is None or
                   current_step < train_steps) and not sess.should_stop():
                current_step, _ = sess.run([global_step_tensor, train_op])

        # Look up the most recent checkpoint written to the job directory.
        latest_checkpoint = tf.train.latest_checkpoint(job_dir)

        # Exporting is the chief's job only.
        if is_chief:
            build_and_run_exports(
                latest_checkpoint,
                job_dir,
                model.SERVING_INPUT_FUNCTIONS[export_format],
                hidden_units)