def train_mnist_distributed_sync_replicas(task_id,
                                          is_chief,
                                          num_worker_tasks,
                                          num_ps_tasks,
                                          master,
                                          num_epochs,
                                          op_strategy,
                                          use_fake_data=False):
  """Train a ConvNet on MNIST using the sync-replicas optimizer.

  Args:
    task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
    is_chief: `boolean`, `True` if the worker is the chief worker.
    num_worker_tasks: int. Number of workers in this distributed training
      setup.
    num_ps_tasks: int. Number of parameter servers holding variables.
    master: string. IP and port of the TensorFlow runtime process.
    num_epochs: int. Number of passes to make over the training set.
    op_strategy: `string`. Strategy for running the covariance and inverse
      ops. If op_strategy == `chief_worker`, the covariance and inverse update
      ops are run on the chief worker; otherwise they are run on dedicated
      workers.
    use_fake_data: bool. If True, generate a synthetic dataset.

  Returns:
    Accuracy of the model on the final minibatch of training data.

  Raises:
    ValueError: If `op_strategy` is not in ["chief_worker",
      "dedicated_workers"].
  """
  # Load a dataset.
  tf.logging.info("Loading MNIST into memory.")
  (examples, labels) = mnist.load_mnist_as_iterator(num_epochs,
                                                    128,
                                                    use_fake_data=use_fake_data,
                                                    flatten_images=False)

  # Build a ConvNet.
  layer_collection = kfac.LayerCollection()
  with tf.device(tf.train.replica_device_setter(num_ps_tasks)):
    loss, accuracy = build_model(
        examples, labels, num_labels=10, layer_collection=layer_collection,
        register_layers_manually=_USE_MANUAL_REG)
  if not _USE_MANUAL_REG:
    layer_collection.auto_register_layers()

  # Fit model.
  checkpoint_dir = None
  if op_strategy == "chief_worker":
    return distributed_grads_only_and_ops_chief_worker(
        task_id, is_chief, num_worker_tasks, num_ps_tasks, master,
        checkpoint_dir, loss, accuracy, layer_collection)
  elif op_strategy == "dedicated_workers":
    return distributed_grads_and_ops_dedicated_workers(
        task_id, is_chief, num_worker_tasks, num_ps_tasks, master,
        checkpoint_dir, loss, accuracy, layer_collection)
  else:
    raise ValueError("Only supported op strategies are: {}, {}".format(
        "chief_worker", "dedicated_workers"))
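

# Illustrative usage sketch (not part of the original example): a single
# worker process might call the function as follows. The cluster spec, the
# FLAGS values, and the tf.train.Server below are assumptions about how the
# caller sets up the distributed job.
#
#   server = tf.train.Server(cluster_spec, job_name="worker",
#                            task_index=FLAGS.task_id)
#   accuracy = train_mnist_distributed_sync_replicas(
#       task_id=FLAGS.task_id,
#       is_chief=(FLAGS.task_id == 0),
#       num_worker_tasks=FLAGS.num_workers,
#       num_ps_tasks=FLAGS.num_ps,
#       master=server.target,
#       num_epochs=1,
#       op_strategy="chief_worker")
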
def train_mnist_single_machine(num_epochs,
                               use_fake_data=False,
                               device=None,
                               manual_op_exec=False):
  """Train a ConvNet on MNIST.

  Args:
    num_epochs: int. Number of passes to make over the training set.
    use_fake_data: bool. If True, generate a synthetic dataset.
    device: string or None. The covariance and inverse update ops are run on
      this device. If empty or None, the default device will be used.
      (Default: None)
    manual_op_exec: bool. If `True`, `minimize_loss_single_machine_manual` is
      called for training; it runs the inverse and covariance computations
      explicitly and is included for illustrative purposes only. Otherwise
      `minimize_loss_single_machine` is called, which relies on
      `PeriodicInvCovUpdateKfacOpt` for op placement and execution.

  Returns:
    Accuracy of the model on the final minibatch of training data.
  """
  # Load a dataset.
  tf.logging.info("Loading MNIST into memory.")
  (examples, labels) = mnist.load_mnist_as_iterator(num_epochs,
                                                    128,
                                                    use_fake_data=use_fake_data,
                                                    flatten_images=False)

  # Build a ConvNet.
  layer_collection = kfac.LayerCollection()
  loss, accuracy = build_model(examples, labels, num_labels=10,
                               layer_collection=layer_collection,
                               register_layers_manually=_USE_MANUAL_REG)
  if not _USE_MANUAL_REG:
    layer_collection.auto_register_layers()

  # Without setting allow_soft_placement=True there will be problems when the
  # optimizer tries to place certain ops like "mod" on the GPU (which isn't
  # supported).
  config = tf.ConfigProto(allow_soft_placement=True)

  # Fit model.
  if manual_op_exec:
    return minimize_loss_single_machine_manual(loss, accuracy,
                                               layer_collection,
                                               device=device,
                                               session_config=config)
  else:
    return minimize_loss_single_machine(loss, accuracy, layer_collection,
                                        device=device, session_config=config)
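

# Illustrative usage sketch (not part of the original example): a quick
# single-machine smoke test on synthetic data, with the covariance and inverse
# update ops placed on the first GPU. The device string is only an example.
#
#   accuracy = train_mnist_single_machine(num_epochs=1,
#                                         use_fake_data=True,
#                                         device="/gpu:0")
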
def input_fn():
  """Returns an MNIST `(examples, labels)` iterator for training.

  Note: `num_epochs` and `use_fake_data` are free variables here; this
  function is intended to be nested in (or close over) a scope that defines
  them.
  """
  tf.logging.info("Loading MNIST into memory.")
  return mnist.load_mnist_as_iterator(num_epochs=num_epochs,
                                      batch_size=64,
                                      flatten_images=False,
                                      use_fake_data=use_fake_data)
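

# Illustrative usage sketch (assumption: a zero-argument input function like
# the one above is the form expected by `tf.estimator.Estimator.train`; the
# `_model_fn` and model directory below are hypothetical and not part of this
# excerpt):
#
#   estimator = tf.estimator.Estimator(model_fn=_model_fn,
#                                      model_dir="/tmp/kfac_mnist")
#   estimator.train(input_fn=input_fn)
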
def train_mnist_multitower(num_epochs,
                           num_towers,
                           devices,
                           use_fake_data=False,
                           session_config=None):
  """Train a ConvNet on MNIST.

  Training data is split equally among the towers. Each tower computes the
  loss on its own batch of data, and the losses are aggregated on the CPU.
  The model variables are placed on the first tower. The covariance and
  inverse update ops and variables are placed on the specified devices in a
  round-robin manner.

  Args:
    num_epochs: int. Number of passes to make over the training set.
    num_towers: int. Number of towers. Note that this argument is overridden
      below: one tower is built per entry in `devices`, or a single tower if
      `devices` is empty.
    devices: list of strings. List of devices to place the towers on.
    use_fake_data: bool. If True, generate a synthetic dataset.
    session_config: None or tf.ConfigProto. Configuration for tf.Session().

  Returns:
    Accuracy of the model on the final minibatch of training data.
  """
  # One tower per device; fall back to a single tower if no devices are given.
  num_towers = 1 if not devices else len(devices)

  # Load a dataset.
  tower_batch_size = 128
  batch_size = tower_batch_size * num_towers
  tf.logging.info(
      ("Loading MNIST into memory. Using batch_size = %d = %d towers * %d "
       "tower batch size.") % (batch_size, num_towers, tower_batch_size))
  (examples, labels) = mnist.load_mnist_as_iterator(num_epochs,
                                                    batch_size,
                                                    use_fake_data=use_fake_data,
                                                    flatten_images=False)

  # Split minibatch across towers.
  examples = tf.split(examples, num_towers)
  labels = tf.split(labels, num_towers)

  # Build a ConvNet. Each tower's layers will be added to the LayerCollection.
  layer_collection = kfac.LayerCollection()
  tower_results = []
  for tower_id in range(num_towers):
    with tf.device(devices[tower_id]):
      with tf.name_scope("tower%d" % tower_id):
        with tf.variable_scope(tf.get_variable_scope(), reuse=(tower_id > 0)):
          tf.logging.info("Building tower %d." % tower_id)
          tower_results.append(
              build_model(examples[tower_id], labels[tower_id], 10,
                          layer_collection,
                          register_layers_manually=_USE_MANUAL_REG))
  losses, accuracies = zip(*tower_results)

  # When using multiple towers we only want to perform automatic registration
  # once, after the final tower has been built.
  if not _USE_MANUAL_REG:
    layer_collection.auto_register_layers()

  # Average across towers.
  loss = tf.reduce_mean(losses)
  accuracy = tf.reduce_mean(accuracies)

  # Fit model.
  g_step = tf.train.get_or_create_global_step()
  optimizer = kfac.PeriodicInvCovUpdateKfacOpt(
      invert_every=_INVERT_EVERY,
      cov_update_every=_COV_UPDATE_EVERY,
      learning_rate=0.0001,
      cov_ema_decay=0.95,
      damping=0.001,
      layer_collection=layer_collection,
      placement_strategy="round_robin",
      cov_devices=devices,
      inv_devices=devices,
      trans_devices=devices,
      momentum=0.9)

  with tf.device(devices[0]):
    train_op = optimizer.minimize(loss, global_step=g_step)

  # Without setting allow_soft_placement=True there will be problems when the
  # optimizer tries to place certain ops like "mod" on the GPU (which isn't
  # supported).
  if not session_config:
    session_config = tf.ConfigProto(allow_soft_placement=True)

  tf.logging.info("Starting training.")
  accuracy_ = None
  with tf.train.MonitoredTrainingSession(config=session_config) as sess:
    while not sess.should_stop():
      global_step_, loss_, accuracy_, _ = sess.run(
          [g_step, loss, accuracy, train_op])

      if global_step_ % _REPORT_EVERY == 0:
        tf.logging.info("global_step: %d | loss: %f | accuracy: %s",
                        global_step_, loss_, accuracy_)

  # Accuracy on the final minibatch of training data.
  return accuracy_
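

# Illustrative usage sketch (not part of the original example): two towers,
# one per GPU, with the covariance and inverse ops distributed round-robin
# over the same two devices. The device strings are only an example.
#
#   accuracy = train_mnist_multitower(num_epochs=1,
#                                     num_towers=2,
#                                     devices=["/gpu:0", "/gpu:1"],
#                                     use_fake_data=True)
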