def testErrorIfUsedBeforeMinimizeCalled(self):
  opt = training.SyncReplicasOptimizer(
      opt=gradient_descent.GradientDescentOptimizer(1.0),
      replicas_to_aggregate=1,
      total_num_replicas=1)
  hook = opt.make_session_run_hook(True)
  with self.assertRaisesRegex(ValueError, "apply_gradient should be called"):
    hook.begin()
def testCanCreatedBeforeMinimizeCalled(self):
  """This behavior is required to be integrated with Estimators."""
  opt = training.SyncReplicasOptimizer(
      opt=gradient_descent.GradientDescentOptimizer(1.0),
      replicas_to_aggregate=1,
      total_num_replicas=1)
  hook = opt.make_session_run_hook(True)
  v = variables.VariableV1([0.])
  global_step = variables.VariableV1(0, name="global_step", trainable=False)
  opt.minimize(v, global_step=global_step)
  hook.begin()
def testFetchVariableList(self):
  opt = training.SyncReplicasOptimizer(
      opt=adam.AdamOptimizer(0.01),
      replicas_to_aggregate=1,
      total_num_replicas=1)
  v = variables.Variable([0.], name="fetch_variable_test")
  global_step = variables.Variable(0, name="global_step", trainable=False)
  opt.minimize(v, global_step=global_step)
  opt_variables = opt.variables()
  self.assertIn(opt._opt._beta1_power, opt_variables)
  self.assertIn(opt._opt._beta2_power, opt_variables)
def get_workers(num_workers, replicas_to_aggregate, workers):
  sessions = []
  graphs = []
  train_ops = []
  for worker_id in range(num_workers):
    graph = ops.Graph()
    is_chief = (worker_id == 0)
    with graph.as_default():
      with ops.device("/job:ps/task:0"):
        global_step = variables.VariableV1(
            0, name="global_step", trainable=False)
        var_0 = variables.VariableV1(0.0, name="v0")
      with ops.device("/job:ps/task:1"):
        var_1 = variables.VariableV1(1.0, name="v1")
        var_sparse = variables.VariableV1([[3.0], [4.0]], name="v_sparse")

      with ops.device("/job:worker/task:" + str(worker_id)):
        grads_0 = constant_op.constant(0.1 + worker_id * 0.2)
        grads_1 = constant_op.constant(0.9 + worker_id * 0.2)
        # This is to test against sparse gradients.
        grads_sparse = ops.IndexedSlices(
            constant_op.constant([0.1 + worker_id * 0.2], shape=[1, 1]),
            constant_op.constant([1]),
            constant_op.constant([2, 1]))

        sgd_opt = gradient_descent.GradientDescentOptimizer(2.0)
        sync_rep_opt = training.SyncReplicasOptimizer(
            sgd_opt,
            replicas_to_aggregate=replicas_to_aggregate,
            total_num_replicas=num_workers)
        train_op = [
            sync_rep_opt.apply_gradients(
                zip([grads_0, grads_1, grads_sparse],
                    [var_0, var_1, var_sparse]),
                global_step=global_step)
        ]
        sync_replicas_hook = sync_rep_opt.make_session_run_hook(
            is_chief, num_tokens=num_workers)

      # Creates MonitoredSession
      session = training.MonitoredTrainingSession(
          master=workers[worker_id].target,
          is_chief=is_chief,
          hooks=[sync_replicas_hook])

    sessions.append(session)
    graphs.append(graph)
    train_ops.append(train_op)

  return sessions, graphs, train_ops
def get_workers(num_workers, replicas_to_aggregate, workers):
  sessions = []
  graphs = []
  train_ops = []
  for worker_id in range(num_workers):
    graph = ops.Graph()
    is_chief = (worker_id == 0)
    with graph.as_default():
      with ops.device("/job:ps/task:0"):
        global_step = variables.Variable(
            0, name="global_step", trainable=False)
        var_0 = variables.Variable(0.0, name="v0")
      with ops.device("/job:ps/task:1"):
        var_1 = variables.Variable(1.0, name="v1")
        var_sparse = variables.Variable([[3.0], [4.0]], name="v_sparse")

      with ops.device("/job:worker/task:" + str(worker_id)):
        grads_0 = constant_op.constant(0.1 + worker_id * 0.2)
        grads_1 = constant_op.constant(0.9 + worker_id * 0.2)
        # This is to test against sparse gradients.
        grads_sparse = ops.IndexedSlices(
            constant_op.constant(
                [0.1 + worker_id * 0.2], shape=[1, 1]),
            constant_op.constant([1]),
            constant_op.constant([2, 1]))

        sgd_opt = gradient_descent.GradientDescentOptimizer(2.0)
        sync_rep_opt = training.SyncReplicasOptimizer(
            sgd_opt,
            replicas_to_aggregate=replicas_to_aggregate,
            total_num_replicas=num_workers)
        train_op = [
            sync_rep_opt.apply_gradients(
                zip([grads_0, grads_1, grads_sparse],
                    [var_0, var_1, var_sparse]),
                global_step=global_step)
        ]

      init_op = variables.global_variables_initializer()
      # Needed ops from the sync_rep optimizer. This is mainly for the
      # local_step initialization.
      local_init_op = sync_rep_opt.local_step_init_op
      if is_chief:
        local_init_op = sync_rep_opt.chief_init_op
      ready_for_local_init_op = sync_rep_opt.ready_for_local_init_op

      # Chief_queue_runner
      chief_queue_runner = sync_rep_opt.get_chief_queue_runner()
      sync_init_op = sync_rep_opt.get_init_tokens_op(num_workers)

    # Creates session for chief.
    supervisor = supervisor_lib.Supervisor(
        graph=graph,
        is_chief=is_chief,
        recovery_wait_secs=1,
        init_op=init_op,
        local_init_op=local_init_op,
        ready_for_local_init_op=ready_for_local_init_op)
    session = supervisor.prepare_or_wait_for_session(workers[worker_id].target)

    # Chief should execute the sync_init_op and start the chief queue runner.
    if is_chief:
      session.run(sync_init_op)
      supervisor.StartQueueRunners(session, [chief_queue_runner])

    sessions.append(session)
    graphs.append(graph)
    train_ops.append(train_op)

  return sessions, graphs, train_ops
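# A minimal sketch of how a test might drive get_workers: spin up a local
# in-process cluster, build one replica graph per worker, and run one training
# step on each worker concurrently so the synchronized update can complete.
# This is illustrative, not part of the tested API: the helper names
# _run_one_step and _sketch_single_sync_step are hypothetical, and it assumes
# `threading` and `test` (tensorflow.python.platform.test, which provides
# create_local_cluster) are imported at module level.
def _run_one_step(train_op, sess):
  sess.run(train_op)


def _sketch_single_sync_step(num_workers=2, replicas_to_aggregate=2):
  # create_local_cluster returns (worker_servers, ps_servers); get_workers
  # expects the worker servers so it can connect each session to its target.
  workers, _ = test.create_local_cluster(num_workers=num_workers, num_ps=2)
  sessions, _, train_ops = get_workers(
      num_workers, replicas_to_aggregate, workers)

  # Each worker contributes one gradient; the variables on the ps tasks are
  # only updated once replicas_to_aggregate gradients have been accumulated,
  # so the steps are run in threads to avoid blocking on a single worker.
  threads = []
  for worker_id in range(num_workers):
    t = threading.Thread(
        target=_run_one_step,
        args=(train_ops[worker_id], sessions[worker_id]))
    threads.append(t)
    t.start()
  for t in threads:
    t.join()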