def testAccumulatorSizeEmpty(self):
    # An accumulator that has had nothing applied to it must report a count
    # of zero accumulated gradients.
    with self.cached_session():
        accumulator = data_flow_ops.ConditionalAccumulator(
            dtypes_lib.float32, name="Q")
        accumulated_count = accumulator.num_accumulated().eval()
        self.assertEqual(accumulated_count, 0)
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Apply gradients to variables.

    This contains most of the synchronization implementation and also wraps
    the apply_gradients() from the real optimizer.

    Args:
      grads_and_vars: Iterable of (gradient, variable) pairs as returned by
        compute_gradients(). It is materialized once up front, so a one-shot
        iterable (e.g. a `zip`) is accepted and an empty one is rejected.
      global_step: Variable to increment by one after the variables have been
        updated. Required, despite the `None` default.
      name: Accepted for interface compatibility; this method does not use it.

    Returns:
      train_op: The op to dequeue a token so the replicas can exit this batch
        and start the next one. This is executed by each replica.

    Raises:
      ValueError: If the grads_and_vars is empty.
      ValueError: If global step is not provided, the staleness cannot be
        checked.
      ValueError: If a gradient is neither None, a Tensor nor an
        IndexedSlices.
    """
    # Materialize first: an iterator is always truthy, so without this an
    # empty `zip(...)` would slip past the emptiness check below.
    if grads_and_vars is not None:
        grads_and_vars = tuple(grads_and_vars)
    if not grads_and_vars:
        raise ValueError("Must supply at least one variable")

    if global_step is None:
        raise ValueError("Global step is required to check staleness")

    self._global_step = global_step
    train_ops = []
    aggregated_grad = []
    var_list = []

    # local_anchor op will be placed on this worker task by default.
    local_anchor = control_flow_ops.no_op()
    # Colocating local_step variable prevents it being placed on the PS.
    distribution_strategy = distribution_strategy_context.get_strategy()
    with distribution_strategy.extended.colocate_vars_with(local_anchor):
        self._local_step = variable_scope.variable(
            initial_value=0,
            trainable=False,
            collections=[ops.GraphKeys.LOCAL_VARIABLES],
            dtype=global_step.dtype.base_dtype,
            name="sync_rep_local_step")

    self.local_step_init_op = state_ops.assign(self._local_step, global_step)
    chief_init_ops = [self.local_step_init_op]
    self.ready_for_local_init_op = variables.report_uninitialized_variables(
        variables.global_variables())

    with ops.name_scope(None, self._name):
        for grad, var in grads_and_vars:
            var_list.append(var)
            with ops.device(var.device):
                if grad is None:
                    # Pass-through: no accumulator is created for this
                    # variable.
                    aggregated_grad.append(None)
                    continue
                elif isinstance(grad, ops.Tensor):
                    # Dense gradients.
                    grad_accum = data_flow_ops.ConditionalAccumulator(
                        grad.dtype,
                        shape=var.get_shape(),
                        shared_name=var.name + "/grad_accum")
                    train_ops.append(grad_accum.apply_grad(
                        grad, local_step=self._local_step))
                    aggregated_grad.append(grad_accum.take_grad(
                        self._replicas_to_aggregate))
                else:
                    # Sparse gradients.
                    if not isinstance(grad, ops.IndexedSlices):
                        raise ValueError("Unknown grad type!")
                    grad_accum = data_flow_ops.SparseConditionalAccumulator(
                        grad.dtype,
                        shape=(),
                        shared_name=var.name + "/grad_accum")
                    train_ops.append(grad_accum.apply_indexed_slices_grad(
                        grad, local_step=self._local_step))
                    aggregated_grad.append(
                        grad_accum.take_indexed_slices_grad(
                            self._replicas_to_aggregate))

                self._accumulator_list.append((grad_accum, var.device))

        aggregated_grads_and_vars = zip(aggregated_grad, var_list)

        # sync_op will be assigned to the same device as the global step.
        with ops.device(global_step.device), ops.name_scope(""):
            update_op = self._opt.apply_gradients(aggregated_grads_and_vars,
                                                  global_step)

        # Create token queue.
        with ops.device(global_step.device), ops.name_scope(""):
            sync_token_queue = (
                data_flow_ops.FIFOQueue(-1,
                                        global_step.dtype.base_dtype,
                                        shapes=(),
                                        name="sync_token_q",
                                        shared_name="sync_token_q"))
            self._sync_token_queue = sync_token_queue

            # dummy_queue is passed to the queue runner. Don't use the real
            # queues because the queue runner doesn't automatically reopen it
            # once it closed queues in PS devices.
            dummy_queue = (
                data_flow_ops.FIFOQueue(1,
                                        types_pb2.DT_INT32,
                                        shapes=(),
                                        name="dummy_queue",
                                        shared_name="dummy_queue"))

        with ops.device(global_step.device), ops.name_scope(""):
            # Replicas have to wait until they can get a token from the token
            # queue.
            with ops.control_dependencies(train_ops):
                token = sync_token_queue.dequeue()
            train_op = state_ops.assign(self._local_step, token)

            with ops.control_dependencies([update_op]):
                # Sync_op needs to insert tokens to the token queue at the end
                # of the step so the replicas can fetch them to start the next
                # step.
                tokens = array_ops.fill([self._tokens_per_step], global_step)
                sync_op = sync_token_queue.enqueue_many((tokens,))

            if self._variable_averages is not None:
                with ops.control_dependencies([sync_op]), ops.name_scope(""):
                    sync_op = self._variable_averages.apply(
                        self._variables_to_average)

            self._chief_queue_runner = queue_runner.QueueRunner(dummy_queue,
                                                                [sync_op])
        for accum, dev in self._accumulator_list:
            with ops.device(dev):
                chief_init_ops.append(
                    accum.set_global_step(
                        global_step, name="SetGlobalStep"))
        self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
        self._gradients_applied = True
        return train_op
def testConstructorWithInvalidArg(self):
    # Constructing an accumulator with reduction_type="Invalid" is expected
    # to raise ValueError.
    fresh_graph = ops.Graph()
    with fresh_graph.as_default():
        with self.assertRaises(ValueError):
            data_flow_ops.ConditionalAccumulator(
                dtypes_lib.float32,
                name="Q",
                reduction_type="Invalid")
def apply_gradients(self, grads_and_vars, worker_id, global_step=None,
                    name=None, collect_cdfs=False):
    """Apply gradients to variables.

    This contains most of the synchronization implementation and also wraps
    the apply_gradients() from the real optimizer.

    Args:
      grads_and_vars: Iterable of (gradient, variable) pairs as returned by
        compute_gradients(). It is materialized once up front because it is
        traversed several times.
      worker_id: Integer index of the calling worker. It selects this
        worker's queue in `self._sync_token_queues` and is formatted into the
        "job:worker/task:%d" device string used for the apply-gradient ops.
      global_step: Variable to increment by one after the variables have been
        updated. Required, despite the `None` default.
      name: Accepted for interface compatibility; this method does not use it.
      collect_cdfs: If true, each aggregated gradient waits for
        `self._total_num_replicas` accumulated gradients; otherwise it waits
        for a single one.

    Returns:
      train_op: A Print op on the local step that runs only after every op
        collected in `train_ops` (gradient pushes and debug printers).

    Raises:
      ValueError: If the grads_and_vars is empty.
      ValueError: If global step is not provided, the staleness cannot be
        checked.
      ValueError: If a gradient is neither None, a Tensor nor an
        IndexedSlices.
    """
    # Materialize first: the pairs are iterated three times below, and an
    # iterator is always truthy so an empty one would pass the check.
    if grads_and_vars is not None:
        grads_and_vars = tuple(grads_and_vars)
    if not grads_and_vars:
        raise ValueError("Must supply at least one variable")

    if global_step is None:
        raise ValueError("Global step is required to check staleness")

    self._global_step = global_step
    train_ops = []
    aggregated_grad = []
    var_list = []
    printer_ops = []
    # One entry per (grad, var) pair, None where the gradient is None, so the
    # later phases can pair accumulators with gradients positionally.
    # self._accumulator_list only holds entries for non-None gradients and
    # therefore cannot be indexed with the grads_and_vars position.
    accumulators = []

    def f_pos():
        # tf.cond branch: push the global step onto the shared stop queue.
        enq_total_ops = self._stop_queue.enqueue(global_step)
        with ops.control_dependencies([enq_total_ops]):
            return tf.Print(global_step, [global_step],
                            message="Enquequed to stop queue")

    def f_neg():
        # tf.cond branch: nothing is enqueued.
        return tf.Print(global_step, [global_step],
                        message="Nothing to stop queue")

    self._local_step = variables.Variable(
        initial_value=0,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        dtype=global_step.dtype.base_dtype,
        name="sync_rep_local_step")
    self.local_step_init_op = state_ops.assign(self._local_step,
                                               global_step._ref())
    chief_init_ops = [self.local_step_init_op]
    self.ready_for_local_init_op = variables.report_uninitialized_variables(
        variables.all_variables())

    # The wait op waits for the current worker to dequeue a token from its
    # respective token queue.
    self._wait_op = self._sync_token_queues[worker_id].dequeue()

    # Replicas have to wait until they can get a token from the token queue
    # BEFORE beginning to compute gradients.
    with ops.device(global_step.device):
        update_local_step_op = state_ops.assign(self._local_step,
                                                global_step._ref())

    with ops.name_scope(None, self._name):
        # Gradient accumulator creation.
        for grad, var in grads_and_vars:
            var_list.append(var)
            tf.logging.info("Grad " + str(grad) + " assigned to " +
                            str(var.device))
            with ops.device(var.device):
                if grad is None:
                    accumulators.append(None)
                    continue
                elif isinstance(grad, ops.Tensor):
                    grad_accum = data_flow_ops.ConditionalAccumulator(
                        grad.dtype,
                        shape=var.get_shape(),
                        shared_name=var.name + "/grad_accum")
                else:
                    if not isinstance(grad, ops.IndexedSlices):
                        raise ValueError("Unknown grad type!")
                    grad_accum = data_flow_ops.SparseConditionalAccumulator(
                        grad.dtype,
                        shape=(),
                        shared_name=var.name + "/grad_accum")
                accumulators.append(grad_accum)
                self._accumulator_list.append((grad_accum, var))

        # Phase 1: push this worker's gradients into the accumulators.
        with ops.control_dependencies([update_local_step_op]):
            for index, ((grad, var), grad_accum) in enumerate(
                    zip(grads_and_vars, accumulators)):
                print_start_op = logging_ops.Print(
                    global_step, [global_step],
                    message="Starting to apply grads for variable %d" % index)
                with ops.device(var.device):
                    if grad is None:
                        continue
                    with ops.control_dependencies([print_start_op]):
                        with tf.device("job:worker/task:%d" % worker_id):
                            # The gradient type was validated when the
                            # accumulator was created.
                            if isinstance(grad, ops.Tensor):
                                apply_grad_op = grad_accum.apply_grad(
                                    grad,
                                    local_step=self._local_step._ref())
                            else:
                                apply_grad_op = (
                                    grad_accum.apply_indexed_slices_grad(
                                        grad,
                                        local_step=self._local_step._ref()))
                            with ops.control_dependencies([apply_grad_op]):
                                finished_print_op = logging_ops.Print(
                                    global_step, [global_step],
                                    message=
                                    "Done applying grads for variable %d" %
                                    index)
                                train_ops.append(finished_print_op)

                    # Once the gradient has been pushed, report accumulator
                    # fill levels and, if the first accumulator has reached
                    # the threshold, signal the stop queue.
                    with ops.control_dependencies([apply_grad_op]):
                        accum_sizes_printer = logging_ops.Print(
                            global_step,
                            [x[0].num_accumulated()
                             for x in self._accumulator_list] +
                            [worker_id] + [global_step],
                            message="Accum aggregated status on ps")
                        train_ops.append(accum_sizes_printer)

                        first_accum = self._accumulator_list[0][0]
                        ret = tf.cond(
                            tf.greater_equal(first_accum.num_accumulated(),
                                             self._constant_for_comparison),
                            f_pos, f_neg)
                        should_stop_list_printer = logging_ops.Print(
                            global_step, [ret],
                            message="Should stop ret val status on ps")
                        train_ops.append(should_stop_list_printer)

                        with ops.control_dependencies([ret]):
                            queue_total_printer = logging_ops.Print(
                                global_step, [self._stop_queue.size()],
                                message="shared should stop queue size")
                            train_ops.append(queue_total_printer)

        # Phase 2: take the aggregated gradient out of each accumulator.
        num_required = self._total_num_replicas if collect_cdfs else 1
        for (grad, var), grad_accum in zip(grads_and_vars, accumulators):
            with ops.device(var.device):
                if grad is None:
                    aggregated_grad.append(None)
                elif isinstance(grad, ops.Tensor):
                    aggregated_grad.append(
                        grad_accum.take_grad(num_required))
                else:
                    # Sparse accumulators must hand back an IndexedSlices so
                    # the wrapped optimizer can consume it.
                    aggregated_grad.append(
                        grad_accum.take_indexed_slices_grad(num_required))

        aggregated_grads_and_vars = zip(aggregated_grad, var_list)

        # Some debug operations.
        self.print_sizes = logging_ops.Print(
            global_step,
            [self._sync_token_queues[i].size()
             for i in range(self._total_num_replicas)],
            message="queue sizes")
        self.print_accum_sizes = logging_ops.Print(
            self._local_step,
            [x[0].num_accumulated() for x in self._accumulator_list] +
            [worker_id],
            message="Accum sizes")
        self.print_local_step = logging_ops.Print(
            self._local_step,
            [self._local_step._ref(), global_step._ref()],
            message="local vs global step")

        # sync_op will be assigned to the same device as the global step.
        with ops.device(global_step.device), ops.name_scope(""):
            with ops.control_dependencies([self.print_accum_sizes]):
                update_op = self._opt.apply_gradients(
                    aggregated_grads_and_vars, global_step)
                self._update_op = update_op

                # Drain whatever is currently in the stop queue.
                num_to_dequeue = self._stop_queue.size()
                deq_ops = self._stop_queue.dequeue_many(num_to_dequeue)
                with ops.control_dependencies([deq_ops]):
                    size_printer_2 = logging_ops.Print(
                        global_step, [self.print_accum_sizes],
                        message="Complelted the dequeue operation!")
                    printer_ops.append(size_printer_2)

                with ops.control_dependencies(printer_ops):
                    with ops.control_dependencies([update_op]):
                        # Hand every worker a token for the next step.
                        sync_op = []
                        for cur_worker_id in range(
                                self._total_num_replicas):
                            sync_op.append(
                                self._sync_token_queues[cur_worker_id].
                                enqueue(global_step))
                        sync_op = control_flow_ops.group(*(sync_op))

            # dummy_queue is passed to the queue runner. Don't use the real
            # queues because the queue runner doesn't automatically reopen it
            # once it closed queues in PS devices.
            dummy_queue = (data_flow_ops.FIFOQueue(
                1, types_pb2.DT_INT32, shapes=(),
                shared_name="dummy_queue"))
            self._chief_queue_runner = queue_runner.QueueRunner(
                dummy_queue, [sync_op])

        with ops.device(global_step.device), ops.name_scope(""):
            with ops.control_dependencies(train_ops):
                # Worker finished applying gradients.
                train_op = logging_ops.Print(
                    self._local_step._ref(),
                    [x[0].num_accumulated()
                     for x in self._accumulator_list] +
                    [worker_id] + [global_step],
                    message="Finished worker updates",
                    name="FinishedWorkerUpdatesPrint")

        for accum, var in self._accumulator_list:
            with ops.device(var.device):
                chief_init_ops.append(
                    accum.set_global_step(global_step, name="SetGlobalStep"))
        self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
        self._gradients_applied = True
        return train_op
def _train_op_fn(loss):
    """Build the grouped op that performs one training iteration."""
    if training_state_cache:
        train_op.append(
            training_state_cache.insert(tree_ids, node_ids, logits))

    if closed_form_grad_and_hess_fn:
        grads, hess = closed_form_grad_and_hess_fn(logits, labels)
    else:
        grads = gradients_impl.gradients(loss, logits, name='Gradients')[0]
        hess = gradients_impl.gradients(grads, logits, name='Hessians')[0]

    # One list of per-feature stats summaries for every feature group.
    stats_summaries_list = []
    for group_index, feature_ids in enumerate(feature_ids_list):
        group_buckets = bucket_size_list[group_index]
        group_summaries = []
        for feature_id in feature_ids:
            raw_summary = boosted_trees_ops.make_stats_summary(
                node_ids=node_ids,
                gradients=grads,
                hessians=hess,
                bucketized_features_list=[input_feature_list[feature_id]],
                max_splits=max_splits,
                num_buckets=group_buckets)
            group_summaries.append(array_ops.squeeze(raw_summary, axis=0))
        stats_summaries_list.append(group_summaries)

    accumulators = []

    def grow_tree_from_stats_summaries(summaries_per_group, ids_per_group):
        """Updates ensemble based on the best gains from stats summaries."""
        merged_node_ids = []
        merged_gains = []
        merged_thresholds = []
        merged_left_contribs = []
        merged_right_contribs = []
        merged_feature_ids = []
        assert len(summaries_per_group) == len(ids_per_group)

        for group_index, group_feature_ids in enumerate(ids_per_group):
            best = boosted_trees_ops.calculate_best_gains_per_feature(
                node_id_range=last_layer_nodes_range,
                stats_summary_list=summaries_per_group[group_index],
                l1=tree_hparams.l1,
                l2=tree_hparams.l2,
                tree_complexity=tree_hparams.tree_complexity,
                min_node_weight=tree_hparams.min_node_weight,
                max_splits=max_splits)
            (group_node_ids, group_gains, group_thresholds,
             group_left_contribs, group_right_contribs) = best

            # Concatenate this group's results onto the running lists.
            merged_feature_ids.extend(group_feature_ids)
            merged_node_ids.extend(group_node_ids)
            merged_gains.extend(group_gains)
            merged_thresholds.extend(group_thresholds)
            merged_left_contribs.extend(group_left_contribs)
            merged_right_contribs.extend(group_right_contribs)

        return boosted_trees_ops.update_ensemble(
            # Confirm if local_tree_ensemble or tree_ensemble should be used.
            tree_ensemble.resource_handle,
            feature_ids=merged_feature_ids,
            node_ids=merged_node_ids,
            gains=merged_gains,
            thresholds=merged_thresholds,
            left_node_contribs=merged_left_contribs,
            right_node_contribs=merged_right_contribs,
            learning_rate=tree_hparams.learning_rate,
            max_depth=tree_hparams.max_depth,
            pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING)

    if train_in_memory and is_single_machine:
        # Grow directly from this batch's summaries; no accumulators needed.
        train_op.append(distribute_lib.increment_var(global_step))
        train_op.append(
            grow_tree_from_stats_summaries(stats_summaries_list,
                                           feature_ids_list))
    else:
        dependencies = []
        for group_index, feature_ids in enumerate(feature_ids_list):
            # The stats consist of grads and hessians (the last dimension).
            accumulator_shape = [
                len(feature_ids), max_splits,
                bucket_size_list[group_index], 2
            ]
            accumulator = data_flow_ops.ConditionalAccumulator(
                dtype=dtypes.float32,
                shape=accumulator_shape,
                shared_name='numeric_stats_summary_accumulator_' +
                str(group_index))
            accumulators.append(accumulator)
            stacked_summaries = array_ops.stack(
                stats_summaries_list[group_index], axis=0)
            dependencies.append(
                accumulator.apply_grad(stacked_summaries, stamp_token))

        def grow_tree_from_accumulated_summaries_fn():
            """Updates the tree with the best layer from accumulated summaries."""
            # Take out the accumulated summaries from the accumulator and
            # grow.
            accumulated_summaries = []
            for accumulator in accumulators:
                accumulated_summaries.append(
                    array_ops.unstack(accumulator.take_grad(1), axis=0))
            return grow_tree_from_stats_summaries(accumulated_summaries,
                                                  feature_ids_list)

        with ops.control_dependencies(dependencies):
            train_op.append(distribute_lib.increment_var(global_step))
            if config.is_chief:
                accumulated_counts = [
                    acc.num_accumulated() for acc in accumulators
                ]
                min_accumulated = math_ops.reduce_min(
                    array_ops.stack(accumulated_counts))
                enough_batches = math_ops.greater_equal(
                    min_accumulated, n_batches_per_layer)
                train_op.append(
                    control_flow_ops.cond(
                        enough_batches,
                        grow_tree_from_accumulated_summaries_fn,
                        control_flow_ops.no_op,
                        name='wait_until_n_batches_accumulated'))

    return control_flow_ops.group(train_op, name='train_op')
def _train_op_fn(loss):
    """Build the grouped op that performs one training iteration."""
    train_op = []
    if cache:
        train_op.append(cache.insert(tree_ids, node_ids, logits))

    if closed_form_grad_and_hess_fn:
        grads, hess = closed_form_grad_and_hess_fn(logits, labels)
    else:
        grads = gradients_impl.gradients(loss, logits, name='Gradients')[0]
        hess = gradients_impl.gradients(grads, logits, name='Hessians')[0]

    # One squeezed stats summary per feature, in feature order.
    stats_summary_list = []
    for feature_index in range(num_features):
        raw_summary = boosted_trees_ops.make_stats_summary(
            node_ids=node_ids,
            gradients=grads,
            hessians=hess,
            bucketized_features_list=[input_feature_list[feature_index]],
            max_splits=max_splits,
            num_buckets=num_buckets)
        stats_summary_list.append(array_ops.squeeze(raw_summary, axis=0))

    def grow_tree_from_stats_summaries(summaries):
        """Updates ensemble based on the best gains from stats summaries."""
        node_id_bounds = array_ops.stack(
            [math_ops.reduce_min(node_ids),
             math_ops.reduce_max(node_ids)])
        best = boosted_trees_ops.calculate_best_gains_per_feature(
            node_id_range=node_id_bounds,
            stats_summary_list=summaries,
            l1=tree_hparams.l1,
            l2=tree_hparams.l2,
            tree_complexity=tree_hparams.tree_complexity,
            max_splits=max_splits)
        (best_node_ids, best_gains, best_thresholds, best_left_contribs,
         best_right_contribs) = best

        return boosted_trees_ops.update_ensemble(
            # Confirm if local_tree_ensemble or tree_ensemble should be used.
            tree_ensemble.resource_handle,
            feature_ids=math_ops.range(0, num_features, dtype=dtypes.int32),
            node_ids=best_node_ids,
            gains=best_gains,
            thresholds=best_thresholds,
            left_node_contribs=best_left_contribs,
            right_node_contribs=best_right_contribs,
            learning_rate=tree_hparams.learning_rate,
            max_depth=tree_hparams.max_depth,
            pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING)

    if train_in_memory and is_single_machine:
        # Grow directly from this batch's summaries; no accumulator needed.
        train_op.append(distribute_lib.increment_var(global_step))
        train_op.append(grow_tree_from_stats_summaries(stats_summary_list))
    else:
        # The stats consist of gradients and hessians (the last dimension).
        accumulator_shape = [num_features, max_splits, num_buckets, 2]
        summary_accumulator = data_flow_ops.ConditionalAccumulator(
            dtype=dtypes.float32,
            shape=accumulator_shape,
            shared_name='stats_summary_accumulator')
        stacked_summaries = array_ops.stack(stats_summary_list, axis=0)
        apply_grad = summary_accumulator.apply_grad(stacked_summaries,
                                                    stamp_token)

        def grow_tree_from_accumulated_summaries_fn():
            """Updates the tree with the best layer from accumulated summaries."""
            # Take out the accumulated summaries from the accumulator and
            # grow.
            accumulated = summary_accumulator.take_grad(1)
            return grow_tree_from_stats_summaries(
                array_ops.unstack(accumulated, axis=0))

        with ops.control_dependencies([apply_grad]):
            train_op.append(distribute_lib.increment_var(global_step))
            if config.is_chief:
                enough_batches = math_ops.greater_equal(
                    summary_accumulator.num_accumulated(),
                    n_batches_per_layer)
                train_op.append(
                    control_flow_ops.cond(
                        enough_batches,
                        grow_tree_from_accumulated_summaries_fn,
                        control_flow_ops.no_op,
                        name='wait_until_n_batches_accumulated'))

    return control_flow_ops.group(train_op, name='train_op')
def testAccumulatorApplyGradFloat32(self):
    # Applying a float32 gradient to a shape-[1] accumulator should run
    # without raising.
    with self.cached_session():
        accumulator_shape = tensor_shape.TensorShape([1])
        accumulator = data_flow_ops.ConditionalAccumulator(
            dtypes_lib.float32, name="Q", shape=accumulator_shape)
        accumulator.apply_grad((10.0,)).run()
def testAccumulatorSetGlobalStep(self):
    # Setting the accumulator's global step should run without raising.
    with self.cached_session():
        accumulator_shape = tensor_shape.TensorShape([1])
        accumulator = data_flow_ops.ConditionalAccumulator(
            dtypes_lib.float32, name="Q", shape=accumulator_shape)
        accumulator.set_global_step(1).run()
def _apply_averages():  # pylint: disable=missing-docstring
    # Local variables that received a gradient, paired positionally with the
    # entries of the "global_model" collection.
    worker_vars = [
        variable for gradient, variable in grads_and_vars
        if gradient is not None
    ]
    shared_vars = ops.get_collection_ref("global_model")

    # The sync queue is colocated with the global step.
    with ops.colocate_with(self._global_step):
        sync_queue = data_flow_ops.FIFOQueue(
            -1, [dtypes.bool], shapes=[[]], shared_name="sync_queue")

    push_ops = []
    averaged_vars = []
    with ops.name_scope(None, self._name + "/global"):
        for worker_var, shared_var in zip(worker_vars, shared_vars):
            # pylint: disable=protected-access
            # Get reference to the tensor,
            # this works with Variable and ResourceVariable
            worker_tensor = ops.convert_to_tensor(worker_var)
            # The accumulator lives on the same device as the global
            # variable it feeds.
            accum_device = shared_var.device
            with ops.device(accum_device):
                var_accum = data_flow_ops.ConditionalAccumulator(
                    worker_tensor.dtype,
                    shape=worker_tensor.get_shape(),
                    shared_name=shared_var.name + "/var_accum",
                )
                # Push the local value into the accumulator.
                push_op = var_accum.apply_grad(
                    worker_tensor, local_step=global_step)
                push_ops.append(push_op)
                # Take the accumulator's aggregate once enough replicas
                # have contributed.
                averaged = var_accum.take_grad(self._replicas_to_aggregate)
                averaged_vars.append(averaged)
                # Remember accumulator and corresponding device.
                self._accumulator_list.append((var_accum, accum_device))

    if not self._is_chief:
        # Non-chief workers dequeue a token; they block here until the
        # chief has enqueued one. The local values are pushed first.
        with ops.control_dependencies(push_ops), ops.device(
                global_step.device):
            sync_op = sync_queue.dequeue()
    else:
        # The chief writes the averaged values into the global variables,
        # bumps the shared global step, then releases the other workers.
        update_ops = []
        with ops.control_dependencies(push_ops):
            for averaged, shared_var in zip(averaged_vars, shared_vars):
                with ops.device(shared_var.device):
                    update_ops.append(state_ops.assign(shared_var, averaged))
            with ops.device(global_step.device):
                update_ops.append(
                    state_ops.assign_add(self._global_step, 1))
        # After averaging, push tokens to the queue.
        with ops.control_dependencies(update_ops), ops.device(
                global_step.device):
            tokens = array_ops.fill([self._tokens_per_step],
                                    constant_op.constant(False))
            sync_op = sync_queue.enqueue_many(tokens)

    # All workers pull averaged values once synchronization is done.
    with ops.control_dependencies([sync_op]):
        local_update_op = self._assign_vars(worker_vars, shared_vars)
    return local_update_op
def apply_gradients(self, grads_and_vars, worker_id, global_step=None,
                    name=None, collect_cdfs=False, matrix_to_solve=None,
                    num_batches_per_epoch=None):
    """Apply gradients to variables.

    This contains most of the synchronization implementation and also wraps
    the apply_gradients() from the real optimizer.

    Args:
      grads_and_vars: Iterable of (gradient, variable) pairs as returned by
        compute_gradients(). It is materialized once up front because it is
        traversed several times.
      worker_id: Integer index of the calling worker. It selects this
        worker's queue in `self._sync_token_queues` and is formatted into the
        "job:worker/task:%d" device string used for the apply-gradient ops.
      global_step: Variable to increment by one after the variables have been
        updated. Required, despite the `None` default.
      name: Accepted for interface compatibility; this method does not use it.
      collect_cdfs: If true, each aggregated gradient waits for
        `self._replicas_to_aggregate` accumulated gradients; otherwise it
        waits for a single one.
      matrix_to_solve: Accepted for interface compatibility. The
        least-squares weighting that consumed it produced a weighted gradient
        that was never applied, so it is no longer built.
      num_batches_per_epoch: Accepted for interface compatibility; see
        `matrix_to_solve`.

    Returns:
      train_op: A Print op on the local step that runs only after every op
        collected in `train_ops` (gradient pushes and debug printers).

    Raises:
      ValueError: If the grads_and_vars is empty.
      ValueError: If global step is not provided, the staleness cannot be
        checked.
      ValueError: If a gradient is neither None, a Tensor nor an
        IndexedSlices.
    """
    # Materialize first: the pairs are iterated three times below, and an
    # iterator is always truthy so an empty one would pass the check.
    if grads_and_vars is not None:
        grads_and_vars = tuple(grads_and_vars)
    if not grads_and_vars:
        raise ValueError("Must supply at least one variable")

    if global_step is None:
        raise ValueError("Global step is required to check staleness")

    self._global_step = global_step
    train_ops = []
    aggregated_grad = []
    var_list = []
    # One entry per (grad, var) pair, None where the gradient is None, so the
    # later phases can pair accumulators with gradients positionally.
    # self._accumulator_list only holds entries for non-None gradients and
    # therefore cannot be indexed with the grads_and_vars position.
    accumulators = []

    self._local_step = variables.Variable(
        initial_value=0,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        dtype=global_step.dtype.base_dtype,
        name="sync_rep_local_step")
    self.local_step_init_op = state_ops.assign(self._local_step,
                                               global_step._ref())
    chief_init_ops = [self.local_step_init_op]
    self.ready_for_local_init_op = variables.report_uninitialized_variables(
        variables.all_variables())

    # The wait op waits for the current worker to dequeue a token from its
    # respective token queue.
    self._wait_op = self._sync_token_queues[worker_id].dequeue()

    # Replicas have to wait until they can get a token from the token queue
    # BEFORE beginning to compute gradients.
    with ops.device(global_step.device):
        update_local_step_op = state_ops.assign(self._local_step,
                                                global_step._ref())

    with ops.name_scope(None, self._name):
        # Gradient accumulator creation.
        for grad, var in grads_and_vars:
            var_list.append(var)
            tf.logging.info("Grad " + str(grad) + " assigned to " +
                            str(var.device))
            with ops.device(var.device):
                if grad is None:
                    accumulators.append(None)
                    continue
                elif isinstance(grad, ops.Tensor):
                    grad_accum = data_flow_ops.ConditionalAccumulator(
                        grad.dtype,
                        shape=var.get_shape(),
                        shared_name=var.name + "/grad_accum")
                else:
                    if not isinstance(grad, ops.IndexedSlices):
                        raise ValueError("Unknown grad type!")
                    grad_accum = data_flow_ops.SparseConditionalAccumulator(
                        grad.dtype,
                        shape=(),
                        shared_name=var.name + "/grad_accum")
                accumulators.append(grad_accum)
                self._accumulator_list.append((grad_accum, var))

        # Phase 1: push this worker's gradients into the accumulators.
        with ops.control_dependencies([update_local_step_op]):
            for index, ((grad, var), grad_accum) in enumerate(
                    zip(grads_and_vars, accumulators)):
                print_start_op = logging_ops.Print(
                    global_step, [global_step],
                    message="Starting to apply grads for variable %d" % index)
                train_ops.append(print_start_op)
                with ops.device(var.device):
                    ps_step_printer0 = logging_ops.Print(
                        global_step, [global_step],
                        message="global step printer0 on ps")
                    train_ops.append(ps_step_printer0)

                    if grad is None:
                        continue

                    if isinstance(grad, ops.Tensor):
                        # Build-time log only: this prints the tensor's
                        # description, not a runtime count.
                        num_accum = grad_accum.num_accumulated()
                        tf.logging.info(
                            "Grad Accumed %s, Worker ID: %s" %
                            (str(num_accum), str(worker_id)))

                    with ops.control_dependencies([print_start_op]):
                        with tf.device("job:worker/task:%d" % worker_id):
                            # The gradient type was validated when the
                            # accumulator was created.
                            if isinstance(grad, ops.Tensor):
                                apply_grad_op = grad_accum.apply_grad(
                                    grad,
                                    local_step=self._local_step._ref())
                            else:
                                apply_grad_op = (
                                    grad_accum.apply_indexed_slices_grad(
                                        grad,
                                        local_step=self._local_step._ref()))
                            with ops.control_dependencies([apply_grad_op]):
                                finished_print_op = logging_ops.Print(
                                    global_step, [global_step],
                                    message=
                                    "Done applying grads for variable %d" %
                                    index)
                                train_ops.append(finished_print_op)

        # Phase 2: take the aggregated gradient out of each accumulator.
        num_required = self._replicas_to_aggregate if collect_cdfs else 1
        for (grad, var), grad_accum in zip(grads_and_vars, accumulators):
            with ops.device(var.device):
                work_idx_print1 = logging_ops.Print(
                    worker_id, [worker_id],
                    message="worker id for aggregate grad")
                ps_step_printer1 = logging_ops.Print(
                    global_step, [global_step],
                    message="global step printer1 on ps")
                num_replica_aggragate = logging_ops.Print(
                    self._replicas_to_aggregate,
                    [self._replicas_to_aggregate],
                    message="num replica aggregate")
                train_ops.append(work_idx_print1)
                train_ops.append(ps_step_printer1)
                train_ops.append(num_replica_aggragate)

                if grad is None:
                    aggregated_grad.append(None)
                elif isinstance(grad, ops.Tensor):
                    aggregated_grad.append(
                        grad_accum.take_grad(num_required))
                else:
                    # Sparse accumulators must hand back an IndexedSlices so
                    # the wrapped optimizer can consume it.
                    aggregated_grad.append(
                        grad_accum.take_indexed_slices_grad(num_required))

        aggregated_grads_and_vars = zip(aggregated_grad, var_list)

        # Some debug operations.
        self.print_sizes = logging_ops.Print(
            global_step,
            [self._sync_token_queues[i].size()
             for i in range(self._total_num_replicas)],
            message="queue sizes")
        self.print_accum_sizes = logging_ops.Print(
            self._local_step,
            [x[0].num_accumulated() for x in self._accumulator_list] +
            [worker_id],
            message="Accum sizes")
        self.print_local_step = logging_ops.Print(
            self._local_step,
            [self._local_step._ref(), global_step._ref()],
            message="local vs global step")

        # sync_op will be assigned to the same device as the global step.
        with ops.device(global_step.device), ops.name_scope(""):
            with ops.control_dependencies([self.print_accum_sizes]):
                update_op = self._opt.apply_gradients(
                    aggregated_grads_and_vars, global_step)
                self._update_op = update_op
                with ops.control_dependencies([update_op]):
                    # Hand every worker a token for the next step.
                    sync_op = []
                    for cur_worker_id in range(self._total_num_replicas):
                        sync_op.append(
                            self._sync_token_queues[cur_worker_id].enqueue(
                                global_step))
                    sync_op = control_flow_ops.group(*(sync_op))

            # dummy_queue is passed to the queue runner. Don't use the real
            # queues because the queue runner doesn't automatically reopen it
            # once it closed queues in PS devices.
            dummy_queue = (
                data_flow_ops.FIFOQueue(1,
                                        types_pb2.DT_INT32,
                                        shapes=(),
                                        shared_name="dummy_queue"))
            self._chief_queue_runner = queue_runner.QueueRunner(
                dummy_queue, [sync_op])

        with ops.device(global_step.device), ops.name_scope(""):
            with ops.control_dependencies(train_ops):
                # Worker finished applying gradients.
                train_op = logging_ops.Print(
                    self._local_step._ref(),
                    [x[0].num_accumulated()
                     for x in self._accumulator_list] + [worker_id],
                    message="Finished worker updates",
                    name="FinishedWorkerUpdatesPrint")

        for accum, var in self._accumulator_list:
            with ops.device(var.device):
                chief_init_ops.append(
                    accum.set_global_step(
                        global_step, name="SetGlobalStep"))
        self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
        self._gradients_applied = True
        return train_op