def test_convergence(self):
    """Linear regression must converge to the analytic least-squares
    solution under both float32 and mixed-precision training.
    """
    for dtype in ['mixed', tf.float32]:
        with tf.Graph().as_default() as g:
            n_samples, n_hid = 10, 10
            var_dtype = tf.float32 if dtype == tf.float32 else tf.float16

            np.random.seed(0)
            X = np.random.rand(n_samples, n_hid)
            y = np.random.rand(n_samples, 1)
            # closed-form solution of the normal equations, for reference
            w = np.linalg.solve(X.T.dot(X), X.T.dot(y))

            x_ph = tf.placeholder(var_dtype, [n_samples, n_hid])
            y_ph = tf.placeholder(var_dtype, [n_samples, 1])
            y_pred = tf.layers.dense(x_ph, 1, use_bias=False)
            loss = tf.losses.mean_squared_error(y_ph, y_pred)
            loss += tf.losses.get_regularization_loss()
            train_op = optimize_loss(loss, "Adam", {},
                                     lambda gs: fixed_lr(gs, 0.05),
                                     dtype=dtype)

            with self.test_session(g, use_gpu=True) as sess:
                sess.run(tf.global_variables_initializer())
                for _ in range(6000):
                    sess.run(train_op, {x_ph: X, y_ph: y})
                w_learned = sess.run(tf.trainable_variables()[0])

            npt.assert_allclose(w_learned, w, atol=0.01)
def test_updates(self):
    """Check gradient accumulation with iter_size under Horovod.

    While ``skip_update_ph`` is True each ``train_op`` run only adds the
    current gradient into the accumulator variable and leaves the weights
    untouched; the first run with ``skip_update_ph`` False applies a single
    SGD step using the full accumulated gradient and resets the accumulator
    to zero.

    Fixes over the previous version: removed the dead local ``w`` (the
    normal-equations solution was computed but never used), renamed the
    gradient read so it no longer shadows the graph handle ``g``, and
    collapsed the three copy-pasted skip-update sections into a loop.
    """
    try:
        import horovod.tensorflow as hvd
        hvd.init()
    except ImportError:
        print("Horovod not installed skipping test_updates")
        return

    dtype = tf.float32
    with tf.Graph().as_default() as graph:
        n_samples = 10
        n_hid = 10
        var_dtype = tf.float32 if dtype == tf.float32 else tf.float16

        np.random.seed(0)
        X = np.random.rand(n_samples, n_hid)
        y = np.random.rand(n_samples, 1)

        x_ph = tf.placeholder(var_dtype, [n_samples, n_hid])
        y_ph = tf.placeholder(var_dtype, [n_samples, 1])

        y_pred = tf.layers.dense(x_ph, 1, use_bias=False)
        loss = tf.losses.mean_squared_error(y_ph, y_pred)
        loss += tf.losses.get_regularization_loss()
        skip_update_ph = tf.placeholder(tf.bool)
        iter_size = 8
        train_op = optimize_loss(loss, "SGD", {},
                                 lambda gs: fixed_lr(gs, 0.1), dtype=dtype,
                                 iter_size=iter_size, on_horovod=True,
                                 skip_update_ph=skip_update_ph)
        grad_accum = [var for var in tf.global_variables()
                      if 'accum' in var.name][0]
        var = tf.trainable_variables()[0]

        with self.test_session(graph, use_gpu=True) as sess:
            sess.run(tf.global_variables_initializer())
            for _ in range(3):
                accum, v = sess.run([grad_accum, var])
                # accumulator must start (and restart) at zero
                npt.assert_allclose(accum, np.zeros(accum.shape))
                # analytic MSE gradient, pre-divided by iter_size
                true_g = (2 * (X.T.dot(X).dot(v) - X.T.dot(y)) /
                          X.shape[0] / iter_size)
                # three skipped runs: gradient accumulates, weights frozen
                for step in range(1, 4):
                    sess.run(train_op,
                             {x_ph: X, y_ph: y, skip_update_ph: True})
                    accum_new, v_new = sess.run([grad_accum, var])
                    npt.assert_allclose(accum_new, true_g * step, atol=1e-7)
                    npt.assert_allclose(v_new, v)
                # non-skipped run: accumulates once more, applies the SGD
                # step with lr=0.1, and zeroes the accumulator
                sess.run(train_op, {x_ph: X, y_ph: y, skip_update_ph: False})
                accum_new, v_new = sess.run([grad_accum, var])
                npt.assert_allclose(accum_new, np.zeros(accum.shape))
                npt.assert_allclose(v_new, v - 0.1 * true_g * 4, atol=1e-7)
def test_convergence(self):
    """Training a bias-free dense layer on a tiny least-squares problem
    must recover the normal-equations solution for both precision modes.
    """
    for dtype in ['mixed', tf.float32]:
        with tf.Graph().as_default() as g:
            n_samples = 10
            n_hid = 10
            if dtype == tf.float32:
                var_dtype = tf.float32
            else:
                var_dtype = tf.float16

            np.random.seed(0)
            X = np.random.rand(n_samples, n_hid)
            y = np.random.rand(n_samples, 1)
            w = np.linalg.solve(X.T.dot(X), X.T.dot(y))  # exact solution

            x_ph = tf.placeholder(var_dtype, [n_samples, n_hid])
            y_ph = tf.placeholder(var_dtype, [n_samples, 1])
            y_pred = tf.layers.dense(x_ph, 1, use_bias=False)
            loss = (tf.losses.mean_squared_error(y_ph, y_pred) +
                    tf.losses.get_regularization_loss())
            train_op = optimize_loss(loss, "Adam", {},
                                     lambda gs: fixed_lr(gs, 0.05),
                                     dtype=dtype)

            feed = {x_ph: X, y_ph: y}
            with self.test_session(g, use_gpu=True) as sess:
                sess.run(tf.global_variables_initializer())
                for _ in range(6000):
                    sess.run(train_op, feed)
                w_learned = sess.run(tf.trainable_variables()[0])
            npt.assert_allclose(w_learned, w, atol=0.01)
def compile(self, force_var_reuse=False):
    """Build the TensorFlow training/evaluation graph.

    Constructs the data layer(s) and forward pass (replicated per GPU when
    not running under Horovod), aggregates the per-replica losses, and in
    "train" mode creates the optimization op, summary scalars, and a
    report of all trainable variables.

    Args:
      force_var_reuse (bool): force variable re-use in every variable
        scope (e.g. when the graph is built more than once per process).
    """
    initializer = None
    if 'initializer' in self.params:
        init_dict = self.params.get('initializer_params', {})
        initializer = self.params['initializer'](**init_dict)

    if not self.on_horovod:
        # Data-parallel multi-GPU training: one replica per GPU with
        # variables shared across replicas through scope re-use.
        losses = []
        for worker_id, gpu_id in enumerate(self._gpu_ids):
            with tf.device("/gpu:{}".format(gpu_id)), tf.variable_scope(
                name_or_scope=tf.get_variable_scope(),
                # every replica after the first re-uses the variables
                reuse=force_var_reuse or (worker_id > 0),
                initializer=initializer,
                dtype=self.get_tf_dtype(),
            ):
                deco_print("Building graph on GPU:{}".format(gpu_id))
                self.get_data_layer(worker_id).build_graph()
                input_tensors = self.get_data_layer(worker_id).input_tensors
                loss, self._outputs[worker_id] = \
                    self._build_forward_pass_graph(input_tensors,
                                                   gpu_id=worker_id)
                outputs = self._outputs[worker_id]
                if outputs is not None and not isinstance(outputs, list):
                    raise ValueError(
                        'Decoder outputs have to be either None or list')
                if self._mode in ("train", "eval"):
                    losses.append(loss)
        if self._mode == "train":
            self.loss = tf.reduce_mean(losses)
        if self._mode == "eval":
            self.eval_losses = losses
    else:
        # Horovod isolates each process to a single GPU, so the local
        # device index is always 0.
        with tf.device("/gpu:0"), tf.variable_scope(
            name_or_scope=tf.get_variable_scope(),
            reuse=force_var_reuse,
            initializer=initializer,
            dtype=self.get_tf_dtype(),
        ):
            deco_print("Building graph in Horovod rank: {}".format(
                self._hvd.rank()))
            self.get_data_layer().build_graph()
            input_tensors = self.get_data_layer().input_tensors
            loss, self._output = self._build_forward_pass_graph(
                input_tensors, gpu_id=0)
            if self._output is not None and \
               not isinstance(self._output, list):
                raise ValueError(
                    'Decoder outputs have to be either None or list')
            if self._mode == "train":
                self.loss = loss
            if self._mode == "eval":
                self.eval_losses = [loss]

    try:
        self._num_objects_per_step = [
            self._get_num_objects_per_step(worker_id)
            for worker_id in range(self.num_gpus)
        ]
    except NotImplementedError:
        # per-step object counting is optional for models
        pass

    if self._mode == "train":
        lr_policy = None
        if 'lr_policy' in self.params:
            lr_params = self.params.get('lr_policy_params', {})
            # fill in decay_steps / steps_per_epoch defaults when the
            # policy accepts them and the config did not provide a value
            func_params = signature(self.params['lr_policy']).parameters
            if 'decay_steps' in func_params and \
               'decay_steps' not in lr_params:
                lr_params['decay_steps'] = self._last_step
            if 'steps_per_epoch' in func_params and \
               'steps_per_epoch' not in lr_params and \
               'num_epochs' in self.params:
                lr_params['steps_per_epoch'] = self.steps_in_epoch

            def lr_policy(gs):
                return self.params['lr_policy'](global_step=gs, **lr_params)

        if self.params.get('iter_size', 1) > 1:
            self.skip_update_ph = tf.placeholder(tf.bool)

        # NOTE(review): skip_update_ph is passed unconditionally but only
        # created above when iter_size > 1 — presumably the attribute is
        # pre-initialized elsewhere in the class; confirm.
        self.train_op = optimize_loss(
            # loss is cast to float32 before adding the regularization loss
            loss=tf.cast(self.loss, tf.float32) + get_regularization_loss(),
            dtype=self.params['dtype'],
            optimizer=self.params['optimizer'],
            optimizer_params=self.params.get('optimizer_params', {}),
            clip_gradients=self.params.get('max_grad_norm', None),
            learning_rate_decay_fn=lr_policy,
            summaries=self.params.get('summaries', None),
            larc_params=self.params.get('larc_params', None),
            loss_scaling=self.params.get('loss_scaling', 1.0),
            on_horovod=self.on_horovod,
            iter_size=self.params.get('iter_size', 1),
            skip_update_ph=self.skip_update_ph,
        )
        tf.summary.scalar(name="train_loss", tensor=self.loss)
        if self.steps_in_epoch:
            tf.summary.scalar(
                name="epoch",
                tensor=tf.floor(
                    tf.train.get_global_step() /
                    tf.constant(self.steps_in_epoch, dtype=tf.int64)),
            )

    # only one process (rank 0 under Horovod) prints the variable report
    if not self.on_horovod or self._hvd.rank() == 0:
        deco_print("Trainable variables:")
        total_params = 0
        unknown_shape = False
        for var in tf.trainable_variables():
            deco_print('{}'.format(var.name), offset=2)
            deco_print('shape: {}, {}'.format(var.get_shape(), var.dtype),
                       offset=4)
            shape = var.get_shape()
            if not shape:
                unknown_shape = True
                continue
            var_params = 1
            for dim in shape:
                var_params *= dim.value
            total_params += var_params
        if unknown_shape:
            deco_print(
                "Encountered unknown variable shape, can't compute total "
                "number of parameters.")
        else:
            deco_print('Total trainable parameters: {}'.format(total_params))
def compile(self, force_var_reuse=False):
    """Build the TensorFlow graph.

    Builds the shared data layer, replicates the forward pass across GPUs
    (or builds it once per Horovod rank), averages the losses, and in
    "train" mode creates the optimization op and a trainable-variable
    report.

    Args:
      force_var_reuse (bool): force variable re-use in every scope.
    """
    initializer = None
    if 'initializer' in self.params:
        init_dict = self.params.get('initializer_params', {})
        initializer = self.params['initializer'](**init_dict)

    # the data layer is built once; each GPU slices out its own inputs
    self.data_layer.build_graph()
    input_tensors = self.data_layer.get_input_tensors()

    if not self.on_horovod:
        # data parallelism for multi-GPU training
        losses = []
        for worker_id, gpu_id in enumerate(self._gpu_ids):
            with tf.device("/gpu:{}".format(gpu_id)), tf.variable_scope(
                name_or_scope=tf.get_variable_scope(),
                # replicas after the first re-use the variables
                reuse=force_var_reuse or (worker_id > 0),
                initializer=initializer,
                dtype=self.get_tf_dtype(),
            ):
                deco_print("Building graph on GPU:{}".format(gpu_id))
                worker_inputs = [input_tensor[worker_id]
                                 for input_tensor in input_tensors]
                loss, self._outputs[worker_id] = \
                    self._build_forward_pass_graph(worker_inputs,
                                                   gpu_id=worker_id)
                if self._mode in ("train", "eval"):
                    losses.append(loss)
        if self._mode in ("train", "eval"):
            self.loss = tf.reduce_mean(losses)
    else:
        # Horovod pins every process to one GPU, hence device index 0
        with tf.device("/gpu:0"), tf.variable_scope(
            name_or_scope=tf.get_variable_scope(),
            reuse=force_var_reuse,
            initializer=initializer,
            dtype=self.get_tf_dtype(),
        ):
            deco_print("Building graph in Horovod rank: {}".format(
                self._hvd.rank()))
            loss, self._outputs[0] = self._build_forward_pass_graph(
                input_tensors, gpu_id=0)
            if self._mode in ("train", "eval"):
                self.loss = loss

    if self._mode == "train":
        lr_policy = None
        if 'lr_policy' in self.params:
            lr_params = self.params.get('lr_policy_params', {})
            # default decay_steps = max_steps when the policy supports it
            # and the config does not override it
            if 'decay_steps' in self.params['lr_policy'].__code__.co_varnames \
               and 'decay_steps' not in lr_params:
                lr_params['decay_steps'] = self._last_step

            def lr_policy(lr, gs):
                return self.params['lr_policy'](lr, gs, **lr_params)

        self.train_op = optimize_loss(
            loss=self.loss + get_regularization_loss(),
            dtype=self.params['dtype'],
            learning_rate=self.params['learning_rate'],
            optimizer=self.params['optimizer'],
            optimizer_params=self.params.get('optimizer_params', {}),
            gradient_noise_scale=None,
            gradient_multipliers=None,
            clip_gradients=self.params.get('max_grad_norm', None),
            learning_rate_decay_fn=lr_policy,
            update_ops=None,
            variables=None,
            name="Loss_Optimization",
            summaries=self.params.get('summaries', None),
            colocate_gradients_with_ops=True,
            increment_global_step=True,
            LARC_nu=self.params.get('larc_nu', None),
            LARC_mode=self.params.get('larc_mode', 'clip'),
            loss_scale=self.params.get('loss_scale', 1.0),
            automatic_loss_scaling=self.params.get('automatic_loss_scaling',
                                                   None),
            on_horovod=self.on_horovod,
        )
        tf.summary.scalar(name="train_loss", tensor=self.loss)

    # only one process (rank 0 under Horovod) prints the variable report
    if not self.on_horovod or self._hvd.rank() == 0:
        deco_print("Trainable variables:")
        total_params = 0
        unknown_shape = False
        for var in tf.trainable_variables():
            deco_print('{}'.format(var.name), offset=2)
            deco_print('shape: {}, {}'.format(var.get_shape(), var.dtype),
                       offset=4)
            shape = var.get_shape()
            if not shape:
                unknown_shape = True
                continue
            var_params = 1
            for dim in shape:
                var_params *= dim.value
            total_params += var_params
        if unknown_shape:
            deco_print(
                "Encountered unknown variable shape, can't compute total "
                "number of parameters.")
        else:
            deco_print('Total trainable parameters: {}'.format(total_params))
def compile(self, force_var_reuse=False):
    """Build the TensorFlow graph for this model.

    Builds the per-GPU (or per-Horovod-rank) data layers and forward
    passes, aggregates the losses, and in "train" mode also creates the
    optimization op, summaries, and a trainable-variable report.

    Args:
      force_var_reuse (bool): force variable re-use in every scope.
    """
    initializer = None
    if 'initializer' in self.params:
        init_dict = self.params.get('initializer_params', {})
        initializer = self.params['initializer'](**init_dict)

    if not self.on_horovod:
        # data parallelism: one replica per GPU with shared variables
        losses = []
        for worker_id, gpu_id in enumerate(self._gpu_ids):
            with tf.device("/gpu:{}".format(gpu_id)), tf.variable_scope(
                name_or_scope=tf.get_variable_scope(),
                # replicas after the first re-use the variables
                reuse=force_var_reuse or (worker_id > 0),
                initializer=initializer,
                dtype=self.get_tf_dtype(),
            ):
                deco_print("Building graph on GPU:{}".format(gpu_id))
                self.get_data_layer(worker_id).build_graph()
                input_tensors = self.get_data_layer(worker_id).input_tensors
                loss, self._outputs[worker_id] = \
                    self._build_forward_pass_graph(input_tensors,
                                                   gpu_id=worker_id)
                samples = self._outputs[worker_id]
                if samples is not None and not isinstance(samples, list):
                    raise ValueError(
                        'Decoder samples have to be either None or list')
                if self._mode in ("train", "eval"):
                    losses.append(loss)
        if self._mode == "train":
            self.loss = tf.reduce_mean(losses)
        if self._mode == "eval":
            self.eval_losses = losses
    else:
        # Horovod pins every process to a single GPU -> local index 0
        with tf.device("/gpu:0"), tf.variable_scope(
            name_or_scope=tf.get_variable_scope(),
            reuse=force_var_reuse,
            initializer=initializer,
            dtype=self.get_tf_dtype(),
        ):
            deco_print("Building graph in Horovod rank: {}".format(
                self._hvd.rank()))
            self.get_data_layer().build_graph()
            input_tensors = self.get_data_layer().input_tensors
            loss, self._output = self._build_forward_pass_graph(
                input_tensors, gpu_id=0)
            if self._output is not None and \
               not isinstance(self._output, list):
                raise ValueError(
                    'Decoder samples have to be either None or list')
            if self._mode == "train":
                self.loss = loss
            if self._mode == "eval":
                self.eval_losses = [loss]

    if self._mode == "train":
        lr_policy = None
        if 'lr_policy' in self.params:
            lr_params = self.params.get('lr_policy_params', {})
            # fill in defaults when the policy accepts these arguments and
            # the config did not set them explicitly
            policy_args = self.params['lr_policy'].__code__.co_varnames
            if 'decay_steps' in policy_args and \
               'decay_steps' not in lr_params:
                lr_params['decay_steps'] = self._last_step
            if 'steps_per_epoch' in policy_args and \
               'steps_per_epoch' not in lr_params and \
               'num_epochs' in self.params:
                lr_params['steps_per_epoch'] = self.steps_in_epoch

            def lr_policy(gs):
                return self.params['lr_policy'](global_step=gs, **lr_params)

        self.train_op = optimize_loss(
            # loss is cast to float32 before adding the regularization loss
            loss=tf.cast(self.loss, tf.float32) + get_regularization_loss(),
            dtype=self.params['dtype'],
            optimizer=self.params['optimizer'],
            optimizer_params=self.params.get('optimizer_params', {}),
            gradient_noise_scale=None,
            gradient_multipliers=None,
            clip_gradients=self.params.get('max_grad_norm', None),
            learning_rate_decay_fn=lr_policy,
            update_ops=None,
            variables=None,
            name="Loss_Optimization",
            summaries=self.params.get('summaries', None),
            colocate_gradients_with_ops=True,
            increment_global_step=True,
            larc_params=self.params.get('larc_params', None),
            loss_scale=self.params.get('loss_scale', 1.0),
            automatic_loss_scaling=self.params.get('automatic_loss_scaling',
                                                   None),
            on_horovod=self.on_horovod,
        )
        tf.summary.scalar(name="train_loss", tensor=self.loss)
        if self.steps_in_epoch:
            tf.summary.scalar(
                name="epoch",
                tensor=tf.floor(
                    tf.train.get_global_step() /
                    tf.constant(self.steps_in_epoch, dtype=tf.int64)),
            )

    # only one process (rank 0 under Horovod) prints the variable report
    if not self.on_horovod or self._hvd.rank() == 0:
        deco_print("Trainable variables:")
        total_params = 0
        unknown_shape = False
        for var in tf.trainable_variables():
            deco_print('{}'.format(var.name), offset=2)
            deco_print('shape: {}, {}'.format(var.get_shape(), var.dtype),
                       offset=4)
            shape = var.get_shape()
            if shape:
                var_params = 1
                for dim in shape:
                    var_params *= dim.value
                total_params += var_params
            else:
                unknown_shape = True
        if unknown_shape:
            deco_print(
                "Encountered unknown variable shape, can't compute total "
                "number of parameters.")
        else:
            deco_print('Total trainable parameters: {}'.format(total_params))
def compile(self, force_var_reuse=False, checkpoint=None): """TensorFlow graph is built here.""" if 'initializer' not in self.params: initializer = None else: init_dict = self.params.get('initializer_params', {}) initializer = self.params['initializer'](**init_dict) if not self.on_horovod: # not using Horovod # below we follow data parallelism for multi-GPU training losses = [] for gpu_cnt, gpu_id in enumerate(self._gpu_ids): with tf.device("/gpu:{}".format(gpu_id)), tf.variable_scope( name_or_scope=tf.get_variable_scope(), # re-using variables across GPUs. reuse=force_var_reuse or (gpu_cnt > 0), initializer=initializer, dtype=self.get_tf_dtype(), ): deco_print("Building graph on GPU:{}".format(gpu_id)) self.get_data_layer(gpu_cnt).build_graph() input_tensors = self.get_data_layer(gpu_cnt).input_tensors if self.params.get("use_trt", False): # Build TF-TRT graph loss, self._outputs[gpu_cnt] = self.build_trt_forward_pass_graph( input_tensors, gpu_id=gpu_cnt, checkpoint=checkpoint ) else: # Build regular TF graph loss, self._outputs[gpu_cnt] = self._build_forward_pass_graph( input_tensors, gpu_id=gpu_cnt ) if self._outputs[gpu_cnt] is not None and \ not isinstance(self._outputs[gpu_cnt], list): raise ValueError( 'Decoder outputs have to be either None or list') if self._mode == "train" or self._mode == "eval": losses.append(loss) # end of for gpu_ind loop if self._mode == "train": self.loss = tf.reduce_mean(losses) if self._mode == "eval": self.eval_losses = losses else: # is using Horovod # gpu_id should always be zero, since Horovod takes care of isolating # different processes to 1 GPU only with tf.device("/gpu:0"), tf.variable_scope( name_or_scope=tf.get_variable_scope(), reuse=force_var_reuse, initializer=initializer, dtype=self.get_tf_dtype(), ): deco_print( "Building graph in Horovod rank: {}".format( self._hvd.rank()) ) self.get_data_layer().build_graph() input_tensors = self.get_data_layer().input_tensors if self.params.get("use_trt", False): # Build TF-TRT 
graph all_loss, self._output = self.build_trt_forward_pass_graph( input_tensors, gpu_id=0, checkpoint=checkpoint ) else: # Build regular TF graph all_loss, self._output = self._build_forward_pass_graph( input_tensors, gpu_id=0 ) if isinstance(all_loss, (dict,)): loss = all_loss['loss'] else: loss = all_loss if self._output is not None and not isinstance(self._output, list): raise ValueError( 'Decoder outputs have to be either None or list') if self._mode == "train": self.loss = loss if self._mode == "eval": self.eval_losses = [loss] try: self._num_objects_per_step = [self._get_num_objects_per_step(worker_id) for worker_id in range(self.num_gpus)] except NotImplementedError: pass if self._mode == "train": if 'lr_policy' not in self.params: lr_policy = None else: lr_params = self.params.get('lr_policy_params', {}) # adding default decay_steps = max_steps if lr_policy supports it and # different value is not provided func_params = signature(self.params['lr_policy']).parameters if 'decay_steps' in func_params and 'decay_steps' not in lr_params: lr_params['decay_steps'] = self._last_step if 'begin_decay_at' in func_params: if 'warmup_steps' in func_params: lr_params['begin_decay_at'] = max( lr_params.get('begin_decay_at', 0), lr_params.get('warmup_steps', 0) ) lr_params['decay_steps'] -= lr_params.get( 'begin_decay_at', 0) if 'steps_per_epoch' in func_params and 'steps_per_epoch' not in lr_params and 'num_epochs' in self.params: lr_params['steps_per_epoch'] = self.steps_in_epoch def lr_policy(gs): return self.params['lr_policy']( global_step=gs, **lr_params) if self.params.get('iter_size', 1) > 1: self.skip_update_ph = tf.placeholder(tf.bool) var_list = tf.trainable_variables() freeze_variables_regex = self.params.get( 'freeze_variables_regex', None) if freeze_variables_regex is not None: pattern = re.compile(freeze_variables_regex) var_list = [var for var in tf.trainable_variables() if not pattern.match(var.name)] self.train_op = optimize_loss( loss=tf.cast(self.loss, 
tf.float32) + get_regularization_loss(), dtype=self.params['dtype'], optimizer=self.params['optimizer'], optimizer_params=self.params.get('optimizer_params', {}), var_list=var_list, clip_gradients=self.params.get('max_grad_norm', None), learning_rate_decay_fn=lr_policy, summaries=self.params.get('summaries', None), larc_params=self.params.get('larc_params', None), loss_scaling=self.params.get('loss_scaling', 1.0), loss_scaling_params=self.params.get( 'loss_scaling_params', None), on_horovod=self.on_horovod, iter_size=self.params.get('iter_size', 1), skip_update_ph=self.skip_update_ph, model=self ) tf.summary.scalar(name="train_loss", tensor=self.loss) if self.steps_in_epoch: tf.summary.scalar( name="epoch", tensor=tf.floor(tf.train.get_global_step( ) / tf.constant(self.steps_in_epoch, dtype=tf.int64)), )