def on_epoch_end(self, epoch, logs=None):
    """Reduce the optimizer learning rate when the monitored metric plateaus.

    The current learning rate is always recorded in ``logs['lr']``. If the
    monitored metric is missing from ``logs`` a warning is logged and nothing
    else happens. Otherwise the cooldown counter is ticked down, ``best`` /
    ``wait`` are updated, and once ``wait`` reaches ``patience`` outside of a
    cooldown the learning rate is multiplied by ``factor`` (never going below
    ``min_lr``) and a new cooldown starts.

    Arguments:
        epoch: Zero-based epoch index (printed as ``epoch + 1``).
        logs: Optional dict of metrics for the epoch; ``None`` is treated as
            an empty dict.
    """
    logs = logs or {}
    logs['lr'] = K.get_value(self.model.optimizer.lr)

    current = logs.get(self.monitor)
    if current is None:
        logging.warning('Reduce LR on plateau conditioned on metric `%s` '
                        'which is not available. Available metrics are: %s',
                        self.monitor, ','.join(list(logs.keys())))
        return

    # One epoch of cooldown has elapsed; plateau counting restarts.
    if self.in_cooldown():
        self.cooldown_counter -= 1
        self.wait = 0

    # Improvement: remember the new best value and reset the wait counter.
    if self.monitor_op(current, self.best):
        self.best = current
        self.wait = 0
        return

    # No improvement, but still cooling down: do not count this epoch.
    if self.in_cooldown():
        return

    self.wait += 1
    if self.wait < self.patience:
        return

    old_lr = float(K.get_value(self.model.optimizer.lr))
    if not old_lr > self.min_lr:
        return

    new_lr = max(old_lr * self.factor, self.min_lr)
    K.set_value(self.model.optimizer.lr, new_lr)
    if self.verbose > 0:
        print('\nEpoch %05d: ReduceLROnPlateau reducing learning '
              'rate to %s.' % (epoch + 1, new_lr))
    self.cooldown_counter = self.cooldown
    self.wait = 0
def get_config(self):
    """Return the Adagrad settings merged on top of the base class config.

    Returns:
        A dict holding the base config entries followed by ``lr``, ``decay``
        (both read from their backend values as Python floats) and
        ``epsilon``. Entries defined here override base entries with the
        same key.
    """
    own_settings = {}
    own_settings['lr'] = float(K.get_value(self.lr))
    own_settings['decay'] = float(K.get_value(self.decay))
    own_settings['epsilon'] = self.epsilon

    merged = dict(super(Adagrad, self).get_config().items())
    merged.update(own_settings)
    return merged
def get_config(self):
    """Return the SGD settings merged on top of the base class config.

    Returns:
        A dict holding the base config entries followed by ``lr``,
        ``momentum``, ``decay`` (read from their backend values as Python
        floats) and ``nesterov``. Entries defined here override base entries
        with the same key.
    """
    own_settings = {}
    own_settings['lr'] = float(K.get_value(self.lr))
    own_settings['momentum'] = float(K.get_value(self.momentum))
    own_settings['decay'] = float(K.get_value(self.decay))
    own_settings['nesterov'] = self.nesterov

    merged = dict(super(SGD, self).get_config().items())
    merged.update(own_settings)
    return merged
def get_config(self):
    """Return the Nadam settings merged on top of the base class config.

    Returns:
        A dict holding the base config entries followed by ``lr``,
        ``beta_1``, ``beta_2`` (read from their backend values as Python
        floats), ``epsilon`` and ``schedule_decay``. Entries defined here
        override base entries with the same key.
    """
    own_settings = {}
    own_settings['lr'] = float(K.get_value(self.lr))
    own_settings['beta_1'] = float(K.get_value(self.beta_1))
    own_settings['beta_2'] = float(K.get_value(self.beta_2))
    own_settings['epsilon'] = self.epsilon
    own_settings['schedule_decay'] = self.schedule_decay

    merged = dict(super(Nadam, self).get_config().items())
    merged.update(own_settings)
    return merged
def get_config(self):
    """Return the Adam settings merged on top of the base class config.

    Returns:
        A dict holding the base config entries followed by ``lr``,
        ``beta_1``, ``beta_2``, ``decay`` (read from their backend values as
        Python floats), ``epsilon`` and ``amsgrad``. Entries defined here
        override base entries with the same key.
    """
    own_settings = {}
    own_settings['lr'] = float(K.get_value(self.lr))
    own_settings['beta_1'] = float(K.get_value(self.beta_1))
    own_settings['beta_2'] = float(K.get_value(self.beta_2))
    own_settings['decay'] = float(K.get_value(self.decay))
    own_settings['epsilon'] = self.epsilon
    own_settings['amsgrad'] = self.amsgrad

    merged = dict(super(Adam, self).get_config().items())
    merged.update(own_settings)
    return merged
def show_batch_normalization_layer(layer):
    """Serialize batch normalization layer to dict.

    Arguments:
        layer: Object exposing ``moving_mean``, ``moving_variance``,
            ``center`` and ``scale``; ``beta`` / ``gamma`` are only read
            when ``center`` / ``scale`` are truthy.

    Returns:
        Dict with ``moving_mean`` and ``moving_variance`` always present and
        ``beta`` / ``gamma`` present when enabled, each value passed through
        ``encode_floats``.
    """
    mean_values = K.get_value(layer.moving_mean)
    variance_values = K.get_value(layer.moving_variance)
    serialized = {
        'moving_mean': encode_floats(mean_values),
        'moving_variance': encode_floats(variance_values),
    }
    if layer.center:
        serialized['beta'] = encode_floats(K.get_value(layer.beta))
    if layer.scale:
        serialized['gamma'] = encode_floats(K.get_value(layer.gamma))
    return serialized
def on_epoch_begin(self, epoch, logs=None):
    """Set the learning rate returned by ``self.schedule`` at epoch start.

    For a ``TFOptimizer`` wrapper the rate is taken from the wrapped
    optimizer's ``_lr`` (preferred) or ``_learning_rate`` attribute and
    wrapped in ``Variable``; for any other optimizer its ``lr`` attribute is
    used directly.

    Arguments:
        epoch: Zero-based epoch index (printed as ``epoch + 1``).
        logs: Unused.

    Raises:
        ValueError: If no learning-rate attribute is found, or if the
            schedule does not return a float / ``np.float32`` /
            ``np.float64``.
    """
    # TODO(yashkatariya): Change the property checking when the learning
    # rate attribute is unified across all TF Optimizers.
    if isinstance(self.model.optimizer, optimizers.TFOptimizer):
        opt = self.model.optimizer.optimizer
        if hasattr(opt, '_lr'):
            opt_lr = Variable(opt._lr)  # pylint: disable=protected-access
        elif hasattr(opt, '_learning_rate'):
            opt_lr = Variable(opt._learning_rate)  # pylint: disable=protected-access
        else:
            raise ValueError(
                'TF Optimizer must have a "_lr" or "_learning_rate" attribute.')
    else:
        opt = self.model.optimizer
        if not hasattr(opt, 'lr'):
            raise ValueError('Optimizer must have a "lr" attribute.')
        opt_lr = opt.lr

    try:  # new API: schedule(epoch, current_lr)
        lr = self.schedule(epoch, float(K.get_value(opt_lr)))
    except TypeError:  # Support for old API for backward compatibility
        lr = self.schedule(epoch)

    accepted_types = (float, np.float32, np.float64)
    if not isinstance(lr, accepted_types):
        raise ValueError('The output of the "schedule" function '
                         'should be float.')

    K.set_value(opt_lr, lr)
    if self.verbose > 0:
        print('\nEpoch %05d: LearningRateScheduler reducing learning '
              'rate to %s.' % (epoch + 1, lr))
def _serialize_hyperparameter(self, hyperparameter_name):
    """Serialize a hyperparameter that can be a float, callable, or Tensor.

    Arguments:
        hyperparameter_name: Name passed to ``self._get_hyper``.

    Returns:
        The result of calling the hyperparameter if it is callable, its
        backend value if it is an ``ops.Tensor`` or
        ``tf_variables.Variable``, and the hyperparameter itself otherwise.
    """
    hyper = self._get_hyper(hyperparameter_name)
    if callable(hyper):
        return hyper()
    tensor_like = isinstance(hyper, (ops.Tensor, tf_variables.Variable))
    return backend.get_value(hyper) if tensor_like else hyper
def testOptimizerWithCallbacks(self):
    """Check ReduceLROnPlateau against an SGD optimizer for two min_delta values."""
    np.random.seed(1331)
    features = np.random.random((10, 3))
    targets = np.random.random((10, 4))

    a = input_layer.Input(shape=(3,), name='input_a')
    model = sequential.Sequential()
    model.add(core.Dense(4, name='dense'))
    model.add(core.Dropout(0.5, name='dropout'))
    model(a)
    model.compile(gradient_descent.SGD(learning_rate=0.1),
                  loss='mse', metrics=['mae'])

    def fit_and_read_lr(min_delta, verbose):
        """Fit for two epochs with a plateau callback; return the resulting lr."""
        plateau = callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.1,
            min_delta=min_delta,
            patience=1,
            cooldown=5)
        model.fit(
            features,
            targets,
            batch_size=10,
            validation_data=(features, targets),
            callbacks=[plateau],
            epochs=2,
            verbose=verbose)
        return float(backend.get_value(model.optimizer.lr))

    # This does not reduce the LR after the first epoch (due to low delta).
    self.assertAllClose(
        fit_and_read_lr(min_delta=0, verbose=0), 0.1, atol=1e-4)

    # This should reduce the LR after the first epoch (due to high delta).
    self.assertAllClose(
        fit_and_read_lr(min_delta=10, verbose=2), 0.01, atol=1e-4)
def _serialize_hyperparameter(self, hyperparameter_name):
    """Serialize a hyperparameter that can be a float, callable, or Tensor.

    Arguments:
        hyperparameter_name: Key looked up in ``self._hyper``.

    Returns:
        The serialized schedule for a ``LearningRateSchedule``, the call
        result for any other callable, the backend value for a tensor (as
        judged by ``tensor_util.is_tensor``), or the raw value otherwise.
    """
    hyper = self._hyper[hyperparameter_name]
    # The schedule check must come first: it takes precedence over the
    # generic callable branch below.
    if isinstance(hyper, learning_rate_schedule.LearningRateSchedule):
        serialized = learning_rate_schedule.serialize(hyper)
    elif callable(hyper):
        serialized = hyper()
    elif tensor_util.is_tensor(hyper):
        serialized = backend.get_value(hyper)
    else:
        serialized = hyper
    return serialized
def _serialize_hyperparameter(self, hyperparameter_name):
    """Serialize a hyperparameter that can be a float, callable, or Tensor.

    Arguments:
        hyperparameter_name: Key looked up in ``self._hyper``.

    Returns:
        The serialized schedule for a ``LearningRateSchedule``, the call
        result for any other callable, the backend value for an
        ``ops.Tensor`` / ``tf_variables.Variable`` /
        ``distributed_values.TPUMirroredVariable``, or the raw value
        otherwise.
    """
    hyper = self._hyper[hyperparameter_name]
    # The schedule check must come first: it takes precedence over the
    # generic callable branch below.
    if isinstance(hyper, learning_rate_schedule.LearningRateSchedule):
        return learning_rate_schedule.serialize(hyper)
    if callable(hyper):
        return hyper()
    needs_backend_read = isinstance(
        hyper,
        (ops.Tensor, tf_variables.Variable,
         distributed_values.TPUMirroredVariable))
    return backend.get_value(hyper) if needs_backend_read else hyper
def test_save_weights_with_autocast_vars(self, strategy_fn, h5=False):
    """Round-trip model weights through save_weights / load_weights.

    Arguments:
        strategy_fn: Callable returning the distribution strategy whose
            scope the model is built in.
        h5: When true the weights file gets an ``.h5`` suffix; otherwise no
            suffix is used.
    """
    with strategy_fn().scope(), policy.policy_scope('infer_float32_vars'):
        x = layers.Input(shape=(1,), batch_size=2, dtype=dtypes.float16)
        layer = AddLayer(assert_type=dtypes.float16)
        y = math_ops.cast(layer(x), dtypes.float32)
        model = models.Model(inputs=x, outputs=y)

    x = np.ones((2, 1), dtype=np.float16)

    def assert_model_adds(offset):
        """Assert the model output equals the input plus ``offset``."""
        self.assertAllClose(backend.get_value(model(x)), x + offset)

    model.set_weights([np.array(100.)])
    assert_model_adds(100.)

    weights_file = os.path.join(self.get_temp_dir(),
                                'weights' + ('.h5' if h5 else ''))
    model.save_weights(weights_file)

    # Overwrite the weight, then confirm loading restores the saved value.
    model.set_weights([np.array(200.)])
    assert_model_adds(200.)
    model.load_weights(weights_file)
    assert_model_adds(100.)
    self.assertEqual(model.get_weights(), [np.array(100.)])
def on_epoch_begin(self, epoch, logs=None):
    """Set the learning rate returned by ``self.schedule`` at epoch start.

    Arguments:
        epoch: Zero-based epoch index (printed as ``epoch + 1``).
        logs: Unused.

    Raises:
        ValueError: If the optimizer has no ``lr`` attribute, or if the
            schedule does not return a float / ``np.float32`` /
            ``np.float64``.
    """
    if not hasattr(self.model.optimizer, 'lr'):
        raise ValueError('Optimizer must have a "lr" attribute.')

    try:  # new API: schedule(epoch, current_lr)
        current_lr = float(K.get_value(self.model.optimizer.lr))
        lr = self.schedule(epoch, current_lr)
    except TypeError:  # Support for old API for backward compatibility
        lr = self.schedule(epoch)

    accepted_types = (float, np.float32, np.float64)
    if not isinstance(lr, accepted_types):
        raise ValueError('The output of the "schedule" function '
                         'should be float.')

    K.set_value(self.model.optimizer.lr, lr)
    if self.verbose > 0:
        print('\nEpoch %05d: LearningRateScheduler reducing learning '
              'rate to %s.' % (epoch + 1, lr))
def convert_all_kernels_in_model(model):
    """Converts all convolution kernels in a model from Theano to TensorFlow.

    Also works from TensorFlow to Theano.

    Arguments:
        model: target model for the conversion.
    """
    # Note: SeparableConvolution not included
    # since only supported by TF.
    convertible = {'Conv1D', 'Conv2D', 'Conv3D', 'Conv2DTranspose'}

    # Collect (variable, new value) pairs first so that all kernels are
    # written in a single batched assignment.
    assignments = [
        (layer.kernel, convert_kernel(K.get_value(layer.kernel)))
        for layer in model.layers
        if layer.__class__.__name__ in convertible
    ]
    K.batch_set_value(assignments)
def apply_mask(self, prediction_result):
    """Removes prediction output that corresponds to padded input.

    Arguments:
        prediction_result: The prediction array when ``self.output_shape``
            has one entry, otherwise a sequence indexed per output.

    Returns:
        The masked prediction for a single output, or a list of masked
        (and fully squeezed) predictions, one per output.
    """
    mask = K.get_value(self.padding_mask)
    assert len(mask.shape) == 1

    def keep_unpadded(values):
        """Select the rows of ``values`` whose mask entry is non-zero."""
        kept_rows = np.nonzero(mask[:len(values)])
        return np.take(values, kept_rows, axis=0)

    output_count = len(self.output_shape)
    if output_count == 1:
        kept = keep_unpadded(prediction_result)
        # Drop the leading axis only when it has length one.
        return np.squeeze(kept, axis=0) if kept.shape[0] == 1 else kept

    return [
        np.squeeze(keep_unpadded(prediction_result[index]))
        for index in range(output_count)
    ]
def test_save_weights_with_dynamic_loss_scaling(self, strategy_fn):
    """Check that dynamic loss-scale state is saved and restored with weights.

    The model is trained, its weights are saved, it is trained again so the
    loss scale state changes, and the weights are reloaded; the loss scale
    and its good-step counter must then match the values seen at save time.

    Arguments:
        strategy_fn: Callable returning the distribution strategy whose
            scope the model is built in.
    """
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed source; confirm against the original file.
    with context.eager_mode():
        strategy = strategy_fn()
        if (isinstance(strategy, mirrored_strategy.MirroredStrategy) and
                not context.executing_eagerly()):
            # TODO(b/121381184): Enable running the test in this case.
            return

        # Create and run model.
        with strategy.scope():
            x = layers.Input(shape=(2,), batch_size=2, dtype=dtypes.float32)
            y = AddLayer(assert_type=dtypes.float32)(x)
            model = models.Model(inputs=x, outputs=y)

            loss_scale = loss_scale_module.DynamicLossScale(
                initial_loss_scale=1., increment_period=2., multiplier=2.)
            opt = gradient_descent.SGD(1.)
            opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
            model.compile(optimizer=opt, loss='mse')

        # Run for 3 steps (6 examples with a batch size of 2)
        model.fit(np.zeros((6, 2)), np.zeros((6, 2)), batch_size=2)
        # State expected after three steps: scale 2, one good step counted.
        self.assertEqual(backend.get_value(loss_scale()), 2)
        self.assertEqual(backend.get_value(loss_scale._num_good_steps), 1)

        # Save model weights.
        save_prefix = os.path.join(self.get_temp_dir(), 'ckpt')
        model.save_weights(save_prefix)

        # Run model again for 1 step (2 examples with a batch size of 2)
        model.fit(np.zeros((2, 2)), np.zeros((2, 2)), batch_size=2)
        # The extra step changes the state so a restore is observable.
        self.assertEqual(backend.get_value(loss_scale()), 4)
        self.assertEqual(backend.get_value(loss_scale._num_good_steps), 0)

        # Load model weights and ensure loss scale weights are restored.
        model.load_weights(save_prefix)
        self.assertEqual(backend.get_value(loss_scale()), 2)
        self.assertEqual(backend.get_value(loss_scale._num_good_steps), 1)
def create_cnn_model(weights_path=None):
    """Build the CNN classifier and optionally load pretrained weights.

    The network is six convolution stages (32, 64, 128, 256, 512 and 1024
    filters) interleaved with zero padding, ``PoolHelper``, max pooling and
    ``LRN`` layers, followed by average pooling, flatten, dropout (0.5) and
    a softmax ``Dense`` layer with ``num_classes`` units. Every convolution
    and the final dense layer use an L2 kernel regularizer weighted by
    ``l2_regulizer``.

    Parameter counting: a convolution with an n*m kernel, l input feature
    maps and k output feature maps has n*m*l*k weights plus one bias per
    output map, i.e. (n*m*l + 1)*k parameters in total.

    References on parameter counting:
        https://towardsdatascience.com/understanding-and-calculating-the-number-of-parameters-in-convolution-neural-networks-cnns-fc88790d530d
        https://medium.com/@shashikachamod4u/calculate-output-size-and-number-of-trainable-parameters-in-a-convolution-layer-1d64cae6c009
        https://medium.com/@iamvarman/how-to-calculate-the-number-of-parameters-in-the-cnn-5bd55364d7ca
        https://cs231n.github.io/convolutional-networks/

    Arguments:
        weights_path: Optional path passed to ``load_weights``; when falsy
            no weights are loaded.

    Returns:
        The constructed ``Model``.
    """
    # NOTE(review): `input` shadows the Python builtin; left unchanged here.
    # The (1, IMG_WIDTH, IMG_HEIGHT) shape presumably means channels-first
    # single-channel images -- TODO confirm against the backend config.
    input = Input(shape=(1, IMG_WIDTH, IMG_HEIGHT))
    input_pad = ZeroPadding2D(padding=(3, 3))(input)

    # Stage 1: two 3x3 convolutions with 32 filters.
    conv1_1_3x3_s1 = Conv2D(32, (3, 3),
                            strides=(1, 1),
                            padding='same',
                            activation='relu',
                            name='conv1_1/3x3_s1',
                            kernel_regularizer=l2(l2_regulizer))(input_pad)
    conv1_2_3x3_s1 = Conv2D(
        32, (3, 3),
        strides=(1, 1),
        padding='same',
        activation='relu',
        name='conv1_2/3x3_s1',
        kernel_regularizer=l2(l2_regulizer))(conv1_1_3x3_s1)
    conv1_zero_pad = ZeroPadding2D(padding=(1, 1))(conv1_2_3x3_s1)
    pool1_helper = PoolHelper()(conv1_zero_pad)
    pool1_2_2x2_s1 = MaxPooling2D(pool_size=(2, 2),
                                  strides=(1, 1),
                                  padding='same',
                                  name='pool1/2x2_s1')(pool1_helper)
    pool1_norm1 = LRN(name='pool1/norm1')(pool1_2_2x2_s1)

    # Stage 2: 1x1 reduce then 3x3 convolution with 64 filters.
    conv2_1_3x3_reduce = Conv2D(
        64, (1, 1),
        padding='same',
        activation='relu',
        name='conv2_1/3x3_reduce',
        kernel_regularizer=l2(l2_regulizer))(pool1_norm1)
    conv2_2_3x3 = Conv2D(
        64, (3, 3),
        padding='same',
        activation='relu',
        name='conv2_2/3x3',
        kernel_regularizer=l2(l2_regulizer))(conv2_1_3x3_reduce)
    conv2_norm2 = LRN(name='conv2/norm2')(conv2_2_3x3)
    conv2_zero_pad = ZeroPadding2D(padding=(1, 1))(conv2_norm2)
    pool2_helper = PoolHelper()(conv2_zero_pad)
    pool2_3x3_s2 = MaxPooling2D(pool_size=(3, 3),
                                strides=(2, 2),
                                padding='same',
                                name='pool2/3x3_s2')(pool2_helper)

    # Stage 3: two 3x3 convolutions with 128 filters.
    conv3_1_3x3_s1 = Conv2D(128, (3, 3),
                            strides=(1, 1),
                            padding='same',
                            activation='relu',
                            name='conv3_1/3x3_s1',
                            kernel_regularizer=l2(l2_regulizer))(pool2_3x3_s2)
    conv3_2_3x3_s1 = Conv2D(
        128, (3, 3),
        strides=(1, 1),
        padding='same',
        activation='relu',
        name='conv3_2/3x3_s1',
        kernel_regularizer=l2(l2_regulizer))(conv3_1_3x3_s1)
    conv3_zero_pad = ZeroPadding2D(padding=(1, 1))(conv3_2_3x3_s1)
    pool3_helper = PoolHelper()(conv3_zero_pad)
    pool3_2_2x2_s1 = MaxPooling2D(pool_size=(2, 2),
                                  strides=(1, 1),
                                  padding='same',
                                  name='pool3/2x2_s1')(pool3_helper)
    pool3_norm1 = LRN(name='pool3/norm1')(pool3_2_2x2_s1)

    # Stage 4: 1x1 reduce then 3x3 convolution with 256 filters.
    conv4_1_3x3_reduce = Conv2D(
        256, (1, 1),
        padding='same',
        activation='relu',
        name='conv4_1/3x3_reduce',
        kernel_regularizer=l2(l2_regulizer))(pool3_norm1)
    conv4_2_3x3 = Conv2D(
        256, (3, 3),
        padding='same',
        activation='relu',
        name='conv4_2/3x3',
        kernel_regularizer=l2(l2_regulizer))(conv4_1_3x3_reduce)
    conv4_norm2 = LRN(name='conv4/norm2')(conv4_2_3x3)
    conv4_zero_pad = ZeroPadding2D(padding=(1, 1))(conv4_norm2)
    pool4_helper = PoolHelper()(conv4_zero_pad)
    pool4_3x3_s2 = MaxPooling2D(pool_size=(3, 3),
                                strides=(2, 2),
                                padding='same',
                                name='pool4/3x3_s2')(pool4_helper)

    # Stage 5: two 3x3 convolutions with 512 filters.
    conv5_1_3x3_s1 = Conv2D(512, (3, 3),
                            strides=(1, 1),
                            padding='same',
                            activation='relu',
                            name='conv5_1/3x3_s1',
                            kernel_regularizer=l2(l2_regulizer))(pool4_3x3_s2)
    conv5_2_3x3_s1 = Conv2D(
        512, (3, 3),
        strides=(1, 1),
        padding='same',
        activation='relu',
        name='conv5_2/3x3_s1',
        kernel_regularizer=l2(l2_regulizer))(conv5_1_3x3_s1)
    conv5_zero_pad = ZeroPadding2D(padding=(1, 1))(conv5_2_3x3_s1)
    pool5_helper = PoolHelper()(conv5_zero_pad)
    pool5_2_2x2_s1 = MaxPooling2D(pool_size=(2, 2),
                                  strides=(1, 1),
                                  padding='same',
                                  name='pool5/2x2_s1')(pool5_helper)
    pool5_norm1 = LRN(name='pool5/norm1')(pool5_2_2x2_s1)

    # Stage 6: 1x1 reduce then 3x3 convolution with 1024 filters.
    conv6_1_3x3_reduce = Conv2D(
        1024, (1, 1),
        padding='same',
        activation='relu',
        name='conv6_1/3x3_reduce',
        kernel_regularizer=l2(l2_regulizer))(pool5_norm1)
    conv6_2_3x3 = Conv2D(
        1024, (3, 3),
        padding='same',
        activation='relu',
        name='conv6_2/3x3',
        kernel_regularizer=l2(l2_regulizer))(conv6_1_3x3_reduce)
    conv6_norm2 = LRN(name='conv6/norm2')(conv6_2_3x3)
    conv6_zero_pad = ZeroPadding2D(padding=(1, 1))(conv6_norm2)
    pool6_helper = PoolHelper()(conv6_zero_pad)
    pool6_3x3_s2 = MaxPooling2D(pool_size=(3, 3),
                                strides=(2, 2),
                                padding='same',
                                name='pool6/3x3_s2')(pool6_helper)

    # Classifier head: average pool -> flatten -> dropout -> softmax.
    pool7_2x2_s1 = AveragePooling2D(pool_size=(2, 2),
                                    strides=(1, 1),
                                    name='pool7/2x2_s1')(pool6_3x3_s2)
    loss_flat = Flatten()(pool7_2x2_s1)
    pool7_drop_2x2_s1 = Dropout(rate=0.5)(loss_flat)
    loss_classifier = Dense(
        num_classes,
        name='loss3/classifier',
        kernel_regularizer=l2(l2_regulizer))(pool7_drop_2x2_s1)
    loss_classifier_act = Activation('softmax', name='prob')(loss_classifier)

    mynet = Model(inputs=input, outputs=[loss_classifier_act])

    if weights_path:
        mynet.load_weights(weights_path)

    # NOTE(review): the nesting of the block below relative to
    # `if weights_path:` and of the final session run relative to the loop
    # was reconstructed from a whitespace-collapsed source; confirm.
    if keras.backend.backend() == 'tensorflow':
        # convert the convolutional kernels for tensorflow
        ops = []
        for layer in mynet.layers:
            if layer.__class__.__name__ == 'Conv2D':
                original_w = K.get_value(layer.kernel)
                converted_w = convert_kernel(original_w)
                ops.append(tf.assign(layer.kernel, converted_w).op)
        # Run all collected assign ops in the Keras session.
        K.get_session().run(ops)

    return mynet
def iterator_predict_loop(model, inputs, steps, verbose=0):
    """Predict function for eager execution when input is dataset iterator.

    Arguments:
        model: Instance of `Model`.
        inputs: Input dataset iterator.
        steps: Total number of steps (batches of samples) before declaring
            `_predict_loop` finished.
        verbose: Verbosity mode.

    Returns:
        Array of predictions (if the model has a single output)
        or list of arrays of predictions (if the model has multiple outputs).
        An empty list is returned if the iterator yields no batch at all.

    Raises:
        ValueError: If an element produced by the iterator is not a list or
            tuple of exactly two items (input and target).
    """
    assert isinstance(inputs, iterator_ops.EagerIterator)
    outs = []
    if verbose == 1:
        progbar = generic_utils.Progbar(target=steps)
    for step_index in range(steps):
        # Get data from the iterator.
        try:
            next_element = inputs.get_next()
        except errors.OutOfRangeError:
            logging.warning(
                'Your dataset iterator ran out of data; '
                'interrupting prediction. Make sure that your '
                'dataset can generate at least `steps` '
                'batches (in this case, %d batches).', steps)
            break

        if (not isinstance(next_element, (list, tuple)) or
                len(next_element) != 2):
            # Wrap the element in a 1-tuple: `'%s' % some_tuple` would treat
            # the tuple as the argument list and raise TypeError (e.g. for a
            # 3-tuple) instead of the intended ValueError.
            raise ValueError(
                'Please provide data as a list or tuple of 2 elements '
                ' - input and target pair. Received %s. We do not use the '
                '`target` value here.' % (next_element,))
        x, _ = next_element

        # Validate and standardize data.
        x, _, _ = model._standardize_user_data(x)

        if model._expects_training_arg:
            batch_outs = model.call(x[0] if len(x) == 1 else x,
                                    training=False)
        else:
            batch_outs = model.call(x[0] if len(x) == 1 else x)
        if not isinstance(batch_outs, list):
            batch_outs = [batch_outs]

        # We collect the results from every step and then concatenate them
        # once in the end. This is an expensive process. We are doing this
        # because we do not know the number of samples beforehand.
        if step_index == 0:
            for _ in batch_outs:
                outs.append([])
        for i, batch_out in enumerate(batch_outs):
            outs[i].append(backend.get_value(batch_out))
        if verbose == 1:
            progbar.update(step_index + 1)

    # Join the per-step arrays of each model output along the batch axis.
    for i, out in enumerate(outs):
        outs[i] = np.concatenate(tuple(out), axis=0)
    if len(outs) == 1:
        return outs[0]
    return outs
def iterator_predict_loop(model, inputs, steps, verbose=0):
    """Predict function for eager execution when input is dataset iterator.

    Arguments:
        model: Instance of `Model`.
        inputs: Input dataset iterator.
        steps: Total number of steps (batches of samples) before declaring
            `_predict_loop` finished.
        verbose: Verbosity mode.

    Returns:
        Array of predictions (if the model has a single output)
        or list of arrays of predictions (if the model has multiple outputs).
        An empty list is returned if the iterator yields no batch at all.

    Raises:
        ValueError: If `inputs.output_shapes` is not a list or tuple, or
            has more than two entries.
    """
    assert isinstance(inputs, iterator_ops.EagerIterator)
    if (not isinstance(inputs.output_shapes, (list, tuple)) or
            len(inputs.output_shapes) > 2):
        # Wrap the shapes in a 1-tuple: `'%s' % some_tuple` would treat the
        # tuple as the argument list and raise TypeError for a tuple with
        # more than one entry instead of the intended ValueError.
        raise ValueError(
            'Please provide data as a list or tuple of 1 or 2 elements '
            ' - input or input and target pair. Received %s. We do not use '
            'the `target` value here.' % (inputs.output_shapes,))
    outs = []
    if verbose == 1:
        progbar = generic_utils.Progbar(target=steps)
    for step_index in range(steps):
        # Get data from the iterator.
        try:
            next_element = inputs.get_next()
        except errors.OutOfRangeError:
            logging.warning(
                'Your dataset iterator ran out of data; '
                'interrupting prediction. Make sure that your '
                'dataset can generate at least `steps` '
                'batches (in this case, %d batches).', steps)
            break

        # expects a tuple, where first element of tuple represents inputs
        x = next_element[0]

        # Validate and standardize data.
        x, _, _ = model._standardize_user_data(x)
        x = training_utils.cast_if_floating_dtype(x)

        if model._expects_training_arg:
            batch_outs = model.call(x[0] if len(x) == 1 else x,
                                    training=False)
        else:
            batch_outs = model.call(x[0] if len(x) == 1 else x)
        if not isinstance(batch_outs, list):
            batch_outs = [batch_outs]

        # We collect the results from every step and then concatenate them
        # once in the end. This is an expensive process. We are doing this
        # because we do not know the number of samples beforehand.
        if step_index == 0:
            for _ in batch_outs:
                outs.append([])
        for i, batch_out in enumerate(batch_outs):
            outs[i].append(backend.get_value(batch_out))
        if verbose == 1:
            progbar.update(step_index + 1)

    # Join the per-step arrays of each model output along the batch axis.
    for i, out in enumerate(outs):
        outs[i] = np.concatenate(tuple(out), axis=0)
    if len(outs) == 1:
        return outs[0]
    return outs
def test_dynamic_loss_scaling(self, strategy_fn, cloning=True):
    """Exercise DynamicLossScale through Model.fit with and without NaNs.

    The model's gradient is checked against ``expected_gradient`` by a
    Lambda layer, and ``have_nan_gradients`` toggles NaN gradient
    injection. The test adjusts both between ``fit`` calls and asserts the
    value of ``layer.v`` after each call.

    Arguments:
        strategy_fn: Callable returning the distribution strategy whose
            scope the model is built in.
        cloning: Forwarded to ``model.compile``.
    """
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed source; confirm against the original file.
    strategy = strategy_fn()
    initial_loss_scale = 2.
    batch_size = 4
    expected_gradient = backend.variable([initial_loss_scale / batch_size],
                                         dtype=dtypes.float16)
    # If this variable is set to True, the model below will have NaN gradients
    have_nan_gradients = backend.variable(False, dtype=dtypes.bool)
    with strategy.scope():
        with policy.policy_scope(policy.Policy('infer_float32_vars')):
            x = layers.Input(shape=(1, ),
                             batch_size=batch_size,
                             dtype=dtypes.float16)
            layer = AddLayer(assert_type=dtypes.float16)
            y = layer(x)
            identity_with_nan_grads = (
                mp_test_util.create_identity_with_nan_gradients_fn(
                    have_nan_gradients))
            y = core.Lambda(identity_with_nan_grads)(y)
            identity_with_grad_check_fn = (
                mp_test_util.create_identity_with_grad_check_fn(
                    expected_dtype=dtypes.float16,
                    expected_gradient=expected_gradient))
            y = core.Lambda(identity_with_grad_check_fn)(y)
            y = math_ops.cast(y, dtypes.float32)
            model = models.Model(inputs=x, outputs=y)

            def loss_fn(y_true, y_pred):
                # The loss ignores the targets and is the mean prediction.
                del y_true
                return math_ops.reduce_mean(y_pred)

            opt = gradient_descent.SGD(1.)
            loss_scale = loss_scale_module.DynamicLossScale(
                initial_loss_scale=initial_loss_scale, increment_period=2)
            opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
            model.compile(opt, loss=loss_fn, cloning=cloning)

    self.assertEqual(backend.eval(layer.v), 1)
    x = np.ones((batch_size, 1))
    y = np.ones((batch_size, 1))
    # A single batch, so each `fit` call below performs one step.
    dataset = dataset_ops.Dataset.from_tensor_slices(
        (x, y)).batch(batch_size)
    model.fit(dataset)
    # The variables starts with 1 and has a gradient of 1, so will go down by 1
    # each step.
    self.assertEqual(backend.eval(layer.v), 0)

    model.fit(dataset)
    self.assertEqual(backend.eval(layer.v), -1)

    # There have been two steps without NaNs, so the loss scale will double
    backend.set_value(expected_gradient,
                      backend.get_value(expected_gradient * 2))
    model.fit(dataset)
    self.assertEqual(backend.eval(layer.v), -2)

    # Next test with NaN gradients.
    backend.set_value(have_nan_gradients, True)
    model.fit(dataset)
    # Variable should not be updated
    self.assertEqual(backend.eval(layer.v), -2)

    # Test with finite gradients again
    backend.set_value(have_nan_gradients, False)
    # The loss scale will be halved due to the NaNs, so the gradient will also
    # be halved
    backend.set_value(expected_gradient,
                      backend.get_value(expected_gradient / 2))
    model.fit(dataset)
    self.assertEqual(backend.eval(layer.v), -3)
def train_net(args):
    """Train the classifier, periodically validating and checkpointing.

    Builds the training dataset and model, resumes from the latest
    checkpoint in the run directory (or from ``args.pretrained``), then
    trains batch by batch. Every ``args.verbose`` steps the extractor is
    evaluated on each validation set and the weights are saved when the
    summed accuracy improves on the best score so far.

    Arguments:
        args: Parsed options. Fields read here: ``models_root``,
            ``network``, ``loss``, ``dataset``, ``pretrained``,
            ``pretrained_epoch``, ``lr``, ``lr_steps``, ``mom`` and
            ``verbose``.
    """
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed source; confirm against the original file.
    data_dir = config.dataset_path
    image_size = config.image_shape[0:2]
    # Only square two-dimensional image sizes are supported.
    assert len(image_size) == 2
    assert image_size[0] == image_size[1]
    print('image_size', image_size)
    print('num_classes', config.num_classes)
    training_path = os.path.join(data_dir, "train.tfrecords")

    print('Called with argument:', args, config)
    train_dataset, batches_per_epoch = data_input.training_dataset(
        training_path, default.per_batch_size)

    extractor, classifier = build_model((image_size[0], image_size[1], 3),
                                        args)

    global_step = 0
    # Checkpoints are written as <models_root>/<network>-<loss>-<dataset>/
    # model-<step>.ckpt.
    ckpt_path = os.path.join(
        args.models_root,
        '%s-%s-%s' % (args.network, args.loss, args.dataset),
        'model-{step:04d}.ckpt')
    ckpt_dir = os.path.dirname(ckpt_path)
    print('ckpt_path', ckpt_path)
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)
    if len(args.pretrained) == 0:
        latest = tf.train.latest_checkpoint(ckpt_dir)
        if latest:
            # Recover the step count from the checkpoint file name.
            global_step = int(latest.split('-')[-1].split('.')[0])
            classifier.load_weights(latest)
    else:
        print('loading', args.pretrained, args.pretrained_epoch)
        # NOTE(review): os.path.join treats '-' and '.ckpt' as separate path
        # components, giving "<pretrained>/-/<epoch>/.ckpt"; a
        # "<pretrained>-<epoch>.ckpt" file name was presumably intended.
        # Confirm before changing.
        load_path = os.path.join(args.pretrained, '-', args.pretrained_epoch,
                                 '.ckpt')
        classifier.load_weights(load_path)

    initial_epoch = global_step // batches_per_epoch
    rest_batches = global_step % batches_per_epoch

    # Pair each decay step with the learning rate that applies from it on:
    # args.lr scaled by 0.1 once more per listed step.
    lr_decay_steps = [(int(x), args.lr * np.power(0.1, i + 1))
                      for i, x in enumerate(args.lr_steps.split(','))]
    print('lr_steps', lr_decay_steps)

    valid_datasets = data_input.load_valid_set(data_dir, config.val_targets)

    # NOTE(review): the loss is CategoricalCrossentropy while the metric is
    # SparseCategoricalAccuracy; these expect different label encodings --
    # confirm which one matches the dataset labels.
    classifier.compile(
        optimizer=keras.optimizers.SGD(lr=args.lr, momentum=args.mom),
        loss=keras.losses.CategoricalCrossentropy(from_logits=True),
        metrics=[keras.metrics.SparseCategoricalAccuracy()])
    classifier.summary()

    tensor_board = keras.callbacks.TensorBoard(ckpt_dir)
    tensor_board.set_model(classifier)

    train_names = ['train_loss', 'train_acc']
    train_results = []
    highest_score = 0
    for epoch in range(initial_epoch, default.end_epoch):
        # NOTE(review): `rest_batches` is never reset, so every epoch (not
        # only the resumed one) starts at that offset, and the upper bound is
        # `batches_per_epoch + 1` -- confirm both are intended.
        for batch in range(rest_batches, batches_per_epoch + 1):
            utils.update_learning_rate(classifier, lr_decay_steps,
                                       global_step)
            train_results = classifier.train_on_batch(train_dataset,
                                                      reset_metrics=False)
            global_step += 1
            if global_step % 1000 == 0:
                print('lr-batch-epoch:',
                      float(K.get_value(classifier.optimizer.lr)), batch,
                      epoch)
            if global_step >= 0 and global_step % args.verbose == 0:
                acc_list = []
                for key in valid_datasets:
                    data_set, data_set_flip, is_same_list = valid_datasets[
                        key]
                    embeddings = extractor.predict(data_set)
                    embeddings_flip = extractor.predict(data_set_flip)
                    embeddings_parts = [embeddings, embeddings_flip]
                    # Mean L2 norm over all embeddings of both parts.
                    x_norm = 0.0
                    x_norm_cnt = 0
                    for part in embeddings_parts:
                        for i in range(part.shape[0]):
                            embedding = part[i]
                            norm = np.linalg.norm(embedding)
                            x_norm += norm
                            x_norm_cnt += 1
                    x_norm /= x_norm_cnt
                    # Sum the two parts, then normalize before evaluation.
                    embeddings = embeddings_parts[0] + embeddings_parts[1]
                    embeddings = sklearn.preprocessing.normalize(embeddings)
                    print(embeddings.shape)
                    _, _, accuracy, val, val_std, far = verification.evaluate(
                        embeddings, is_same_list, folds=10)
                    acc, std = np.mean(accuracy), np.std(accuracy)

                    print('[%s][%d]XNorm: %f' % (key, batch, x_norm))
                    print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
                          (key, batch, acc, std))
                    acc_list.append(acc)

                if len(acc_list) > 0:
                    # The score is the sum of accuracies over all
                    # validation sets.
                    score = sum(acc_list)
                    if highest_score == 0:
                        # First evaluation only records the baseline; no
                        # checkpoint is written here.
                        highest_score = score
                    elif highest_score >= score:
                        print('\nStep %05d: score did not improve from %0.5f'
                              % (global_step, highest_score))
                    else:
                        path = ckpt_path.format(step=global_step)
                        print(
                            '\nStep %05d: score improved from %0.5f to %0.5f,'
                            ' saving model to %s' %
                            (global_step, highest_score, score, path))
                        highest_score = score
                        classifier.save_weights(path)

        utils.write_log(tensor_board, train_names, train_results, epoch)
        classifier.reset_metrics()
def run_skf_with_training_error(model_mode,
                                loss_mode,
                                fl,
                                fl_store,
                                hparams,
                                skf_file,
                                label_type='cutoff',
                                scoring='mse',
                                skf_sheet=None,
                                te_sheet=None,
                                k_folds=10,
                                k_shuffle=True,
                                save_model=False,
                                save_model_name=None,
                                save_model_dir=None,
                                plot_name=None):
    '''
    Train one model per pre-split fold, record each fold model's error on the
    full dataset `fl` in the `te_sheet` workbook, and write the pooled
    validation predictions plus summary metrics to `skf_file`.

    :param model_mode: Passed as `mode` to the model constructors; also used in
    the saved-model names and as the (prefix of the) new worksheet name.
    :param loss_mode: Selects the model class. One of 'normal', 'hul', 'ann',
    'p_model', 'svr', 'dtr' or 'mimosvr'.
    :param fl: Features-labels object for the whole dataset. Every fold's model
    is evaluated on it to obtain the per-fold error sheets.
    :param fl_store: Iterable of (training fl, validation fl) tuples, one per fold.
    :param hparams: hparams dict containing hyperparameters information
    :param skf_file: Path of the excel workbook the validation results are
    written into. It is opened with load_workbook, so it must already exist.
    :param label_type: If 'cutoff', the validation predictions are clamped so
    that columns 0, 1, 2 are non-decreasing.
    :param scoring: 'mse' or 're'. Selects which overall metric is returned.
    :param skf_sheet: Suffix appended to model_mode to name the new worksheet.
    If None, model_mode alone is used.
    :param te_sheet: Path of the workbook that receives one new sheet per fold;
    the workbook is saved back to this path.
    :param k_folds: Only used in the printed per-fold summary.
    :param k_shuffle: Not used inside this function.
    :param save_model: Whether to save each fold's trained model.
    :param save_model_name: Optional prefix for the saved model file names.
    :param save_model_dir: Directory prefix (concatenated as a string) for saved models.
    :param plot_name: If given, prefix of the per-fold training plot file name
    passed to train_model.
    :return: mse_full if scoring == 'mse', the mean relative error if scoring == 're'.
    :raises KeyError: If loss_mode or scoring is not one of the supported values.
    :raises TypeError: If loss_mode is 'svr' or 'mimosvr' and fl.normalise_labels is falsy.
    '''
    fn = 6
    numel = 3
    # Run k model instance to perform skf
    predicted_labels_store = []
    mse_store = []
    mse_norm_store = []
    folds = []
    val_idx = []
    val_features_c = []
    val_labels = []
    column_headers = fl.labels_names
    wb = openpyxl.load_workbook(te_sheet)
    msee_store = []
    mre_store = []

    for fold, fl_tuple in enumerate(fl_store):
        instance_start = time.time()
        (ss_fl, i_ss_fl
         ) = fl_tuple  # ss_fl is training fl, i_ss_fl is validation fl
        # One new worksheet per fold, named after the fold index.
        wb.create_sheet('{}'.format(fold))
        ws = wb[wb.sheetnames[-1]]

        # Set up model
        # NOTE(review): `labels_norm` is not defined in this function --
        # presumably a module-level name; confirm, otherwise the 'normal' and
        # 'hul' branches raise NameError.
        if loss_mode == 'normal':
            sess = tf.compat.v1.Session()
            # sess = tf.Session()
            K.set_session(sess)
            model = MTmodel(fl=ss_fl,
                            mode=model_mode,
                            hparams=hparams,
                            labels_norm=labels_norm)
        elif loss_mode == 'hul':
            model = HULMTmodel(fl=ss_fl,
                               mode=model_mode,
                               hparams=hparams,
                               labels_norm=labels_norm)
            print('HUL Standard Deviation Values:')
            # sqrt(exp(log_var)) for each log-variance of the last layer.
            print([
                np.exp(K.get_value(log_var[0]))**0.5
                for log_var in model.model.layers[-1].log_vars
            ])
        elif loss_mode == 'ann':
            sess = tf.compat.v1.Session()
            # sess = tf.Session()
            K.set_session(sess)
            model = Kmodel(fl=ss_fl, mode=model_mode, hparams=hparams)
        elif loss_mode == 'p_model':
            model = Pmodel(fl=ss_fl, mode=model_mode, hparams=hparams)
        elif loss_mode == 'svr':
            if not fl.normalise_labels:
                raise TypeError(
                    'fl labels are not normalised. For SVR, the labels must be normalised.'
                )
            model = SVRmodel(fl=ss_fl,
                             epsilon=hparams['epsilon'],
                             c=hparams['c'])
        elif loss_mode == 'dtr':
            #if not fl.normalise_labels:
            #raise TypeError('fl labels are not normalised. For SVR, the labels must be normalised.')
            model = DTRmodel(fl=ss_fl,
                             max_depth=hparams['max_depth'],
                             num_est=hparams['num_est'])
        elif loss_mode == 'mimosvr':
            if not fl.normalise_labels:
                raise TypeError(
                    'fl labels are not normalised. For SVR, the labels must be normalised.'
                )
            model = MIMOSVRmodel(fl=ss_fl, gamma=hparams['gamma'])
        else:
            raise KeyError('loss_mode ' + loss_mode +
                           'is not a valid selection for loss mode.')

        # Train model and save model training loss vs epoch plot if plot_name is given, else no plot will be saved
        if plot_name:
            model.train_model(ss_fl,
                              i_ss_fl,
                              plot_name='{}_fold_{}.png'.format(
                                  plot_name, fold))
        else:
            model.train_model(ss_fl, i_ss_fl)

        # Predict on the full dataset with this fold's model.
        p_y, _, _ = model.eval(fl)
        if fl.normalise_labels:
            p_y = fl.labels_scaler.inverse_transform(p_y)
        # Clamp so that column 1 <= column 2 and column 0 <= column 1.
        for row, p_label in enumerate(p_y.tolist()):
            if p_label[1] > p_label[2]:
                p_y[row, 1] = p_y[row, 2]
            if p_label[0] > p_y[row, 1]:
                p_y[row, 0] = p_y[row, 1]
        # Element-wise squared error and relative error against the labels.
        se_store = (fl.labels - p_y)**2
        re_store = np.abs(fl.labels - p_y) / fl.labels

        df = pd.DataFrame(
            data=np.concatenate((fl.labels, p_y, se_store, re_store), axis=1),
            index=list(range(1, 1 + fl.count)),
            columns=list(column_headers) +
            ['P_{}'.format(col) for col in column_headers] +
            ['SE_{}'.format(col) for col in column_headers] +
            ['RE_{}'.format(col) for col in column_headers])
        print_df_to_excel(df=df, ws=ws)

        # Column where the per-fold summary values are written.
        # NOTE(review): the offset looks hand-tuned to the sheet layout -- confirm.
        col = fn + 1 + 1 + 2 * numel + 3
        msee_store.append(np.mean(se_store))
        mre_store.append(np.mean(re_store))
        ws.cell(1, col).value = 'MSE'
        ws.cell(1, col + 1).value = msee_store[-1]
        ws.cell(2, col).value = 'MRE'
        ws.cell(2, col + 1).value = mre_store[-1]
        ws.cell(3, col).value = 'ARE'
        # NOTE(review): `mare_store` is never created or appended to in this
        # function -- presumably defined elsewhere; confirm, otherwise this
        # line raises NameError.
        ws.cell(3, col + 1).value = mare_store[-1]

        # Evaluation
        predicted_labels, mse, mse_norm = model.eval(i_ss_fl)
        if fl.normalise_labels:
            predicted_labels = fl.labels_scaler.inverse_transform(
                predicted_labels)
        if label_type == 'cutoff':
            # Same clamping as above, applied to the validation predictions.
            for row, p_label in enumerate(predicted_labels.tolist()):
                if p_label[1] > p_label[2]:
                    predicted_labels[row, 1] = predicted_labels[row, 2]
                if p_label[0] > predicted_labels[row, 1]:
                    predicted_labels[row, 0] = predicted_labels[row, 1]
            pass
        predicted_labels_store.extend(predicted_labels)
        mse_store.append(mse)
        mse_norm_store.append(mse_norm)
        '''
        if fold == k_folds-1:
            stringlist = []
            model.model.summary(print_fn=lambda x: stringlist.append(x))
            short_model_summary = "\n".join(stringlist)
            print(short_model_summary)
        '''
        # Saving model
        if save_model:
            # Set save_model_name
            if isinstance(save_model_name, str):
                save_model_name1 = save_model_name + '_' + model_mode + '_' + str(
                    fold + 1)
            else:
                save_model_name1 = model_mode + '_' + str(fold + 1)

            # Save model
            print('Saving instance {} model in {}'.format(
                fold + 1, save_model_dir + save_model_name1 + '.h5'))
            if loss_mode == 'normal' or loss_mode == 'ann':
                model.model.save(save_model_dir + save_model_name1 + '.h5')
            elif loss_mode == 'hul':
                model.prediction_model.save(save_model_dir + save_model_name1 +
                                            '.h5')
            elif loss_mode == 'svr' or loss_mode == 'dtr':
                pickle.dump(
                    Predict_SVR_DTR(model=model.model,
                                    labels_scaler=model.labels_scaler),
                    open(save_model_dir + save_model_name1 + '.pkl', 'wb'))

        # Need to put the next 3 lines if not memory will run out
        del model
        if loss_mode == 'normal' or loss_mode == 'ann':
            # `sess` only exists for the branches that created a session above.
            K.clear_session()
            sess.close()
        gc.collect()

        # Preparing data to put into new_df that consists of all the validation dataset and its predicted labels
        folds.extend(
            [fold] * i_ss_fl.count
        )  # Make a col that contains the fold number for each example
        if len(val_features_c):
            val_features_c = np.concatenate(
                (val_features_c, i_ss_fl.features_c), axis=0)
        else:
            # First fold: nothing accumulated yet.
            val_features_c = i_ss_fl.features_c
        val_labels.extend(i_ss_fl.labels)
        val_idx.extend(i_ss_fl.idx)

        # Printing one instance summary.
        instance_end = time.time()
        print(
            '\nFor k-fold run {} out of {}. Each fold has {} examples. Model is {} with {} loss. Time taken for '
            'instance = {}\n'
            'Post-training results: \nmse = {}, mse_norm = {}. Scoring is {}\n'
            '####################################################################################################'
            .format(fold + 1, k_folds, i_ss_fl.count, model_mode, loss_mode,
                    instance_end - instance_start, mse, mse_norm, scoring))

    # Per-fold error summary goes into the first sheet of the te_sheet workbook.
    ws = wb[wb.sheetnames[0]]
    df = pd.DataFrame(data=np.array([msee_store, mre_store]).T,
                      columns=['mse', 're'],
                      index=range(1, 1 + len(msee_store)))
    df.insert(0, 'Fold', list(range(len(fl_store))))
    print_df_to_excel(df=df, ws=ws)
    wb.save(te_sheet)

    # Averages of the per-fold validation metrics.
    mse_avg = np.average(mse_store)
    mse_norm_avg = np.average(mse_norm_store)
    # Mean relative error over all pooled validation predictions.
    # (This local name shadows the stdlib `re` module inside the function.)
    re = np.average(
        np.abs(np.array(val_labels) - np.array(predicted_labels_store)) /
        np.array(val_labels))

    # Calculating metrics based on complete validation prediction
    mse_full = mean_squared_error(val_labels, predicted_labels_store)
    try:
        mse_norm_full = mean_squared_error(
            fl.labels_scaler.transform(val_labels),
            fl.labels_scaler.transform(predicted_labels_store))
    except AttributeError:
        # Raised when `fl` has no usable labels_scaler; fall back to raw MSE.
        mse_norm_full = mse_full

    # Creating dataframe to print into excel later.
    new_df = np.concatenate(
        (
            np.array(folds)[:, None],  # Convert 1d list to col. vector
            val_features_c,
            np.array(val_labels),
            np.array(predicted_labels_store)),
        axis=1)
    # NOTE(review): `headers` is only assigned for label_type 'points' or
    # 'cutoff'; any other fl.label_type leads to a NameError below.
    if fl.label_type == 'points':
        predicted_labels_name = list(map(str, np.arange(2, 101)))
        predicted_labels_name = [f'P_{x}' for x in predicted_labels_name]
        headers = ['folds'] + \
                  list(map(str, fl.features_c_names)) + \
                  list(map(str, np.arange(2,101))) + \
                  predicted_labels_name
    elif fl.label_type == 'cutoff':
        predicted_labels_name = list(fl.labels_names)
        predicted_labels_name = [f'P_{x}' for x in predicted_labels_name]
        headers = ['folds'] + \
                  list(map(str, fl.features_c_names)) + \
                  list(fl.labels_names) + \
                  predicted_labels_name

    # val_idx is the original position of the example in the data_loader
    new_df = pd.DataFrame(data=new_df, columns=headers, index=val_idx)

    print('Writing into' + skf_file)
    wb = load_workbook(skf_file)
    # Creating new worksheet. Even if SNN worksheet already exists, a new SNN1 ws will be created and so on
    if skf_sheet is None:
        wb.create_sheet(model_mode)
    else:
        wb.create_sheet(model_mode + skf_sheet)
    sheet_name = wb.sheetnames[
        -1]  # Taking the ws name from the back ensures that if SNN1 is the new ws, it works

    # Writing hparam dataframe first
    pd_writer = pd.ExcelWriter(skf_file, engine='openpyxl')
    pd_writer.book = wb
    pd_writer.sheets = dict((ws.title, ws) for ws in wb.worksheets)
    new_df.to_excel(pd_writer, sheet_name)

    start_col = len(new_df.columns) + 4
    # Wrapping each value in a Series lets hparams of different lengths share one frame.
    hparams = pd.DataFrame(dict([(k, Series(v)) for k, v in hparams.items()]))
    hparams.to_excel(pd_writer, sheet_name, startrow=0, startcol=start_col - 1)
    start_row = 5

    # Writing other subset split, instance per run, and bounds
    ws = wb[sheet_name]
    headers = ['mse', 'mse_norm', 're']
    values = [mse_avg, mse_norm_avg]
    values_full = [mse_full, mse_norm_full, re]
    print_array_to_excel(np.array(headers), (1 + start_row, start_col + 1),
                         ws,
                         axis=1)
    print_array_to_excel(np.array(values), (2 + start_row, start_col + 1),
                         ws,
                         axis=1)
    print_array_to_excel(np.array(values_full),
                         (3 + start_row, start_col + 1),
                         ws,
                         axis=1)
    ws.cell(2 + start_row, start_col).value = 'Folds avg'
    ws.cell(3 + start_row, start_col).value = 'Overall'
    pd_writer.save()
    pd_writer.close()
    wb.close()

    if scoring == 'mse':
        return mse_full
    elif scoring == 're':
        return re
    else:
        raise KeyError('Scoring function {} is not valid'.format(scoring))
def test_dynamic_loss_scaling(self,
                              strategy_fn,
                              pass_loss_scale_to_policy=False,
                              get_config=False):
    """Checks that a dynamic loss scale grows, skips NaN steps and shrinks.

    The model contains an identity layer that asserts the incoming gradient
    equals `expected_gradient`, and another identity layer that produces NaN
    gradients while `have_nan_gradients` is True. The test fits one batch at
    a time and checks the single MultiplyLayer weight after every step.

    Args:
      strategy_fn: Callable returning the distribution strategy to test under.
      pass_loss_scale_to_policy: If True the loss scale is given to the
        mixed-precision policy; otherwise the optimizer is wrapped in a
        LossScaleOptimizer explicitly.
      get_config: If True the model is round-tripped through
        get_config()/from_config() before compiling.
    """
    strategy = strategy_fn()
    initial_loss_scale = 2.
    batch_size = 4
    loss_scale = loss_scale_module.DynamicLossScale(
        initial_loss_scale=initial_loss_scale, increment_period=2)
    expected_gradient = backend.variable([initial_loss_scale / batch_size],
                                         dtype=dtypes.float16)
    # If this variable is set to True, the model below will have NaN gradients
    have_nan_gradients = backend.variable(False, dtype=dtypes.bool)
    with strategy.scope():
        opt = gradient_descent.SGD(1.)
        if pass_loss_scale_to_policy:
            p = policy.Policy('mixed_float16', loss_scale=loss_scale)
        else:
            p = policy.Policy('mixed_float16', loss_scale=None)
            opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
        with policy.policy_scope(p):
            x = layers.Input(shape=(1, ),
                             batch_size=batch_size,
                             dtype=dtypes.float16)
            layer = mp_test_util.MultiplyLayer(assert_type=dtypes.float16)
            y = layer(x)
            identity_with_nan_grads = (
                mp_test_util.create_identity_with_nan_gradients_fn(
                    have_nan_gradients))
            y = core.Lambda(identity_with_nan_grads)(y)
            identity_with_grad_check_fn = (
                mp_test_util.create_identity_with_grad_check_fn(
                    expected_dtype=dtypes.float16,
                    expected_gradient=expected_gradient))
            y = core.Lambda(identity_with_grad_check_fn)(y)
            model = models.Model(inputs=x, outputs=y)
            if get_config:
                config = model.get_config()
                model = model.__class__.from_config(
                    config,
                    custom_objects={
                        'MultiplyLayer': mp_test_util.MultiplyLayer
                    })
                # Re-bind `layer` to the MultiplyLayer of the rebuilt model;
                # the unpacking also asserts there is exactly one such layer.
                (layer, ) = (
                    layer for layer in model.layers
                    if isinstance(layer, mp_test_util.MultiplyLayer))

            def loss_fn(y_true, y_pred):
                # The labels are ignored; the loss is the mean model output.
                del y_true
                return math_ops.reduce_mean(y_pred)

            model.compile(opt,
                          loss=loss_fn,
                          run_eagerly=testing_utils.should_run_eagerly())

    self.assertEqual(backend.eval(layer.v), 1)
    x = np.ones((batch_size, 1))
    y = np.ones((batch_size, 1))
    # A single batch, so each fit() call performs exactly one step.
    dataset = dataset_ops.Dataset.from_tensor_slices(
        (x, y)).batch(batch_size)
    model.fit(dataset)
    # The variables starts with 1 and has a gradient of 1, so will go down by 1
    # each step.
    self.assertEqual(backend.eval(layer.v), 0)

    model.fit(dataset)
    self.assertEqual(backend.eval(layer.v), -1)

    # There have been two steps without NaNs, so the loss scale will double
    backend.set_value(expected_gradient,
                      backend.get_value(expected_gradient * 2))
    model.fit(dataset)
    self.assertEqual(backend.eval(layer.v), -2)

    # Next test with NaN gradients.
    backend.set_value(have_nan_gradients, True)
    model.fit(dataset)
    # Variable should not be updated
    self.assertEqual(backend.eval(layer.v), -2)

    # Test with finite gradients again
    backend.set_value(have_nan_gradients, False)
    # The loss scale will be halved due to the NaNs, so the gradient will also
    # be halved
    backend.set_value(expected_gradient,
                      backend.get_value(expected_gradient / 2))
    model.fit(dataset)
    self.assertEqual(backend.eval(layer.v), -3)
def on_epoch_begin(self, epoch, logs=None):
    """Print the current KL-divergence weight when verbose output is on."""
    if not self.verbose_ > 0:
        return
    kl_weight = K.get_value(self.scale_)
    print("KL Divergence weight: %.3f" % kl_weight)
def on_epoch_end(self, epoch, logs=None):
    """Print the optimizer's learning rate at the end of the epoch."""
    optimizer = self.model.optimizer
    lr = float(K.get_value(optimizer.lr))
    message = f"\nEnd epoch {epoch + 1}| LR={lr: 0.08f}\n\n"
    print(message)
def on_train_batch_end(self, batch, logs=None):
    """Raise the learning rate exponentially after every training batch.

    The rate is ``1e-8 * 10 ** (global_batch / 20)``, i.e. it grows tenfold
    every 20 batches, where ``global_batch`` counts batches across epochs.

    Args:
        batch: Index of the batch within the current epoch.
        logs: Unused.

    Raises:
        TypeError: If the callback params carry no ``'steps'`` entry, so the
            global batch index cannot be computed.
    """
    steps = self.params.get('steps', None)
    if steps is None:
        # The original expression `epoch * None` failed with an opaque
        # TypeError; keep the exception type but say what is missing.
        raise TypeError(
            "callback params have no 'steps' value; cannot compute the "
            "global batch index")
    # Number of finished epochs, taken from the model's History callback.
    epoch = len(self.model.history.epoch)
    batch_id = epoch * steps + batch
    new_lr = 1e-8 * 10**(batch_id / 20)
    # new_lr is a plain Python float, so it is handed to set_value directly;
    # K.get_value() is meant for backend variables/tensors.
    K.set_value(self.model.optimizer.lr, new_lr)
    print(f"\n...Training: end of batch {batch} LR->{new_lr: 0.09f}\n\n")
def on_epoch_begin(self, step, log=None):
    """Set the optimizer's weight decay proportional to its learning rate.

    The new weight decay is ``self.wd_m`` times the current learning rate.
    Nothing happens (and nothing is printed) while no model is attached, so
    the report below can never reference an unset ``wd``.

    Args:
        step: Zero-based epoch index; reported one-based in the message.
        log: Unused.
    """
    if self.model is None:
        return
    optimizer = self.model.optimizer
    wd = self.wd_m * K.get_value(optimizer.lr)
    K.set_value(optimizer.weight_decay, wd)
    print("Weight decay for iter {} is {}".format(step + 1, wd))
def on_train_begin(self, logs=None):
    """Store the optimizer's current iteration count in ``self.step``."""
    optimizer = self.model.optimizer
    iterations = K.get_value(optimizer.iterations)
    self.step = iterations
def test_save_model_with_dynamic_loss_scaling(
        self, strategy_fn, h5=False, use_v1_loss_scale_optimizer=False):
    """Checks saving/loading a model whose optimizer uses dynamic loss scaling.

    Trains for three steps, saves the whole model, trains one more step, then
    reloads and verifies the weight matches the value at save time and that
    the optimizer is restored as a LossScaleOptimizer.

    Args:
      strategy_fn: Callable returning the distribution strategy to test under.
      h5: If True save in 'h5' format, otherwise in 'tf' format.
      use_v1_loss_scale_optimizer: If True wrap the optimizer with
        LossScaleOptimizerV1 and an explicit DynamicLossScale.
    """
    # TODO(reedwm): Support and test saving model with a mixed_[b]float16 policy
    # as well.
    strategy = strategy_fn()
    if (isinstance(strategy, mirrored_strategy.MirroredStrategy)
            and not context.executing_eagerly()):
        # TODO(b/121381184): Enable running the test in this case.
        return

    # Create and run model.
    with strategy.scope():
        x = layers.Input(shape=(2, ), batch_size=2, dtype=dtypes.float32)
        y = mp_test_util.MultiplyLayer()(x)
        model = models.Model(inputs=x, outputs=y)

        opt = gradient_descent.SGD(1.)
        if use_v1_loss_scale_optimizer:
            loss_scale = loss_scale_module.DynamicLossScale(
                initial_loss_scale=1., increment_period=2.)
            opt = loss_scale_optimizer.LossScaleOptimizerV1(
                opt, loss_scale)
        else:
            opt = loss_scale_optimizer.LossScaleOptimizer(
                opt, initial_scale=1., dynamic_growth_steps=2.)
        model.compile(optimizer=opt,
                      loss='mse',
                      run_eagerly=testing_utils.should_run_eagerly())
    # Run for 3 steps (6 examples with a batch size of 2)
    model.fit(np.ones((6, 2)), np.zeros((6, 2)), batch_size=2)
    # With growth every 2 steps: the scale doubled once (1 -> 2) and one
    # further finite step has been counted since.
    self.assertEqual(backend.get_value(opt.loss_scale), 2)
    self.assertEqual(backend.get_value(opt.dynamic_counter), 1)
    (weight, ) = model.trainable_weights
    orig_weight = backend.get_value(weight)

    # Save model weights.
    save_path = os.path.join(self.get_temp_dir(), 'model')
    model.save(save_path, save_format='h5' if h5 else 'tf')

    # Run model again for 1 step (2 examples with a batch size of 2)
    model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2)
    new_weight = backend.get_value(weight)
    self.assertNotEqual(new_weight, orig_weight)
    # The fourth step completes another growth period: scale 2 -> 4 and the
    # counter resets.
    self.assertEqual(backend.get_value(opt.loss_scale), 4)
    self.assertEqual(backend.get_value(opt.dynamic_counter), 0)

    # Load model weights and ensure loss scale weights are restored.
    model = save.load_model(
        save_path,
        custom_objects={'MultiplyLayer': mp_test_util.MultiplyLayer})
    (weight, ) = model.trainable_weights
    loaded_weight = backend.get_value(weight)
    self.assertEqual(loaded_weight, orig_weight)
    # Currently the loss scale isn't always saved when the model is saved with
    # Model.save(). So we assert the loss scale either has the value when it was
    # saved, or the value it was initialized with.
    # TODO(reedwm): Always save/restore the loss scale with Model.save().
    self.assertIn(backend.get_value(model.optimizer.loss_scale), (1, 2))
    self.assertIn(backend.get_value(model.optimizer.dynamic_counter),
                  (0, 1))

    # Test optimizer attributes and type
    self.assertEqual(model.optimizer.initial_scale, 1.)
    self.assertEqual(model.optimizer.dynamic_growth_steps, 2.)
    self.assertEqual(type(model.optimizer),
                     loss_scale_optimizer.LossScaleOptimizer)
def __call__(self, shape, dtype=None, partition_info=None):
    """Return a `shape`-sized array of ones scaled by the prior-probability bias.

    The bias is -log((1 - p) / p) with p = self.probability (foreground).
    """
    p = self.probability
    odds_against = (1 - p) / p
    bias = -K.log(odds_against)
    ones = K.get_value(K.ones(shape, dtype=dtype))
    return ones * bias
def train(self, reranker, train_dataset, train_output_path, dev_data, dev_output_path, qrels, metric, relevance_level=1):
    """Train `reranker` with a custom distributed loop and validate periodically.

    Two Adam optimizers are used: one (learning rate ``config["lr"]``) for the
    classifier and any non-BERT variables, and one (``config["bertlr"]``, with
    warmup/decay applied every step through ``self.change_lr``) for
    BERT/ELECTRA variables. Under AMP on non-TPU devices the second optimizer
    is wrapped in a LossScaleOptimizer.

    Args:
        reranker: Object providing ``build_model()`` and ``model``.
        train_dataset: Training data converted to TF records.
        train_output_path: Directory for weights, loss and metric files.
        dev_data: Validation data, also used to map predictions back to
            TREC format.
        dev_output_path: Directory that receives the best dev run.
        qrels: Relevance judgements used for evaluation.
        metric: Name of the metric that decides which weights are "best".
        relevance_level: Relevance threshold passed to the evaluator.

    Returns:
        The dev predictions (TREC format) of the last validation that ran, or
        an empty dict if no validation took place.
    """
    if self.tpu:
        # WARNING: not sure if pathlib is compatible with gs://
        train_output_path = Path(
            "{0}/{1}/{2}".format(
                self.config["storage"], "train_output", hashlib.md5(str(train_output_path).encode("utf-8")).hexdigest()
            )
        )

    dev_best_weight_fn, weights_output_path, info_output_path, loss_fn, metric_fn = self.get_paths_for_early_stopping(
        train_output_path, dev_output_path
    )

    train_records = self.get_tf_train_records(reranker, train_dataset)
    dev_records = self.get_tf_dev_records(reranker, dev_data)
    dev_dist_dataset = self.strategy.experimental_distribute_dataset(dev_records)

    # Does not vary much from https://www.tensorflow.org/tutorials/distribute/custom_training
    strategy_scope = self.strategy.scope()
    with strategy_scope:
        reranker.build_model()
        wrapped_model = self.get_wrapped_model(reranker.model)
        loss_object = self.get_loss(self.config["loss"])
        optimizer_1 = tf.keras.optimizers.Adam(learning_rate=self.config["lr"])
        optimizer_2 = tf.keras.optimizers.Adam(learning_rate=self.config["bertlr"])

        # "You should remove the use of the LossScaleOptimizer when TPUs are used."
        use_loss_scaling = self.amp and not self.tpu
        if use_loss_scaling:
            optimizer_2 = mixed_precision.LossScaleOptimizer(optimizer_2, loss_scale="dynamic")

        def compute_loss(labels, predictions):
            # Average over the global batch so per-replica losses sum correctly.
            per_example_loss = loss_object(labels, predictions)
            return tf.nn.compute_average_loss(per_example_loss, global_batch_size=self.config["batch"])

        def is_bert_variable(name):
            return "bert" in name or "electra" in name

        def train_step(inputs):
            data, labels = inputs

            with tf.GradientTape() as tape:
                train_predictions = wrapped_model(data, training=True)
                loss = compute_loss(labels, train_predictions)
                # Differentiate the scaled loss under AMP, but keep the true
                # loss for reporting.
                loss_for_grads = optimizer_2.get_scaled_loss(loss) if use_loss_scaling else loss

            gradients = tape.gradient(loss_for_grads, wrapped_model.trainable_variables)
            if use_loss_scaling:
                # get_unscaled_gradients() returns new tensors; its result must
                # be kept, otherwise still-scaled gradients get applied.
                gradients = optimizer_2.get_unscaled_gradients(gradients)

            bert_variables = [
                (gradients[i], variable)
                for i, variable in enumerate(wrapped_model.trainable_variables)
                if is_bert_variable(variable.name) and "classifier" not in variable.name
            ]
            classifier_vars = [
                (gradients[i], variable)
                for i, variable in enumerate(wrapped_model.trainable_variables)
                if "classifier" in variable.name
            ]
            other_vars = [
                (gradients[i], variable)
                for i, variable in enumerate(wrapped_model.trainable_variables)
                if not is_bert_variable(variable.name) and "classifier" not in variable.name
            ]

            # Making sure that we did not miss any variables
            assert len(bert_variables) + len(classifier_vars) + len(other_vars) == len(wrapped_model.trainable_variables)
            # TODO: Clean this up for general use
            optimizer_1.apply_gradients(classifier_vars)
            optimizer_2.apply_gradients(bert_variables)
            if other_vars:
                optimizer_1.apply_gradients(other_vars)

            return loss

        def test_step(inputs):
            data, _ = inputs
            predictions = wrapped_model.predict_step(data)

            return predictions

        @tf.function
        def distributed_train_step(dataset_inputs):
            per_replica_losses = self.strategy.run(train_step, args=(dataset_inputs,))

            return self.strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

        @tf.function
        def distributed_test_step(dataset_inputs):
            return self.strategy.run(test_step, args=(dataset_inputs,))

        train_records = train_records.shuffle(100000)
        train_dist_dataset = self.strategy.experimental_distribute_dataset(train_records)

        initial_iter, metrics = (
            self.fastforward_training(wrapped_model, weights_output_path, loss_fn, metric_fn)
            if self.config["fastforward"]
            else (0, {})
        )
        dev_best_metric = metrics.get(metric, -np.inf)
        logger.info("starting training from iteration %s/%s", initial_iter + 1, self.config["niters"])
        logger.info(f"Best metric loaded: {metric}={dev_best_metric}")

        cur_step = initial_iter * self.n_batch_per_iter
        initial_lr = self.change_lr(step=cur_step, lr=self.config["bertlr"])
        K.set_value(optimizer_2.lr, K.get_value(initial_lr))
        train_loss = self.load_loss_file(loss_fn) if initial_iter > 0 else []
        if 0 < initial_iter < self.config["niters"]:
            # Skip the batches already consumed before the restart.
            self.exhaust_used_train_data(train_dist_dataset, n_batch_to_exhaust=initial_iter * self.n_batch_per_iter)

        niter = initial_iter
        total_loss = 0
        trec_preds = {}
        iter_bar = tqdm(desc="Training iteration", total=self.n_batch_per_iter)
        # Goes through the dataset ONCE (i.e niters * itersize).
        # However, the dataset may already contain multiple instances of the same sample,
        # depending upon what Sampler was used.
        # If you want multiple epochs, achieve it by tweaking the niters and itersize values.
        for x in train_dist_dataset:
            total_loss += distributed_train_step(x)
            cur_step += 1
            iter_bar.update(1)

            # Do warmup and decay
            new_lr = self.change_lr(step=cur_step, lr=self.config["bertlr"])
            K.set_value(optimizer_2.lr, K.get_value(new_lr))

            if cur_step % self.n_batch_per_iter == 0:
                niter += 1

                iter_bar.close()
                iter_bar = tqdm(total=self.n_batch_per_iter)
                train_loss.append(total_loss / self.n_batch_per_iter)
                logger.info("iter={} loss = {}".format(niter, train_loss[-1]))
                self.write_to_loss_file(loss_fn, train_loss)
                total_loss = 0

                if self.config["fastforward"]:
                    wrapped_model.save_weights(f"{weights_output_path}/{niter}")

                if niter % self.config["validatefreq"] == 0:
                    dev_predictions = []
                    for dev_batch in tqdm(dev_dist_dataset, desc="validation"):
                        # With several replicas the result is a per-replica
                        # container; with one replica it is a single tensor.
                        pred_batch = (
                            distributed_test_step(dev_batch).values
                            if self.strategy.num_replicas_in_sync > 1
                            else [distributed_test_step(dev_batch)]
                        )
                        for p in pred_batch:
                            dev_predictions.extend(p)

                    trec_preds = self.get_preds_in_trec_format(dev_predictions, dev_data)
                    metrics = evaluator.eval_runs(trec_preds, dict(qrels), evaluator.DEFAULT_METRICS, relevance_level)
                    logger.info("dev metrics: %s", " ".join([f"{name}={v:0.3f}" for name, v in sorted(metrics.items())]))
                    if metrics[metric] > dev_best_metric:
                        dev_best_metric = metrics[metric]
                        logger.info("new best dev metric: %0.4f", dev_best_metric)

                        self.write_to_metric_file(metric_fn, metrics)
                        wrapped_model.save_weights(dev_best_weight_fn)
                        Searcher.write_trec_run(trec_preds, outfn=(dev_output_path / "best").as_posix())

            if cur_step >= self.config["niters"] * self.n_batch_per_iter:
                break

        return trec_preds
def on_epoch_end(self, epoch, logs=None):
    """Record the optimizer's current learning rate under ``logs['lr']``.

    Args:
        epoch: Index of the epoch that just ended (unused).
        logs: Metrics dict supplied by the training loop. It is updated in
            place; a fresh dict is only created when none is supplied.
    """
    # `logs or {}` would swap a caller's *empty* dict for a new one and the
    # 'lr' entry would never reach the caller, so test for None explicitly.
    if logs is None:
        logs = {}
    logs['lr'] = K.get_value(self.model.optimizer.lr)
def on_batch_end(self, batch, logs=None):
    """Print the global L2 norm of the optimizer's weights after each batch.

    Args:
        batch: Index of the batch that just ended (unused).
        logs: Unused.
    """
    # Norm clipping:
    # An L2 norm is the square root of the sum of *squares*; summing the raw
    # values is not a norm and makes math.sqrt raise once the sum is negative.
    squared_sum = sum(
        np.sum(np.square(np.asarray(K.get_value(w), dtype=np.float64)))
        for w in self.model.optimizer.weights)
    print(str(math.sqrt(squared_sum)) + '\n')
    return
def change_params(epoch, logs):
    """Adjust the `beta` and `alpha` backend variables on an epoch schedule.

    `beta` is increased by 2e-5 on every epoch up to and including 5, and
    `alpha` is set to 0.0 at epoch 30.
    """
    ramp_beta = epoch <= 5 and epoch % 1 == 0
    if ramp_beta:
        current_beta = K.get_value(beta)
        K.set_value(beta, current_beta + 2e-5)
    if epoch == 30:
        K.set_value(alpha, 0.0)
def set_vocabulary(self,
                   vocab,
                   df_data=None,
                   oov_df_value=None,
                   append=False):
    """Sets vocabulary (and optionally document frequency) data for this layer.

    This method sets the vocabulary and DF data for this layer directly, instead
    of analyzing a dataset through 'adapt'. It should be used whenever the vocab
    (and optionally document frequency) information is already known. If
    vocabulary data is already present in the layer, this method will either
    replace it, if 'append' is set to False, or append to it (if 'append' is set
    to True).

    Arguments:
      vocab: An array of string tokens.
      df_data: An array of document frequency data. Only necessary if the layer
        output_mode is TFIDF.
      oov_df_value: The document frequency of the OOV token. Only necessary if
        output_mode is TFIDF. OOV data is optional when appending additional
        data in TFIDF mode; if an OOV value is supplied it will overwrite the
        existing OOV value.
      append: If True, add the new tokens after any existing vocabulary; if
        False (the default), replace the existing vocabulary.

    Raises:
      ValueError: If there are too many inputs, the inputs do not match, or
        input data is missing.
    """
    current_table_size = self._get_table_size()
    total_vocab_size = len(vocab) + (current_table_size if append else 0)
    if self._max_tokens is not None and total_vocab_size > self._max_vocab_size:
        raise ValueError(
            "Attempted to set a vocabulary larger than the maximum vocab size. "
            "Passed vocab size is %s, max vocab size is %s. Note that the OOV "
            "token is automatically added to the number of tokens." %
            (total_vocab_size, self._max_vocab_size))

    # We're only _really_ appending if the table_size is nonzero. This is
    # important for some sanity checks in tfidf mode (specifically, checking if
    # oov_df_value is set or not) and handling existing tfidf weight data.
    append = append if current_table_size > 0 else False

    # Validate the df_data / oov_df_value combination for the output mode
    # before touching the table.
    if self._output_mode == TFIDF:
        if df_data is None:
            raise ValueError("df_data must be set if output_mode is TFIDF")
        if len(vocab) != len(df_data):
            raise ValueError("df_data must be the same length as vocab. "
                             "len(df_data) is %s, len(vocab) is %s" %
                             (len(vocab), len(df_data)))
        if not append and oov_df_value is None:
            raise ValueError(
                "You must pass an oov_df_value the first time "
                "'set_vocabulary' is called when output_mode is "
                "TFIDF.")
    else:
        if df_data is not None:
            raise ValueError(
                "df_data should only be set if output_mode is TFIDF. "
                "output_mode is %s." % self._output_mode)

    # New token ids start after the reserved values and, when appending,
    # after the ids already present in the table.
    start_index = self._reserved_values + (self._get_table_size()
                                           if append else 0)
    values = np.arange(start_index,
                       len(vocab) + start_index,
                       dtype=np.int64)

    vocab = self._convert_to_ndarray(vocab)
    self._assert_same_type(dtypes.string, vocab, "vocab")

    values = self._convert_to_ndarray(values)
    self._assert_same_type(dtypes.int64, values, "values")

    # When replacing, drop the old table contents before inserting.
    if not append and self._vocab_size > 0:
        self._clear_table()
    self._insert_table_data(vocab, values)

    # When doing raw or integer output, we don't have a Vectorize layer to
    # manage. In this case, we can return directly.
    if self._output_mode in [None, INT]:
        return

    if not self._pad_to_max or self._max_tokens is None:
        num_tokens = total_vocab_size + self._reserved_values
        self._vectorize_layer.set_num_elements(num_tokens)

    if self._output_mode == TFIDF:
        df_data = self._convert_to_ndarray(df_data)
        if append:
            # The existing IDF data is stored in a Keras weight, so we can get it
            # by calling K.get_value() on the weight object. Take the first
            # table_size+1 values in case we're padding the weight with zeros
            existing_df_data = K.get_value(
                self._vectorize_layer.tf_idf_weights)[:current_table_size + 1]
            df_data = np.append(existing_df_data, df_data, axis=0)
            # If we are appending and need to replace the OOV DF value, we can
            # assign it over the existing OOV DF value at index 0 of the (already-
            # concatenated) DF value array.
            if oov_df_value is not None:
                df_data[0] = oov_df_value
        else:
            # If we are not appending (that is, we have only new data) we need to
            # insert the OOV value to the front of the array. (This is a append to
            # the head, not a replacement of the zeroth value.)
            if not isinstance(oov_df_value, np.ndarray):
                oov_df_value = np.array([oov_df_value])
            df_data = np.insert(df_data, 0, oov_df_value)
        self._vectorize_layer.set_tfidf_data(df_data)
def get_config(self):
    """Return the config dict: evaluated value of `a`, plus `b` and `name`."""
    config = {}
    config['a'] = backend.get_value(self.a)
    config['b'] = self.b
    config['name'] = self.name
    return config
def value(self):
    """Return the evaluated `param` of the last layer."""
    last_layer = self.layers[-1]
    return K.get_value(last_layer.param)
batch_size=batch_size, validation_split=0.1, epochs=epochs, verbose=1, callbacks=callbacks_list, shuffle=True) # load the saved best model weights asr.act_model.load_weights('best_model.hdf5') # predict outputs on validation images prediction = asr.act_model.predict(data.x_test[:10]) # use CTC decoder out = K.get_value( K.ctc_decode(prediction, input_length=np.ones(prediction.shape[0]) * prediction.shape[1], greedy=True)[0][0]) # see the results i = 0 for x in out: print("original_text = ", data.ph_org_test[i]) print("predicted text = ", end='') for p in x: if int(p) != -1: print("'" + data.phonemes[int(p)] + "', ", end='') print('\n') i += 1
def train(args, model, config):
    """Run the training loop, with per-epoch evaluation, for a wide & deep model.

    The linear ("wide") part is optimized with Ftrl and the DNN ("deep") part
    with RMSprop. Unless ``args.cpu`` is set, gradients and evaluation metrics
    are synchronized through Horovod (``hvd``).

    Args:
        args: Run options. Attributes read here: ``model_dir``,
            ``deep_learning_rate``, ``linear_learning_rate``,
            ``use_checkpoint``, ``benchmark``, ``benchmark_warmup_steps``,
            ``benchmark_steps``, ``amp``, ``cpu``, ``num_epochs`` and
            ``global_batch_size``.
        model: Callable model exposing ``linear_model`` and ``dnn_model``
            sub-models.
        config: Mapping providing ``train_dataset``, ``eval_dataset`` and
            ``steps_per_epoch``.

    Returns:
        None. In benchmark mode the function returns as soon as
        ``args.benchmark_steps`` steps have been measured.
    """
    logger = logging.getLogger('tensorflow')

    train_dataset = config['train_dataset']
    eval_dataset = config['eval_dataset']
    steps = int(config['steps_per_epoch'])

    schedule = get_schedule(args=args, steps_per_epoch=steps)
    writer = tf.summary.create_file_writer(
        os.path.join(args.model_dir, 'event_files'))

    deep_optimizer = tf.keras.optimizers.RMSprop(
        learning_rate=args.deep_learning_rate, rho=0.5)
    wide_optimizer = tf.keras.optimizers.Ftrl(
        learning_rate=args.linear_learning_rate)

    compiled_loss = tf.keras.losses.BinaryCrossentropy()
    eval_loss = tf.keras.metrics.Mean()
    metrics = [tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.AUC()]

    current_step_var = tf.Variable(0, trainable=False, dtype=tf.int64)
    # Accumulators for the streaming MAP computed in evaluation_step.
    display_id_counter = tf.Variable(0., trainable=False, dtype=tf.float64)
    streaming_map = tf.Variable(0., name='STREAMING_MAP', trainable=False,
                                dtype=tf.float64)

    checkpoint = tf.train.Checkpoint(deep_optimizer=deep_optimizer,
                                     wide_optimizer=wide_optimizer,
                                     model=model,
                                     current_step=current_step_var)
    manager = tf.train.CheckpointManager(
        checkpoint=checkpoint,
        directory=os.path.join(args.model_dir, 'checkpoint'),
        max_to_keep=1)

    if args.use_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        if manager.latest_checkpoint:
            logger.warning(f'Model restored from checkpoint {args.model_dir}')
            if args.benchmark:
                # Benchmark timing is keyed on the step counter, so restart it.
                current_step_var.assign(0)
        else:
            logger.warning(
                f'Failed to restore model from checkpoint {args.model_dir}')

    if args.amp:
        deep_optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
            deep_optimizer, loss_scale='dynamic')
        wide_optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
            wide_optimizer, loss_scale='dynamic')

    @tf.function
    def train_step(x, y, first_batch):
        """Apply one optimization step to both sub-models and return the loss."""
        # The tape is persistent because it is queried twice (wide and deep).
        with tf.GradientTape(persistent=True) as tape:
            y_pred = model(x, training=True)
            loss = compiled_loss(y, y_pred)
            linear_loss = wide_optimizer.get_scaled_loss(
                loss) if args.amp else loss
            deep_loss = deep_optimizer.get_scaled_loss(
                loss) if args.amp else loss

        if not args.cpu:
            tape = hvd.DistributedGradientTape(tape)

        for metric in metrics:
            metric.update_state(y, y_pred)

        linear_vars = model.linear_model.trainable_variables
        dnn_vars = model.dnn_model.trainable_variables
        linear_grads = tape.gradient(linear_loss, linear_vars)
        dnn_grads = tape.gradient(deep_loss, dnn_vars)
        if args.amp:
            linear_grads = wide_optimizer.get_unscaled_gradients(linear_grads)
            dnn_grads = deep_optimizer.get_unscaled_gradients(dnn_grads)

        wide_optimizer.apply_gradients(zip(linear_grads, linear_vars))
        deep_optimizer.apply_gradients(zip(dnn_grads, dnn_vars))
        if first_batch and not args.cpu:
            # Broadcast after the first step so optimizer slots already exist.
            hvd.broadcast_variables(model.linear_model.variables, root_rank=0)
            hvd.broadcast_variables(model.dnn_model.variables, root_rank=0)
            hvd.broadcast_variables(wide_optimizer.variables(), root_rank=0)
            hvd.broadcast_variables(deep_optimizer.variables(), root_rank=0)
        return loss

    @tf.function
    def evaluation_step(x, y):
        """Update metrics and the streaming MAP accumulators for one batch."""
        predictions = model(x, training=False)
        loss = compiled_loss(y, predictions)

        for metric in metrics:
            metric.update_state(y, predictions)

        predictions = tf.reshape(predictions, [-1])
        predictions = tf.cast(predictions, tf.float64)
        display_ids = x[DISPLAY_ID_COLUMN]
        display_ids = tf.reshape(display_ids, [-1])
        labels = tf.reshape(y, [-1])

        # Sort by display id so that rows of the same display are contiguous.
        sorted_ids = tf.argsort(display_ids)
        display_ids = tf.gather(display_ids, indices=sorted_ids)
        predictions = tf.gather(predictions, indices=sorted_ids)
        labels = tf.gather(labels, indices=sorted_ids)

        _, display_ids_idx, display_ids_ads_count = tf.unique_with_counts(
            display_ids, out_idx=tf.int64)
        # NOTE(review): 30 is presumably an upper bound on rows per display
        # id; tf.pad fails if a display has more -- confirm against the data.
        pad_length = 30 - tf.reduce_max(display_ids_ads_count)

        preds = tf.RaggedTensor.from_value_rowids(
            predictions, display_ids_idx).to_tensor()
        labels = tf.RaggedTensor.from_value_rowids(
            labels, display_ids_idx).to_tensor()

        # Keep only the displays that contain at least one positive label.
        labels_mask = tf.math.reduce_max(labels, 1)
        preds_masked = tf.boolean_mask(preds, labels_mask)
        labels_masked = tf.boolean_mask(labels, labels_mask)
        labels_masked = tf.argmax(labels_masked, axis=1, output_type=tf.int32)
        labels_masked = tf.reshape(labels_masked, [-1, 1])

        preds_masked = tf.pad(preds_masked, [(0, 0), (0, pad_length)])
        _, predictions_idx = tf.math.top_k(preds_masked, 12)
        indices = tf.math.equal(predictions_idx, labels_masked)
        indices_mask = tf.math.reduce_any(indices, 1)
        masked_indices = tf.boolean_mask(indices, indices_mask)

        # Reciprocal rank of the positive item within the top 12.
        res = tf.argmax(masked_indices, axis=1)
        ap_matrix = tf.divide(1, tf.add(res, 1))
        ap_sum = tf.reduce_sum(ap_matrix)
        shape = tf.cast(tf.shape(indices)[0], tf.float64)
        display_id_counter.assign_add(shape)
        streaming_map.assign_add(ap_sum)
        return loss

    t0 = None
    t_batch = None
    # Stays None until an evaluation pass has produced results, so the final
    # summary log below cannot hit an unbound name.
    eval_data = None

    with writer.as_default():
        for epoch in range(1, args.num_epochs + 1):
            for step, (x, y) in enumerate(train_dataset):
                # np.asscalar was removed from NumPy; int() yields the same
                # Python integer from the int64 scalar.
                current_step = int(current_step_var.numpy())
                schedule(optimizer=deep_optimizer, current_step=current_step)

                for metric in metrics:
                    metric.reset_states()
                loss = train_step(x, y, epoch == 1 and step == 0)
                if args.cpu or hvd.rank() == 0:
                    for metric in metrics:
                        tf.summary.scalar(f'{metric.name}', metric.result(),
                                          step=current_step)
                    tf.summary.scalar('loss', loss, step=current_step)
                    tf.summary.scalar('schedule',
                                      K.get_value(deep_optimizer.lr),
                                      step=current_step)
                    writer.flush()

                if args.benchmark:
                    boundary = max(args.benchmark_warmup_steps, 1)
                    if current_step == boundary:
                        t0 = time.time()
                    if current_step > boundary:
                        batch_time = time.time() - t_batch
                        samplesps = args.global_batch_size / batch_time
                        dllogger.log(data={'batch_samplesps': samplesps},
                                     step=(1, current_step))

                        if args.benchmark_steps <= current_step:
                            train_time = time.time() - t0
                            epochs = args.benchmark_steps - max(
                                args.benchmark_warmup_steps, 1)
                            train_throughput = (args.global_batch_size *
                                                epochs) / train_time
                            dllogger.log(
                                data={'train_throughput': train_throughput},
                                step=tuple())
                            return
                else:
                    if current_step % 100 == 0:
                        train_data = {
                            metric.name: f'{metric.result().numpy():.4f}'
                            for metric in metrics
                        }
                        train_data['loss'] = f'{loss.numpy():.4f}'
                        dllogger.log(data=train_data,
                                     step=(current_step,
                                           args.num_epochs * steps))

                    if step == steps:
                        break

                current_step_var.assign_add(1)
                t_batch = time.time()

            # Benchmark runs measure training throughput only.
            if args.benchmark:
                continue

            for metric in metrics:
                metric.reset_states()
            eval_loss.reset_states()

            for step, (x, y) in enumerate(eval_dataset):
                loss = evaluation_step(x, y)
                eval_loss.update_state(loss)

            map_metric = tf.divide(streaming_map, display_id_counter)
            if not args.cpu:
                map_metric = hvd.allreduce(map_metric)
            map_metric = map_metric.numpy()

            eval_loss_reduced = eval_loss.result()
            if not args.cpu:
                eval_loss_reduced = hvd.allreduce(eval_loss_reduced)

            metrics_reduced = {
                f'{metric.name}_val':
                metric.result() if args.cpu else hvd.allreduce(metric.result())
                for metric in metrics
            }

            for name, result in metrics_reduced.items():
                tf.summary.scalar(f'{name}', result, step=steps * epoch)
            tf.summary.scalar('loss_val', eval_loss_reduced,
                              step=steps * epoch)
            tf.summary.scalar('map_val', map_metric, step=steps * epoch)
            writer.flush()

            eval_data = {
                name: f'{result.numpy():.4f}'
                for name, result in metrics_reduced.items()
            }
            eval_data.update({
                'loss_val': f'{eval_loss_reduced.numpy():.4f}',
                'streaming_map_val': f'{map_metric:.4f}'
            })
            dllogger.log(data=eval_data,
                         step=(steps * epoch, args.num_epochs * steps))

            if args.cpu or hvd.rank() == 0:
                manager.save()

            # Reset the streaming MAP accumulators for the next epoch.
            display_id_counter.assign(0)
            streaming_map.assign(0)

        # Final summary: only meaningful if at least one evaluation ran.
        if eval_data is not None and (args.cpu or hvd.rank() == 0):
            dllogger.log(data=eval_data, step=tuple())
def set_vocabulary(self, vocab, df_data=None, oov_df_value=None, append=False):
    """Sets vocabulary (and optionally document frequency) data for this layer.

    This method sets the vocabulary and DF data for this layer directly,
    instead of analyzing a dataset through 'adapt'. It should be used whenever
    the vocab (and optionally document frequency) information is already
    known. If vocabulary data is already present in the layer, this method
    will either replace it, if 'append' is set to False, or append to it (if
    'append' is set to True).

    All argument validation happens before any layer state is modified, so a
    call that raises ValueError leaves the existing vocabulary untouched.

    Arguments:
      vocab: An array of string tokens.
      df_data: An array of document frequency data. Only necessary if the
        layer output_mode is TFIDF.
      oov_df_value: The document frequency of the OOV token. Only necessary
        if output_mode is TFIDF. OOV data is optional when appending
        additional data in TFIDF mode; if an OOV value is supplied it will
        overwrite the existing OOV value.
      append: Whether to overwrite or append any existing vocabulary data.

    Raises:
      ValueError: If there are too many inputs, the inputs do not match, or
        input data is missing.
      RuntimeError: If the vocabulary cannot be set when this function is
        called. This happens in "binary", "count", and "tfidf" modes, if
        "pad_to_max_tokens" is False and the layer itself has already been
        called.
    """
    if self._output_mode != TFIDF and df_data is not None:
        raise ValueError(
            "df_data should only be set if output_mode is TFIDF. "
            "output_mode is %s." % self._output_mode)

    if (self._output_mode in [BINARY, COUNT, TFIDF] and self._called and
            not self._pad_to_max):
        raise RuntimeError(
            ("When using TextVectorization in {mode} mode and "
             "pad_to_max_tokens is False, the vocabulary cannot "
             "be changed after the layer is "
             "called.").format(mode=self._output_mode))

    current_table_size = self._index_lookup_layer.vocab_size()

    # We're only _really_ appending if the table_size is nonzero. This is
    # important for some sanity checks in tfidf mode (specifically, checking
    # if oov_df_value is set or not) and handling existing tfidf weight data.
    # The caller's original `append` flag is still what gets forwarded to the
    # index lookup layer below.
    really_append = append if current_table_size > 0 else False

    # Validate the TFIDF arguments up front, before mutating the lookup table
    # or the vectorize layer.
    if self._output_mode == TFIDF:
        if df_data is None:
            raise ValueError("df_data must be set if output_mode is TFIDF")
        if len(vocab) != len(df_data):
            raise ValueError("df_data must be the same length as vocab. "
                             "len(df_data) is %s, len(vocab) is %s" %
                             (len(df_data), len(vocab)))
        if not really_append and oov_df_value is None:
            raise ValueError(
                "You must pass an oov_df_value the first time "
                "'set_vocabulary' is called when output_mode is "
                "TFIDF.")

    self._index_lookup_layer.set_vocabulary(vocab, append)

    # When doing raw or integer output, we don't have a Vectorize layer to
    # manage. In this case, we can return directly.
    if self._output_mode in [None, INT]:
        return

    if not self._pad_to_max or self._max_tokens is None:
        num_tokens = (self._index_lookup_layer.vocab_size() +
                      self._reserved_values)
        self._vectorize_layer.set_num_elements(num_tokens)

    if self._output_mode == TFIDF:
        df_data = self._convert_to_ndarray(df_data)
        if really_append:
            # The existing IDF data is stored in a Keras weight, so we can
            # get it by calling K.get_value() on the weight object. Take the
            # first table_size+1 values in case we're padding the weight with
            # zeros.
            existing_df_data = K.get_value(
                self._vectorize_layer.tf_idf_weights)[:current_table_size + 1]
            df_data = np.append(existing_df_data, df_data, axis=0)
            # If we are appending and need to replace the OOV DF value, we
            # can assign it over the existing OOV DF value at index 0 of the
            # (already-concatenated) DF value array.
            if oov_df_value is not None:
                df_data[0] = oov_df_value
        else:
            # If we are not appending (that is, we have only new data) we
            # need to insert the OOV value at the front of the array. (This
            # is an insertion at the head, not a replacement of the zeroth
            # value.)
            if not isinstance(oov_df_value, np.ndarray):
                oov_df_value = np.array([oov_df_value])
            df_data = np.insert(df_data, 0, oov_df_value)
        self._vectorize_layer.set_tfidf_data(df_data)
def call(self, inputs):
    """Encode `inputs` according to the layer's output mode.

    Marks the layer as called, then produces a binary, count or TF-IDF
    encoding whose last dimension is `self._max_tokens` when that is set and
    the current value of `self.num_elements` otherwise.

    Arguments:
      inputs: Dense or sparse tensor of indices to encode (presumably integer
        token ids -- confirm against the caller).

    Returns:
      The encoded tensor; a sparse count tensor when `self._sparse` is set.

    Raises:
      ValueError: If `sparse=True` is combined with a non-count output mode,
        or if the output mode is not recognized.
    """
    self._called = True
    out_depth = (K.get_value(self.num_elements)
                 if self._max_tokens is None else self._max_tokens)

    if self._sparse:
        if self._output_mode != COUNT:
            raise ValueError(
                "Only supports `sparse=True` when `output_mode` "
                ' is \"count\", got {}'.format(self._output_mode))
        inputs = self._convert_to_sparse_inputs(inputs)

        # Consider having sparse.one_hot
        # Append the token values to the indices as an extra coordinate, then
        # sum over axis 1 to obtain per-token counts.
        token_column = array_ops.expand_dims(
            math_ops.cast(inputs.values, dtypes.int64), axis=1)
        coords = array_ops.concat([inputs.indices, token_column], axis=1)
        ones = array_ops.ones_like(inputs.values, dtype=dtypes.int64)
        expanded_shape = array_ops.concat(
            [inputs.dense_shape, [out_depth]], axis=0)
        unreduced = sparse_tensor.SparseTensor(
            indices=coords, values=ones, dense_shape=expanded_shape)
        return sparse_ops.sparse_reduce_sum_v2(
            unreduced, axis=1, output_is_sparse=True)

    # A sparse input is densified with a default value of -1. Because -1 is
    # ignored by one_hot, the unset positions drop out of the encoding.
    if isinstance(inputs, sparse_tensor.SparseTensor):
        inputs = sparse_ops.sparse_tensor_to_dense(inputs, default_value=-1)

    if self._output_mode == BINARY:
        hot_flags = array_ops.one_hot(
            inputs, depth=out_depth, on_value=True, off_value=False)
        any_hit = math_ops.reduce_any(hot_flags, axis=1)
        binary_data = math_ops.cast(any_hit, dtypes.int64)
        binary_data.set_shape(tensor_shape.TensorShape((None, out_depth)))
        return binary_data

    counts = math_ops.reduce_sum(
        array_ops.one_hot(inputs, depth=out_depth), axis=1)
    if self._output_mode == COUNT:
        count_data = math_ops.cast(counts, dtypes.int64)
        count_data.set_shape(tensor_shape.TensorShape((None, out_depth)))
        return count_data

    tf_idf_data = math_ops.multiply(counts, self.tf_idf_weights)
    tf_idf_data.set_shape(tensor_shape.TensorShape((None, out_depth)))
    if self._output_mode == TFIDF:
        return tf_idf_data

    # We can only get here if we didn't recognize the passed mode.
    raise ValueError("Unknown output mode %s" % self._output_mode)
def iterator_predict_loop(model, inputs, steps, verbose=0):
    """Run eager-mode prediction over batches drawn from a dataset iterator.

    Arguments:
        model: Instance of `Model`.
        inputs: Input dataset iterator.
        steps: Total number of steps (batches of samples) to draw before the
            loop is considered finished.
        verbose: Verbosity mode; `1` shows a progress bar.

    Returns:
        Array of predictions (if the model has a single output) or list of
        arrays of predictions (if the model has multiple outputs).

    Raises:
        ValueError: In case of mismatch between given number of inputs and
          expectations of the model.
    """
    assert isinstance(inputs, iterator_ops.EagerIterator)

    element_shapes = inputs.output_shapes
    if not (isinstance(element_shapes, (list, tuple)) and
            len(element_shapes) <= 3):
        raise ValueError(
            'Please provide data as a list or tuple of 1, 2, or 3 elements '
            ' - `(input)`, or `(input, target)`, or `(input, target,'
            'sample_weights)`. Received %s. We do not use the `target` or'
            '`sample_weights` value here.' % inputs.output_shapes)

    show_progress = verbose == 1
    if show_progress:
        progbar = generic_utils.Progbar(target=steps)

    # One list of per-step batches for each model output.
    outs = []
    for step_index in range(steps):
        # Pull the next batch; stop early if the iterator is exhausted.
        try:
            next_element = inputs.get_next()
        except errors.OutOfRangeError:
            logging.warning(
                'Your dataset iterator ran out of data; interrupting prediction. '
                'Make sure that your dataset can generate at least `steps` batches '
                '(in this case, %d batches). You may need to use the repeat() '
                'function when building your dataset.', steps)
            break

        # The element is a tuple whose first entry holds the model inputs.
        x = next_element[0]

        # Validate and standardize data.
        x, _, _ = model._standardize_user_data(x)
        x = training_utils.cast_if_floating_dtype(x)

        if isinstance(x, list) and len(x) == 1:
            x = x[0]

        if model._expects_training_arg:
            batch_outs = model.call(x, training=False)
        else:
            batch_outs = model.call(x)
        if not isinstance(batch_outs, list):
            batch_outs = [batch_outs]

        # The number of samples is not known beforehand, so every step's
        # results are collected and concatenated once at the end. This is an
        # expensive process.
        if step_index == 0:
            outs.extend([] for _ in batch_outs)
        for collected, batch_out in zip(outs, batch_outs):
            collected.append(backend.get_value(batch_out))

        if show_progress:
            progbar.update(step_index + 1)

    merged = [np.concatenate(tuple(chunks), axis=0) for chunks in outs]
    return merged[0] if len(merged) == 1 else merged
def test_simple_with_other(self):
    """FilterDetections forwards extra ("other") inputs alongside detections.

    Two identical boxes are given; the expected output keeps a single
    detection (score 1, label 1) together with the matching rows of the two
    extra inputs, and fills every remaining slot with -1.
    """
    float_type = K.floatx()

    with self.test_session():
        # create simple FilterDetections layer
        layer = layers.FilterDetections()

        # create simple input
        boxes = K.constant(np.array(
            [[
                [0, 0, 10, 10],
                [0, 0, 10, 10],  # this will be suppressed
            ]],
            dtype=float_type))

        classification = K.constant(np.array(
            [[
                [0, 0.9],  # this will be suppressed
                [0, 1],
            ]],
            dtype=float_type))

        other_arrays = [
            np.array(
                [[
                    [0, 1234],  # this will be suppressed
                    [0, 5678],
                ]],
                dtype=float_type),
            np.array(
                [[
                    5678,  # this will be suppressed
                    1234,
                ]],
                dtype=float_type),
        ]
        other = [K.constant(arr) for arr in other_arrays]

        # compute output
        actual = layer.call([boxes, classification] + other)
        actual_boxes = K.get_value(actual[0])
        actual_scores = K.get_value(actual[1])
        actual_labels = K.get_value(actual[2])
        actual_other = [K.get_value(tensor) for tensor in actual[3:]]

        # define expected output: everything is -1 except the first slot
        expected_boxes = np.full((1, 300, 4), -1, dtype=float_type)
        expected_boxes[0, 0, :] = [0, 0, 10, 10]

        expected_scores = np.full((1, 300), -1, dtype=float_type)
        expected_scores[0, 0] = 1

        expected_labels = np.full((1, 300), -1, dtype=float_type)
        expected_labels[0, 0] = 1

        expected_pairs = np.full((1, 300, 2), -1, dtype=float_type)
        expected_pairs[0, 0, :] = [0, 5678]
        expected_scalars = np.full((1, 300), -1, dtype=float_type)
        expected_scalars[0, 0] = 1234
        expected_other = [expected_pairs, expected_scalars]

        # assert actual and expected are equal
        self.assertAllEqual(actual_boxes, expected_boxes)
        self.assertAllEqual(actual_scores, expected_scores)
        self.assertAllEqual(actual_labels, expected_labels)

        for got, want in zip(actual_other, expected_other):
            self.assertAllEqual(got, want)
def train(self, reranker, train_dataset, train_output_path, dev_data,
          dev_output_path, qrels, metric, relevance_level=1, init_path=None):
    """Train `reranker` with a custom distributed loop and validate periodically.

    Variables whose names mark them as belonging to a BERT-like encoder are
    updated with a separate optimizer (learning rate `config["bertlr"]`);
    all other variables use `config["lr"]`. An "epoch" is `config["itersize"]`
    batches; training stops after `config["niters"]` epochs or when the
    dataset is exhausted, whichever comes first.

    Args:
        reranker: Object providing `build_model()`, `model` and `config`.
        train_dataset: Training data passed to `get_tf_train_records`.
        train_output_path: Directory prefix for the best checkpoint
            (`<train_output_path>/dev.best`). When `self.tpu` is set it is
            replaced by a hashed path under `config["storage"]`.
        dev_data: Validation data passed to `get_tf_dev_records` and
            `get_preds_in_trec_format`.
        dev_output_path: Directory that is created if it does not exist.
        qrels: Relevance judgments passed to the evaluator.
        metric: Name of the metric used to select the best checkpoint.
        relevance_level: Relevance threshold forwarded to the evaluator.
        init_path: Optional weights file to initialize the model from.
    """
    if self.tpu:
        train_output_path = "{0}/{1}/{2}".format(
            self.config["storage"], "train_output",
            hashlib.md5(str(train_output_path).encode("utf-8")).hexdigest())

    os.makedirs(dev_output_path, exist_ok=True)
    start_epoch = self.config["niters"] if reranker.config.get(
        "modeltype", "") in ["nir", "cedr"] else 0
    train_records = self.get_tf_train_records(reranker, train_dataset)
    dev_records = self.get_tf_dev_records(reranker, dev_data)
    dev_dist_dataset = self.strategy.experimental_distribute_dataset(
        dev_records)

    # Does not vary much from
    # https://www.tensorflow.org/tutorials/distribute/custom_training
    strategy_scope = self.strategy.scope()
    with strategy_scope:
        reranker.build_model()
        wrapped_model = self.get_wrapped_model(reranker.model)
        if init_path:
            logger.info(f"Initializing model from checkpoint {init_path}")
            print("number of vars: ", len(wrapped_model.trainable_variables))
            wrapped_model.load_weights(init_path)

        loss_object = self.get_loss(self.config["loss"])
        optimizer_1 = tf.keras.optimizers.Adam(
            learning_rate=self.config["lr"])
        optimizer_2 = tf.keras.optimizers.Adam(
            learning_rate=self.config["bertlr"])

    def compute_loss(labels, predictions):
        """Average the per-example loss over the global batch size."""
        per_example_loss = loss_object(labels, predictions)
        return tf.nn.compute_average_loss(
            per_example_loss, global_batch_size=self.config["batch"])

    def is_bert_parameters(name):
        """Return True if the variable name belongs to a BERT-like encoder."""
        name = name.lower()
        return any(
            tag in name
            for tag in ("/bert/", "/electra/", "/roberta/", "/albert/"))

    def train_step(inputs):
        """Run one forward/backward pass and apply both optimizers."""
        data, labels = inputs

        with tf.GradientTape() as tape:
            train_predictions = wrapped_model(data, training=True)
            loss = compute_loss(labels, train_predictions)

        variables = wrapped_model.trainable_variables
        gradients = tape.gradient(loss, variables)

        # TODO: Expose the layer names to lookout for as a ConfigOption?
        # TODO: Crystina mentioned that hugging face models have 'bert' in all
        # the layers (including classifiers). Handle this case
        bert_variables = [
            (grad, variable) for grad, variable in zip(gradients, variables)
            if is_bert_parameters(variable.name)
            and "classifier" not in variable.name
        ]
        classifier_vars = [
            (grad, variable) for grad, variable in zip(gradients, variables)
            if "classifier" in variable.name
        ]
        other_vars = [
            (grad, variable) for grad, variable in zip(gradients, variables)
            if (not is_bert_parameters(variable.name))
            and "classifier" not in variable.name
        ]

        # Making sure that we did not miss any variables
        assert len(bert_variables) + len(classifier_vars) + len(
            other_vars) == len(variables)

        # TODO: Clean this up for general use
        if self.config["lr"] > 0:
            optimizer_1.apply_gradients(classifier_vars + other_vars)
        if self.config["bertlr"] > 0:
            optimizer_2.apply_gradients(bert_variables)

        return loss

    def test_step(inputs):
        """Return the model predictions for one batch (labels are ignored)."""
        data, _ = inputs
        return wrapped_model.predict_step(data)

    @tf.function
    def distributed_train_step(dataset_inputs):
        per_replica_losses = self.strategy.run(train_step,
                                               args=(dataset_inputs, ))
        return self.strategy.reduce(tf.distribute.ReduceOp.SUM,
                                    per_replica_losses, axis=None)

    @tf.function
    def distributed_test_step(dataset_inputs):
        return self.strategy.run(test_step, args=(dataset_inputs, ))

    best_metric = -np.inf
    epoch = 0
    num_batches = 0
    total_loss = 0
    iter_bar = tqdm(total=self.config["itersize"])

    # Set and log the initial learning rates (epoch 0 of the schedule).
    initial_lr = self.change_lr(epoch, self.config["bertlr"],
                                do_warmup=self.config["warmupbert"])
    K.set_value(optimizer_2.lr, K.get_value(initial_lr))
    wandb.log({"bertlr": K.get_value(initial_lr)},
              step=epoch + start_epoch, commit=False)

    initial_lr = self.change_lr(epoch, self.config["lr"],
                                do_warmup=self.config["warmupnonbert"])
    K.set_value(optimizer_1.lr, K.get_value(initial_lr))
    wandb.log({"lr": K.get_value(initial_lr)},
              step=epoch + start_epoch, commit=False)

    train_records = train_records.shuffle(100000)
    train_dist_dataset = self.strategy.experimental_distribute_dataset(
        train_records)

    # Goes through the dataset ONCE (i.e niters * itersize * batch samples).
    # However, the dataset may already contain multiple instances of the same
    # sample, depending upon what Sampler was used. If you want multiple
    # epochs, achieve it by tweaking the niters and itersize values.
    for x in train_dist_dataset:
        total_loss += distributed_train_step(x)
        num_batches += 1
        iter_bar.update(1)

        if num_batches % self.config["itersize"] == 0:
            epoch += 1

            # `total_loss` is reset every epoch and exactly `itersize`
            # batches were accumulated, so this is the epoch's mean batch
            # loss. (Dividing by the running batch counter before it was
            # incremented divided by zero on the first batch and by a
            # cumulative count afterwards.)
            train_loss = total_loss / self.config["itersize"]

            # Do warmup and decay
            new_lr = self.change_lr(epoch, self.config["bertlr"],
                                    do_warmup=self.config["warmupbert"])
            K.set_value(optimizer_2.lr, K.get_value(new_lr))
            wandb.log({"bertlr": K.get_value(new_lr)},
                      step=epoch + start_epoch, commit=False)

            new_lr = self.change_lr(epoch, self.config["lr"],
                                    do_warmup=self.config["warmupnonbert"])
            K.set_value(optimizer_1.lr, K.get_value(new_lr))
            wandb.log({"lr": K.get_value(new_lr)},
                      step=epoch + start_epoch, commit=False)

            iter_bar.close()
            logger.info("train_loss for epoch {} is {}".format(
                epoch, train_loss))
            wandb.log({"loss": float(train_loss.numpy())},
                      step=epoch + start_epoch, commit=False)
            total_loss = 0

            if epoch % self.config["validatefreq"] == 0:
                dev_predictions = []
                for dev_batch in tqdm(dev_dist_dataset, desc="validation"):
                    # With several replicas the result is a per-replica
                    # container; otherwise it is a single tensor.
                    pred_batch = (
                        distributed_test_step(dev_batch).values
                        if self.strategy.num_replicas_in_sync > 1 else
                        [distributed_test_step(dev_batch)])
                    for p in pred_batch:
                        dev_predictions.extend(p)

                trec_preds = self.get_preds_in_trec_format(
                    dev_predictions, dev_data)
                metrics = evaluator.eval_runs(
                    trec_preds, dict(qrels),
                    evaluator.DEFAULT_METRICS + ["bpref"], relevance_level)
                logger.info(
                    "dev metrics: %s", " ".join([
                        f"{name}={value:0.3f}"
                        for name, value in sorted(metrics.items())
                    ]))
                if metrics[metric] > best_metric:
                    logger.info("Writing checkpoint")
                    best_metric = metrics[metric]
                    wrapped_model.save_weights(
                        "{0}/dev.best".format(train_output_path))

                wandb.log(
                    {
                        f"dev-{k}": v
                        for k, v in metrics.items() if k in [
                            "map", "bpref", "P_20", "ndcg_cut_20",
                            "judged_10", "judged_20", "judged_200"
                        ]
                    },
                    step=epoch + start_epoch, commit=False)

            # Fresh progress bar for the next epoch.
            iter_bar = tqdm(total=self.config["itersize"])

        if num_batches >= self.config["niters"] * self.config["itersize"]:
            break
def train():
    """Build the network and fit it on the KITTI training subset.

    Loads the training data, augments it with `orientation_confidence_flip`,
    splits it into train/validation parts according to `cfg().split`, compiles
    the model with three weighted losses and runs `fit_generator` for 500
    epochs with checkpoint, TensorBoard and learning-rate callbacks.
    """
    KITTI_train_gen = KITTILoader(subset='training')
    dim_avg, dim_cnt = KITTI_train_gen.get_average_dimension()

    new_data = orientation_confidence_flip(KITTI_train_gen.image_data, dim_avg)

    model = nn.network()
    #model.load_weights('model00000296.hdf5')

    # NOTE: early_stop is created but not included in the callbacks passed to
    # fit_generator below.
    early_stop = callbacks.EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10, mode='min', verbose=1)
    # A checkpoint is written every epoch (save_best_only=False, period=1).
    checkpoint = callbacks.ModelCheckpoint('model{epoch:08d}.hdf5', monitor='val_loss', verbose=1, save_best_only=False, mode='min', period=1)
    tensorboard = callbacks.TensorBoard(log_dir='logs/', histogram_freq=0, write_graph=True, write_images=False)

    all_examples = len(new_data)
    trv_split = int(cfg().split * all_examples) # train val split

    train_gen = data_gen(new_data[: trv_split])
    valid_gen = data_gen(new_data[trv_split : all_examples])

    print("READY FOR TRAINING")

    # Number of batches per epoch for each split (last batch may be partial).
    train_num = int(np.ceil(trv_split / cfg().batch_size))
    valid_num = int(np.ceil((all_examples - trv_split) / cfg().batch_size))

    #gen_flow = gen_flow_for_two_inputs(X_train, X_angle_train, y_train)

    # choose the minimizer to be sgd
    # minimizer = optimizer.SGD(lr=0.0001, momentum = 0.9)
    minimizer = optimizer.Adam(lr=0.0001)

    # multi task learning
    model.compile(optimizer=minimizer, #minimizer,
                  loss={'dimensions': 'mean_squared_error', 'orientation': orientation_loss, 'confidence': 'categorical_crossentropy'},
                  loss_weights={'dimensions': 1., 'orientation': 10., 'confidence': 5.})

    print("####################################################")
    print(K.get_value(model.optimizer.lr))

    # Author's addition (comment translated from the original).
    def scheduler(epoch):
        # Every 10th epoch (except epoch 0) the optimizer's learning rate is
        # multiplied by 0.8 in place; the current value is always returned.
        if epoch%10==0 and epoch!=0:
            lr = K.get_value(model.optimizer.lr)
            K.set_value(model.optimizer.lr, lr*.8)
            print("lr changed to {}".format(lr*.8))
        print("lr = ", K.get_value(model.optimizer.lr))
        return K.get_value(model.optimizer.lr)

    lr_sched = callbacks.LearningRateScheduler(scheduler)

    # d:0.0088 o:0.0042, c:0.0098
    # steps_per_epoch=train_num,
    # validation_steps=valid_num,
    # callbacks=[early_stop, checkpoint, tensorboard],
    model.fit_generator(generator=train_gen,
                        steps_per_epoch=train_num,
                        epochs=500,
                        verbose=1,
                        validation_data=valid_gen,
                        validation_steps=valid_num,
                        shuffle=True,
                        callbacks=[checkpoint, tensorboard, lr_sched],
                        max_queue_size=3)
def test_dynamic_loss_scaling(self, strategy_fn, cloning=True):
    """Checks that a dynamic loss scale grows, shrinks and skips NaN steps.

    The model contains a gradient-check layer that compares the incoming
    gradient against `expected_gradient`, and a layer that injects NaN
    gradients when `have_nan_gradients` is True. The test updates both
    variables between `fit` calls and asserts the value of `layer.v`.
    """
    strategy = strategy_fn()
    initial_loss_scale = 2.
    batch_size = 4
    # Gradient expected by the check layer: the loss scale divided by the
    # batch size.
    expected_gradient = backend.variable([initial_loss_scale / batch_size],
                                         dtype=dtypes.float16)
    # If this variable is set to True, the model below will have NaN gradients
    have_nan_gradients = backend.variable(False, dtype=dtypes.bool)
    with strategy.scope():
        with policy.policy_scope(policy.Policy('infer_float32_vars')):
            x = layers.Input(shape=(1,), batch_size=batch_size,
                             dtype=dtypes.float16)
            layer = AddLayer(assert_type=dtypes.float16)
            y = layer(x)
            identity_with_nan_grads = (
                mp_test_util.create_identity_with_nan_gradients_fn(
                    have_nan_gradients))
            y = core.Lambda(identity_with_nan_grads)(y)
            identity_with_grad_check_fn = (
                mp_test_util.create_identity_with_grad_check_fn(
                    expected_dtype=dtypes.float16,
                    expected_gradient=expected_gradient))
            y = core.Lambda(identity_with_grad_check_fn)(y)
            y = math_ops.cast(y, dtypes.float32)
            model = models.Model(inputs=x, outputs=y)

            def loss_fn(y_true, y_pred):
                # The loss depends only on the prediction; targets are unused.
                del y_true
                return math_ops.reduce_mean(y_pred)

            opt = gradient_descent.SGD(1.)
            loss_scale = loss_scale_module.DynamicLossScale(
                initial_loss_scale=initial_loss_scale, increment_period=2)
            opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
            model.compile(opt, loss=loss_fn, cloning=cloning)

    self.assertEqual(backend.eval(layer.v), 1)
    x = np.ones((batch_size, 1))
    y = np.ones((batch_size, 1))
    dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(batch_size)
    model.fit(dataset)
    # The variables starts with 1 and has a gradient of 1, so will go down by 1
    # each step.
    self.assertEqual(backend.eval(layer.v), 0)

    model.fit(dataset)
    self.assertEqual(backend.eval(layer.v), -1)

    # There have been two steps without NaNs, so the loss scale will double
    backend.set_value(expected_gradient,
                      backend.get_value(expected_gradient * 2))
    model.fit(dataset)
    self.assertEqual(backend.eval(layer.v), -2)

    # Next test with NaN gradients.
    backend.set_value(have_nan_gradients, True)
    model.fit(dataset)
    # Variable should not be updated
    self.assertEqual(backend.eval(layer.v), -2)

    # Test with finite gradients again
    backend.set_value(have_nan_gradients, False)
    # The loss scale will be halved due to the NaNs, so the gradient will also
    # be halved
    backend.set_value(expected_gradient,
                      backend.get_value(expected_gradient / 2))
    model.fit(dataset)
    self.assertEqual(backend.eval(layer.v), -3)
def on_epoch_end(self, epoch, logs=None):
    """Add the optimizer's learning rate to `logs`, then defer to the parent.

    A missing or empty `logs` is replaced by a fresh dict before the
    `'learning_rate'` entry is written.
    """
    if not logs:
        logs = {}
    current_lr = K.get_value(self.model.optimizer.lr)
    logs['learning_rate'] = current_lr
    super().on_epoch_end(epoch, logs)
def on_train_batch_end(self, batch, logs=None):
    """Record the current learning rate and write the batch logs.

    Args:
        batch: Index of the batch that just finished (unused here).
        logs: Optional dict of batch metrics. It is updated in place with a
            `'learning_rate'` entry; a new dict is created when omitted.
    """
    # A `{}` default would be a single dict shared by every call and mutated
    # by the update below, so the default is created per call instead.
    if logs is None:
        logs = {}
    logs.update(
        {'learning_rate': float(k.get_value(self.model.optimizer.lr))})
    # The optimizer's iteration counter is used as the step index.
    index = tf.keras.backend.eval(self.model.optimizer.iterations)
    self._write_logs(logs, index)
def on_epoch_end(self, epoch, logs=None):
    """Store the optimizer's current learning rate under ``logs["lr"]``.

    When `logs` is missing or empty the value is written to a fresh local
    dict, so nothing is visible to the caller in that case.
    """
    if not logs:
        logs = {}
    current_lr = K.get_value(self.model.optimizer.lr)
    logs["lr"] = current_lr