def test_pt_tf_model_equivalence(self):
    """Verify that each TF model and its PyTorch twin give matching outputs.

    For every class in ``self.all_model_classes`` the weights are moved
    between frameworks twice -- once through the in-memory
    ``model => model`` helpers and once through checkpoint files written
    to a temporary directory -- and after each round trip the first output
    of both models must agree within ``2e-2``.

    The test returns immediately when PyTorch is not available.
    """
    if not is_torch_available():
        return

    import torch
    import transformers

    def _as_torch_inputs(tf_inputs):
        # Mirror the TF inputs as PyTorch tensors cast to torch.long.
        return {
            name: torch.from_numpy(tensor.numpy()).to(torch.long)
            for name, tensor in tf_inputs.items()
        }

    def _max_abs_diff(tf_outputs, pt_outputs):
        # Largest element-wise gap between the first output of each model.
        return np.amax(np.abs(tf_outputs[0].numpy() - pt_outputs[0].numpy()))

    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_model_classes:
        # The PyTorch class carries the same name minus the leading "TF".
        pt_model_class = getattr(transformers, model_class.__name__[2:])

        config.output_hidden_states = True

        tf_model = model_class(config)
        pt_model = pt_model_class(config)

        # Round trip 1: PT -> TF -> PT using the model => model functions.
        tf_model = transformers.load_pytorch_model_in_tf2_model(
            tf_model, pt_model, tf_inputs=inputs_dict
        )
        pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)

        # Outputs only need to be close: low-level numerics differ between
        # the two frameworks.
        pt_model.eval()
        pt_inputs_dict = _as_torch_inputs(inputs_dict)
        with torch.no_grad():
            pto = pt_model(**pt_inputs_dict)
        tfo = tf_model(inputs_dict)
        self.assertLessEqual(_max_abs_diff(tfo, pto), 2e-2)

        # Round trip 2: PT -> TF -> PT using the checkpoint => model functions.
        with TemporaryDirectory() as tmpdirname:
            pt_checkpoint_path = os.path.join(tmpdirname, 'pt_model.bin')
            torch.save(pt_model.state_dict(), pt_checkpoint_path)
            tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(
                tf_model, pt_checkpoint_path
            )

            tf_checkpoint_path = os.path.join(tmpdirname, 'tf_model.h5')
            tf_model.save_weights(tf_checkpoint_path)
            pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(
                pt_model, tf_checkpoint_path
            )

        pt_model.eval()
        pt_inputs_dict = _as_torch_inputs(inputs_dict)
        with torch.no_grad():
            pto = pt_model(**pt_inputs_dict)
        tfo = tf_model(inputs_dict)
        self.assertLessEqual(_max_abs_diff(tfo, pto), 2e-2)
def convert_pt_checkpoint_to_tf(model_type,
                                pytorch_checkpoint_path,
                                config_file,
                                tf_dump_path,
                                compare_with_pt_model=False,
                                use_cached_models=True):
    """Load a PyTorch checkpoint into a TF 2.0 model and save it as HDF5.

    Args:
        model_type: Key into ``MODEL_CLASSES`` selecting the config class,
            TF model class, PyTorch model class and config map.
        pytorch_checkpoint_path: Path to the PyTorch weights, or a key of
            the config map, in which case the weights file is resolved
            through ``hf_bucket_url`` and ``cached_path``.
        config_file: Path to a JSON config, or a key of the config map, in
            which case the mapped value is resolved through ``cached_path``.
        tf_dump_path: Destination passed to ``tf_model.save_weights``.
        compare_with_pt_model: When true, run both models on their
            ``dummy_inputs`` and require the first outputs to agree
            within ``2e-2``.
        use_cached_models: When false, ``cached_path`` is called with
            ``force_download=True``.

    Raises:
        ValueError: If ``model_type`` is not a key of ``MODEL_CLASSES``.
        AssertionError: If the comparison is requested and the maximum
            absolute difference is not ``<= 2e-2`` (NaN included).
    """
    if model_type not in MODEL_CLASSES:
        raise ValueError(
            "Unrecognized model type, should be one of {}.".format(
                list(MODEL_CLASSES.keys())))

    config_class, model_class, pt_model_class, aws_config_map = MODEL_CLASSES[
        model_type]

    # Initialise TF model
    if config_file in aws_config_map:
        config_file = cached_path(aws_config_map[config_file],
                                  force_download=not use_cached_models)
    config = config_class.from_json_file(config_file)
    config.output_hidden_states = True
    config.output_attentions = True
    print("Building TensorFlow model from configuration: {}".format(
        str(config)))
    tf_model = model_class(config)

    # Resolve a known model identifier to a local copy of its PyTorch weights
    if pytorch_checkpoint_path in aws_config_map:
        pytorch_checkpoint_url = hf_bucket_url(pytorch_checkpoint_path,
                                               filename=WEIGHTS_NAME)
        pytorch_checkpoint_path = cached_path(
            pytorch_checkpoint_url, force_download=not use_cached_models)

    # Load PyTorch checkpoint in tf2 model:
    tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model,
                                                    pytorch_checkpoint_path)

    if compare_with_pt_model:
        tfo = tf_model(tf_model.dummy_inputs,
                       training=False)  # build the network

        # NOTE(review): torch.load unpickles the file -- only point this at
        # checkpoints from a trusted source.
        state_dict = torch.load(pytorch_checkpoint_path, map_location="cpu")
        pt_model = pt_model_class.from_pretrained(
            pretrained_model_name_or_path=None,
            config=config,
            state_dict=state_dict)

        with torch.no_grad():
            pto = pt_model(**pt_model.dummy_inputs)

        np_pt = pto[0].numpy()
        np_tf = tfo[0].numpy()
        diff = np.amax(np.abs(np_pt - np_tf))
        print("Max absolute difference between models outputs {}".format(diff))
        # Explicit raise instead of `assert` so the check still runs under
        # `python -O`; `not diff <= ...` keeps NaN a failure.
        if not diff <= 2e-2:
            raise AssertionError(
                "Error, model absolute difference is >2e-2: {}".format(diff))

    # Save the converted TensorFlow weights
    print("Save TensorFlow model to {}".format(tf_dump_path))
    tf_model.save_weights(tf_dump_path, save_format="h5")
def convert_pt_checkpoint_to_tf(model_type,
                                pytorch_checkpoint_path,
                                config_file,
                                tf_dump_path,
                                compare_with_pt_model=False,
                                use_cached_models=True):
    """Load a PyTorch checkpoint into a TF 2.0 model and save it as HDF5.

    Args:
        model_type: Key into ``MODEL_CLASSES``; each entry unpacks into a
            config class, TF model class, PyTorch model class, model map
            and config map.
        pytorch_checkpoint_path: Path to the PyTorch weights, or a key of
            the model map, in which case the mapped value is resolved
            through ``cached_path``.
        config_file: Path to a JSON config, or a key of the config map, in
            which case the mapped value is resolved through ``cached_path``.
        tf_dump_path: Destination passed to ``tf_model.save_weights``.
        compare_with_pt_model: When true, run both models on a small fixed
            batch of token ids and require the first outputs to agree
            within ``2e-2``.
        use_cached_models: When false, ``cached_path`` is called with
            ``force_download=True``.

    Raises:
        ValueError: If ``model_type`` is not a key of ``MODEL_CLASSES``.
        AssertionError: If the comparison is requested and the maximum
            absolute difference is not ``<= 2e-2`` (NaN included).
    """
    if model_type not in MODEL_CLASSES:
        raise ValueError(
            "Unrecognized model type, should be one of {}.".format(
                list(MODEL_CLASSES.keys())))

    config_class, model_class, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[
        model_type]

    # Initialise TF model
    if config_file in aws_config_map:
        config_file = cached_path(aws_config_map[config_file],
                                  force_download=not use_cached_models)
    config = config_class.from_json_file(config_file)
    config.output_hidden_states = True
    config.output_attentions = True
    print("Building TensorFlow model from configuration: {}".format(
        str(config)))
    tf_model = model_class(config)

    # Resolve a known model identifier to a local copy of its PyTorch weights
    if pytorch_checkpoint_path in aws_model_maps:
        pytorch_checkpoint_path = cached_path(
            aws_model_maps[pytorch_checkpoint_path],
            force_download=not use_cached_models)

    # Load PyTorch checkpoint in tf2 model:
    tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model,
                                                    pytorch_checkpoint_path)

    if compare_with_pt_model:
        # The same fixed batch of ids is fed to both frameworks.
        inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
        tf_inputs = tf.constant(inputs_list)
        tfo = tf_model(tf_inputs, training=False)  # build the network

        # NOTE(review): torch.load unpickles the file -- only point this at
        # checkpoints from a trusted source.
        state_dict = torch.load(pytorch_checkpoint_path, map_location='cpu')
        pt_model = pt_model_class.from_pretrained(None,
                                                  config=config,
                                                  state_dict=state_dict)
        pt_inputs = torch.tensor(inputs_list)
        with torch.no_grad():
            pto = pt_model(pt_inputs)

        np_pt = pto[0].detach().numpy()
        np_tf = tfo[0].numpy()
        diff = np.amax(np.abs(np_pt - np_tf))
        print("Max absolute difference between models outputs {}".format(diff))
        # Explicit raise instead of `assert` so the check still runs under
        # `python -O`; `not diff <= ...` keeps NaN a failure.
        if not diff <= 2e-2:
            raise AssertionError("Error, model absolute difference is >2e-2")

    # Save the converted TensorFlow weights
    print("Save TensorFlow model to {}".format(tf_dump_path))
    tf_model.save_weights(tf_dump_path, save_format='h5')
def test_pt_tf_model_equivalence(self):
    """Verify that each PyTorch model and its TF twin give matching outputs.

    For every class in ``self.all_model_classes`` the weights are moved
    between frameworks twice -- through the in-memory ``model => model``
    helpers and through checkpoint files in a temporary directory -- and
    after each round trip the first output of both models must agree
    within ``6e-2`` once NaN positions have been zeroed on both sides.

    The test returns as soon as a class has no ``"TF" + name`` counterpart
    in ``transformers``.
    """

    def recursive_numpy_convert(iterable):
        # Mirror a (possibly nested) dict of PyTorch inputs as TF tensors
        # created with dtype=tf.int32.
        return_dict = {}
        for key, value in iterable.items():
            if isinstance(value, bool):
                # Plain flags are forwarded untouched. This must be part of
                # the same if/elif chain: otherwise the bool falls through
                # to `value.cpu()` below and raises AttributeError.
                return_dict[key] = value
            elif isinstance(value, dict):
                return_dict[key] = recursive_numpy_convert(value)
            elif isinstance(value, (list, tuple)):
                # Materialise a tuple rather than a generator: the converted
                # inputs are consumed by more than one model call.
                return_dict[key] = tuple(
                    tf.convert_to_tensor(iter_value.cpu().numpy(), dtype=tf.int32)
                    for iter_value in value
                )
            else:
                return_dict[key] = tf.convert_to_tensor(
                    value.cpu().numpy(), dtype=tf.int32)
        return return_dict

    def masked_max_diff(tf_array, pt_array):
        # Zero every position that is NaN in either array (in place), then
        # return the largest remaining absolute difference.
        tf_nans = np.copy(np.isnan(tf_array))
        pt_nans = np.copy(np.isnan(pt_array))
        pt_array[tf_nans] = 0
        tf_array[tf_nans] = 0
        pt_array[pt_nans] = 0
        tf_array[pt_nans] = 0
        return np.amax(np.abs(tf_array - pt_array))

    for model_class in self.all_model_classes:
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
            return_obj_labels="PreTraining" in model_class.__name__
        )

        tf_model_class_name = "TF" + model_class.__name__  # Add the "TF" at the beginning

        if not hasattr(transformers, tf_model_class_name):
            # transformers does not have TF version yet
            return

        tf_model_class = getattr(transformers, tf_model_class_name)

        config.output_hidden_states = True
        config.task_obj_predict = False

        pt_model = model_class(config)
        tf_model = tf_model_class(config)

        # Check we can load pt model in tf and vice-versa with model => model functions
        pt_inputs = self._prepare_for_class(inputs_dict, model_class)
        tf_inputs_dict = recursive_numpy_convert(pt_inputs)

        tf_model = transformers.load_pytorch_model_in_tf2_model(
            tf_model, pt_model, tf_inputs=tf_inputs_dict)
        pt_model = transformers.load_tf2_model_in_pytorch_model(
            pt_model, tf_model).to(torch_device)

        # Outputs only need to be close: low-level numerics differ between
        # the two frameworks.
        pt_model.eval()

        # Delete obj labels as we want to compute the hidden states and not the loss
        if "obj_labels" in inputs_dict:
            del inputs_dict["obj_labels"]

        pt_inputs = self._prepare_for_class(inputs_dict, model_class)
        tf_inputs_dict = recursive_numpy_convert(pt_inputs)

        with torch.no_grad():
            pto = pt_model(**pt_inputs)
        tfo = tf_model(tf_inputs_dict, training=False)

        max_diff = masked_max_diff(tfo[0].numpy(), pto[0].cpu().numpy())

        # Debug info (remove when fixed)
        if max_diff >= 2e-2:
            print("===")
            print(model_class)
            print(config)
            print(inputs_dict)
            print(pt_inputs)
        self.assertLessEqual(max_diff, 6e-2)

        # Check we can load pt model in tf and vice-versa with checkpoint => model functions
        with tempfile.TemporaryDirectory() as tmpdirname:
            pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
            torch.save(pt_model.state_dict(), pt_checkpoint_path)
            tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(
                tf_model, pt_checkpoint_path)

            tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
            tf_model.save_weights(tf_checkpoint_path)
            pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(
                pt_model, tf_checkpoint_path)

        pt_model.eval()

        for key, value in pt_inputs.items():
            # Only tensors can be re-cast; bools and other non-tensor
            # inputs are passed through unchanged.
            if not isinstance(value, torch.Tensor):
                continue
            if key in ("visual_feats", "visual_pos"):
                pt_inputs[key] = value.to(torch.float32)
            else:
                pt_inputs[key] = value.to(torch.long)

        with torch.no_grad():
            pto = pt_model(**pt_inputs)
        tfo = tf_model(tf_inputs_dict)

        max_diff = masked_max_diff(tfo[0].numpy(), pto[0].cpu().numpy())
        self.assertLessEqual(max_diff, 6e-2)
def test_pt_tf_model_equivalence(self):
    """Verify that each TF model and its PyTorch twin give matching outputs.

    For every class in ``self.all_model_classes`` the weights are moved
    between frameworks through the in-memory ``model => model`` helpers
    and then through checkpoint files in a temporary directory. After each
    round trip the first output of both models must agree within ``2e-2``
    once NaN positions have been zeroed on both sides.

    The test returns immediately when PyTorch is not available.
    """
    if not is_torch_available():
        return

    import torch
    import transformers

    def _torch_inputs_for(tf_class):
        # Prepare the TF inputs afresh and mirror them as torch.long tensors.
        prepared = self._prepare_for_class(inputs_dict, tf_class)
        converted = {
            name: torch.from_numpy(tensor.numpy()).to(torch.long)
            for name, tensor in prepared.items()
        }
        # Encoder-decoder TF models call the first argument "inputs";
        # PyTorch expects "input_ids".
        if "inputs" in converted and self.is_encoder_decoder:
            converted["input_ids"] = converted.pop("inputs")
        return converted

    def _masked_max_diff(tf_array, pt_array):
        # Zero NaN positions of either array in both arrays (in place) and
        # return the largest remaining absolute difference.
        tf_nans = np.copy(np.isnan(tf_array))
        pt_nans = np.copy(np.isnan(pt_array))
        for mask in (tf_nans, pt_nans):
            pt_array[mask] = 0
            tf_array[mask] = 0
        return np.amax(np.abs(tf_array - pt_array))

    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_model_classes:
        # The PyTorch class carries the same name minus the leading "TF".
        pt_model_class = getattr(transformers, model_class.__name__[2:])

        config.output_hidden_states = True

        tf_model = model_class(config)
        pt_model = pt_model_class(config)

        # Round trip 1: model => model functions.
        tf_model = transformers.load_pytorch_model_in_tf2_model(
            tf_model,
            pt_model,
            tf_inputs=self._prepare_for_class(inputs_dict, model_class),
        )
        pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)

        pt_model.eval()
        pt_inputs_dict = _torch_inputs_for(model_class)

        with torch.no_grad():
            pto = pt_model(**pt_inputs_dict)
        tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False)

        tf_hidden_states = tfo[0].numpy()
        pt_hidden_states = pto[0].numpy()
        max_diff = _masked_max_diff(tf_hidden_states, pt_hidden_states)

        # Debug info (remove when fixed)
        if max_diff >= 2e-2:
            for item in ("===", model_class, config, inputs_dict, pt_inputs_dict):
                print(item)

        self.assertLessEqual(max_diff, 2e-2)

        # Round trip 2: checkpoint => model functions.
        with tempfile.TemporaryDirectory() as tmpdirname:
            pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
            torch.save(pt_model.state_dict(), pt_checkpoint_path)
            tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(
                tf_model, pt_checkpoint_path
            )

            tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
            tf_model.save_weights(tf_checkpoint_path)
            pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(
                pt_model, tf_checkpoint_path
            )

        pt_model.eval()
        pt_inputs_dict = _torch_inputs_for(model_class)

        with torch.no_grad():
            pto = pt_model(**pt_inputs_dict)
        tfo = tf_model(self._prepare_for_class(inputs_dict, model_class))

        tfo = tfo[0].numpy()
        pto = pto[0].numpy()
        self.assertLessEqual(_masked_max_diff(tfo, pto), 2e-2)
def test_pt_tf_model_equivalence(self):
    """Verify that each PyTorch model and its TF twin give matching outputs.

    For every class in ``self.all_model_classes`` the weights are moved
    between frameworks through the in-memory ``model => model`` helpers
    and then through checkpoint files in a temporary directory. After each
    round trip every pair of tensor outputs must have the same shape and
    agree within ``4e-2`` once NaN positions have been zeroed.

    The test returns as soon as a class has no ``"TF" + name`` counterpart
    in ``transformers``.
    """
    import numpy as np
    import tensorflow as tf
    import transformers

    float_keys = ("input_values", "pixel_values")
    ignored_tf_args = ("head_mask", "cross_attn_head_mask", "decoder_head_mask")

    def _tensor_to_tf(key, tensor):
        # Float features keep float32; every other tensor is passed as int32.
        target = tf.float32 if key in float_keys else tf.int32
        return tf.convert_to_tensor(tensor.numpy(), dtype=target)

    def _assert_outputs_close(tf_outputs, pt_outputs):
        # Compare the two model outputs component by component.
        self.assertEqual(
            len(tf_outputs), len(pt_outputs),
            "Output lengths differ between TF and PyTorch")

        for tf_output, pt_output in zip(tf_outputs.to_tuple(), pt_outputs.to_tuple()):
            both_tensors = isinstance(tf_output, tf.Tensor) and isinstance(
                pt_output, torch.Tensor)
            if not both_tensors:
                continue

            tf_out = tf_output.numpy()
            pt_out = pt_output.numpy()

            self.assertEqual(
                tf_out.shape, pt_out.shape,
                "Output component shapes differ between TF and PyTorch")

            if len(tf_out.shape) > 0:
                # Zero NaN positions of either array in both arrays.
                tf_nans = np.copy(np.isnan(tf_out))
                pt_nans = np.copy(np.isnan(pt_out))
                for mask in (tf_nans, pt_nans):
                    pt_out[mask] = 0
                    tf_out[mask] = 0

            max_diff = np.amax(np.abs(tf_out - pt_out))
            self.assertLessEqual(max_diff, 4e-2)

    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_model_classes:
        tf_model_class_name = "TF" + model_class.__name__  # Add the "TF" at the beginning

        if not hasattr(transformers, tf_model_class_name):
            # transformers does not have TF version yet
            return

        tf_model_class = getattr(transformers, tf_model_class_name)

        config.output_hidden_states = True

        tf_model = tf_model_class(config)
        pt_model = model_class(config)

        # Forward only the inputs that the TF `call` signature accepts,
        # leaving out all head masks.
        tf_input_keys = set(inspect.signature(tf_model.call).parameters.keys())
        for masked_arg in ignored_tf_args:
            tf_input_keys.discard(masked_arg)

        pt_inputs = self._prepare_for_class(inputs_dict, model_class)
        pt_inputs = {
            name: value for name, value in pt_inputs.items() if name in tf_input_keys
        }

        pt_model.eval()

        # Python bools are forwarded as-is; tensors are mirrored in TF.
        tf_inputs_dict = {}
        for key, tensor in pt_inputs.items():
            if isinstance(tensor, bool):
                tf_inputs_dict[key] = tensor
            else:
                tf_inputs_dict[key] = _tensor_to_tf(key, tensor)

        # Round trip 1: model => model functions.
        tf_model = transformers.load_pytorch_model_in_tf2_model(
            tf_model, pt_model, tf_inputs=tf_inputs_dict)
        pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)

        with torch.no_grad():
            pto = pt_model(**pt_inputs)
        tfo = tf_model(tf_inputs_dict, training=False)

        _assert_outputs_close(tfo, pto)

        # Round trip 2: checkpoint => model functions.
        with tempfile.TemporaryDirectory() as tmpdirname:
            pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
            torch.save(pt_model.state_dict(), pt_checkpoint_path)
            tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(
                tf_model, pt_checkpoint_path)

            tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
            tf_model.save_weights(tf_checkpoint_path)
            pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(
                pt_model, tf_checkpoint_path)

        pt_model.eval()

        # This time Python bools are wrapped in a numpy bool array and
        # converted with dtype=tf.int32.
        tf_inputs_dict = {}
        for key, tensor in pt_inputs.items():
            if isinstance(tensor, bool):
                tensor = np.array(tensor, dtype=bool)
                tf_inputs_dict[key] = tf.convert_to_tensor(tensor, dtype=tf.int32)
            else:
                tf_inputs_dict[key] = _tensor_to_tf(key, tensor)

        with torch.no_grad():
            pto = pt_model(**pt_inputs)
        tfo = tf_model(tf_inputs_dict)

        _assert_outputs_close(tfo, pto)
def test_pt_tf_model_equivalence(self):
    """Verify that each TF model and its PyTorch twin give matching outputs.

    For every class in ``self.all_model_classes`` the weights are moved
    between frameworks through the in-memory ``model => model`` helpers
    and then through checkpoint files in a temporary directory. After each
    round trip the first output of both models must agree within ``6e-2``
    once NaN positions have been zeroed on both sides.

    The test returns immediately when PyTorch is not available.
    """
    from transformers import is_torch_available

    if not is_torch_available():
        return

    import os

    import numpy as np
    import torch

    import transformers

    def torch_type(key):
        # Visual features stay floating point; everything else is cast to long.
        if key in ("visual_feats", "visual_pos"):
            return torch.float32
        return torch.long

    def recursive_numpy_convert(iterable):
        # Mirror a (possibly nested) dict of TF inputs as PyTorch tensors.
        return_dict = {}
        for key, value in iterable.items():
            if isinstance(value, dict):
                return_dict[key] = recursive_numpy_convert(value)
            elif isinstance(value, (list, tuple)):
                # Materialise a tuple rather than a one-shot generator so
                # the inputs survive being read more than once (e.g. by the
                # debug print below).
                return_dict[key] = tuple(
                    torch.from_numpy(iter_value.numpy()).to(torch_type(key))
                    for iter_value in value
                )
            else:
                return_dict[key] = torch.from_numpy(value.numpy()).to(torch_type(key))
        return return_dict

    def masked_max_diff(tf_array, pt_array):
        # Zero every position that is NaN in either array (in place), then
        # return the largest remaining absolute difference.
        tf_nans = np.copy(np.isnan(tf_array))
        pt_nans = np.copy(np.isnan(pt_array))
        pt_array[tf_nans] = 0
        tf_array[tf_nans] = 0
        pt_array[pt_nans] = 0
        tf_array[pt_nans] = 0
        return np.amax(np.abs(tf_array - pt_array))

    for model_class in self.all_model_classes:
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
            return_obj_labels="PreTraining" in model_class.__name__
        )

        pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beginning
        pt_model_class = getattr(transformers, pt_model_class_name)

        config.output_hidden_states = True
        config.task_obj_predict = False

        tf_model = model_class(config)
        pt_model = pt_model_class(config)

        # Check we can load pt model in tf and vice-versa with model => model functions
        tf_model = transformers.load_pytorch_model_in_tf2_model(
            tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class)
        )
        pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)

        # Outputs only need to be close: low-level numerics differ between
        # the two frameworks.
        pt_model.eval()

        # Delete obj labels as we want to compute the hidden states and not the loss
        if "obj_labels" in inputs_dict:
            del inputs_dict["obj_labels"]

        pt_inputs_dict = recursive_numpy_convert(self._prepare_for_class(inputs_dict, model_class))

        # need to rename encoder-decoder "inputs" for PyTorch
        if "inputs" in pt_inputs_dict and self.is_encoder_decoder:
            pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs")

        with torch.no_grad():
            pto = pt_model(**pt_inputs_dict)
        tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False)

        max_diff = masked_max_diff(tfo[0].numpy(), pto[0].numpy())

        # Debug info (remove when fixed)
        if max_diff >= 2e-2:
            print("===")
            print(model_class)
            print(config)
            print(inputs_dict)
            print(pt_inputs_dict)

        self.assertLessEqual(max_diff, 6e-2)

        # Check we can load pt model in tf and vice-versa with checkpoint => model functions
        with tempfile.TemporaryDirectory() as tmpdirname:
            pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
            torch.save(pt_model.state_dict(), pt_checkpoint_path)
            tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)

            tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
            tf_model.save_weights(tf_checkpoint_path)
            pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)

        pt_model.eval()

        # Convert each input straight to its final dtype. Casting everything
        # to torch.long first and only then casting the visual features back
        # to float32 would truncate them, so the PyTorch model would not see
        # the same values as the TF model.
        pt_inputs_dict = recursive_numpy_convert(self._prepare_for_class(inputs_dict, model_class))

        with torch.no_grad():
            pto = pt_model(**pt_inputs_dict)
        tfo = tf_model(self._prepare_for_class(inputs_dict, model_class))

        max_diff = masked_max_diff(tfo[0].numpy(), pto[0].numpy())
        self.assertLessEqual(max_diff, 6e-2)
def test_pt_tf_model_equivalence(self):
    """Verify that each TF model and its PyTorch twin give matching outputs.

    For every class in ``self.all_model_classes`` the weights are moved
    between frameworks through the in-memory ``model => model`` helpers
    and then through checkpoint files in a temporary directory. After each
    round trip every pair of tensor outputs must have the same shape and
    agree within ``4e-2`` once NaN positions have been zeroed.
    """
    import torch
    import transformers

    float_keys = ("input_values", "pixel_values")

    def _tensor_to_torch(name, tensor):
        # Float features keep float32; every other tensor is cast to long.
        target = torch.float32 if name in float_keys else torch.long
        return torch.from_numpy(tensor.numpy()).to(target)

    def _rename_encoder_decoder_inputs(pt_kwargs):
        # Encoder-decoder TF models call the first argument "inputs";
        # PyTorch expects "input_ids".
        if "inputs" in pt_kwargs and self.is_encoder_decoder:
            pt_kwargs["input_ids"] = pt_kwargs.pop("inputs")

    def _assert_outputs_close(tf_outputs, pt_outputs):
        # Compare the two model outputs component by component.
        self.assertEqual(
            len(tf_outputs), len(pt_outputs), "Output lengths differ between TF and PyTorch"
        )

        for tf_output, pt_output in zip(tf_outputs.to_tuple(), pt_outputs.to_tuple()):
            both_tensors = isinstance(tf_output, tf.Tensor) and isinstance(pt_output, torch.Tensor)
            if not both_tensors:
                continue

            tf_out = tf_output.numpy()
            pt_out = pt_output.numpy()

            self.assertEqual(
                tf_out.shape, pt_out.shape, "Output component shapes differ between TF and PyTorch"
            )

            if len(tf_out.shape) > 0:
                # Zero NaN positions of either array in both arrays.
                tf_nans = np.copy(np.isnan(tf_out))
                pt_nans = np.copy(np.isnan(pt_out))
                for mask in (tf_nans, pt_nans):
                    pt_out[mask] = 0
                    tf_out[mask] = 0

            max_diff = np.amax(np.abs(tf_out - pt_out))
            self.assertLessEqual(max_diff, 4e-2)

    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_model_classes:
        # The PyTorch class carries the same name minus the leading "TF".
        pt_model_class = getattr(transformers, model_class.__name__[2:])

        config.output_hidden_states = True

        tf_model = model_class(config)
        pt_model = pt_model_class(config)

        # Round trip 1: model => model functions.
        tf_model = transformers.load_pytorch_model_in_tf2_model(
            tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class)
        )
        pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)

        pt_model.eval()

        # Python bools are forwarded as-is; tensors are mirrored in PyTorch.
        pt_inputs_dict = {}
        for name, value in self._prepare_for_class(inputs_dict, model_class).items():
            if isinstance(value, bool):
                pt_inputs_dict[name] = value
            else:
                pt_inputs_dict[name] = _tensor_to_torch(name, value)

        _rename_encoder_decoder_inputs(pt_inputs_dict)

        with torch.no_grad():
            pto = pt_model(**pt_inputs_dict)
        tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False)

        _assert_outputs_close(tfo, pto)

        # Round trip 2: checkpoint => model functions.
        with tempfile.TemporaryDirectory() as tmpdirname:
            pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
            torch.save(pt_model.state_dict(), pt_checkpoint_path)
            tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)

            tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
            tf_model.save_weights(tf_checkpoint_path)
            pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)

        pt_model.eval()

        # This time Python bools are wrapped in a numpy bool array and
        # turned into torch.long tensors.
        pt_inputs_dict = {}
        for name, value in self._prepare_for_class(inputs_dict, model_class).items():
            if isinstance(value, bool):
                value = np.array(value, dtype=bool)
                pt_inputs_dict[name] = torch.from_numpy(value).to(torch.long)
            else:
                pt_inputs_dict[name] = _tensor_to_torch(name, value)

        _rename_encoder_decoder_inputs(pt_inputs_dict)

        with torch.no_grad():
            pto = pt_model(**pt_inputs_dict)
        tfo = tf_model(self._prepare_for_class(inputs_dict, model_class))

        _assert_outputs_close(tfo, pto)
def test_pt_tf_model_equivalence(self):
    """Verify that each PyTorch model and its TF twin give matching outputs.

    A single ``noise`` array is drawn from a seeded numpy generator and
    handed to both frameworks through their ``noise`` argument. For every
    class in ``self.all_model_classes`` the weights are moved between
    frameworks through the in-memory ``model => model`` helpers and then
    through checkpoint files in a temporary directory; after each round
    trip all non-``None`` outputs must agree within ``1e-5`` once NaN
    positions have been zeroed.
    """
    import numpy as np
    import tensorflow as tf
    import transformers

    # make masks reproducible
    np.random.seed(2)

    config, _ = self.model_tester.prepare_config_and_inputs_for_common()
    num_patches = int((config.image_size // config.patch_size)**2)
    noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches))
    pt_noise = torch.from_numpy(noise).to(device=torch_device)
    tf_noise = tf.constant(noise)

    def prepare_tf_inputs_from_pt_inputs(pt_inputs_dict):
        # Mirror every PyTorch input as a float32 TF tensor.
        return {
            key: tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
            for key, tensor in pt_inputs_dict.items()
        }

    def check_outputs(tf_outputs, pt_outputs, model_class, names):
        """Recursively compare TF and PyTorch outputs.

        Args:
            model_class: The model class under test. Not used by the
                comparison itself; only forwarded to recursive calls.
            names: A string or a tuple of strings naming what
                `tf_outputs`/`pt_outputs` represent. Not used by the
                comparison itself; only validated and forwarded to
                recursive calls.
        """
        # Exact `tuple`/`list` types only; a list is allowed because some
        # model outputs hold a list of tensors.
        if type(tf_outputs) in (tuple, list):
            self.assertEqual(type(tf_outputs), type(pt_outputs))
            self.assertEqual(len(tf_outputs), len(pt_outputs))

            if type(names) is tuple:
                for tf_output, pt_output, name in zip(tf_outputs, pt_outputs, names):
                    check_outputs(tf_output, pt_output, model_class, names=name)
            elif type(names) is str:
                for idx, (tf_output, pt_output) in enumerate(zip(tf_outputs, pt_outputs)):
                    check_outputs(tf_output, pt_output, model_class, names=f"{names}_{idx}")
            else:
                raise ValueError(
                    f"`names` should be a `tuple` or a string. Got {type(names)} instead."
                )
            return

        if not isinstance(tf_outputs, tf.Tensor):
            raise ValueError(
                f"`tf_outputs` should be a `tuple` or an instance of `tf.Tensor`. Got {type(tf_outputs)} instead."
            )

        self.assertTrue(isinstance(pt_outputs, torch.Tensor))

        tf_outputs = tf_outputs.numpy()
        if isinstance(tf_outputs, np.float32):
            # Wrap a bare scalar in an array so it can be mask-indexed below.
            tf_outputs = np.array(tf_outputs, dtype=np.float32)
        pt_outputs = pt_outputs.detach().to("cpu").numpy()

        # Zero NaN positions of either side in both arrays.
        tf_nans = np.isnan(tf_outputs)
        pt_nans = np.isnan(pt_outputs)
        for mask in (tf_nans, pt_nans):
            pt_outputs[mask] = 0
            tf_outputs[mask] = 0

        max_diff = np.amax(np.abs(tf_outputs - pt_outputs))
        self.assertLessEqual(max_diff, 1e-5)

    def check_pt_tf_models(tf_model, pt_model, pt_inputs_dict):
        # No labels are prepared here; both models are run on the inputs
        # plus the shared noise only.

        # send pytorch model to the correct device
        pt_model.to(torch_device)
        pt_model.eval()

        tf_inputs_dict = prepare_tf_inputs_from_pt_inputs(pt_inputs_dict)

        # send pytorch inputs to the correct device
        pt_inputs_dict = {
            key: value.to(device=torch_device) if isinstance(value, torch.Tensor) else value
            for key, value in pt_inputs_dict.items()
        }

        with torch.no_grad():
            pt_outputs = pt_model(**pt_inputs_dict, noise=pt_noise)
        tf_outputs = tf_model(tf_inputs_dict, noise=tf_noise)

        # Both frameworks must populate the same output fields.
        tf_keys = tuple(key for key, value in tf_outputs.items() if value is not None)
        pt_keys = tuple(key for key, value in pt_outputs.items() if value is not None)
        self.assertEqual(tf_keys, pt_keys)

        check_outputs(tf_outputs.to_tuple(), pt_outputs.to_tuple(), model_class, names=tf_keys)

    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_model_classes:
        tf_model_class_name = "TF" + model_class.__name__  # Add the "TF" at the beginning

        # Output all for aggressive testing
        config.output_hidden_states = True
        config.output_attentions = self.has_attentions

        tf_model_class = getattr(transformers, tf_model_class_name)

        tf_model = tf_model_class(config)
        pt_model = model_class(config)

        # Forward only the inputs that the TF `call` signature accepts,
        # leaving out all head masks.
        tf_input_keys = set(inspect.signature(tf_model.call).parameters.keys())
        for masked_arg in ("head_mask", "cross_attn_head_mask", "decoder_head_mask"):
            tf_input_keys.discard(masked_arg)

        pt_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
        pt_inputs_dict = {
            key: value for key, value in pt_inputs_dict.items() if key in tf_input_keys
        }

        # Round trip 1: model => model functions.
        tf_inputs_dict = prepare_tf_inputs_from_pt_inputs(pt_inputs_dict)

        tf_model = transformers.load_pytorch_model_in_tf2_model(
            tf_model, pt_model, tf_inputs=tf_inputs_dict)
        pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)

        check_pt_tf_models(tf_model, pt_model, pt_inputs_dict)

        # Round trip 2: checkpoint => model functions.
        with tempfile.TemporaryDirectory() as tmpdirname:
            pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
            torch.save(pt_model.state_dict(), pt_checkpoint_path)
            tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(
                tf_model, pt_checkpoint_path)

            tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
            tf_model.save_weights(tf_checkpoint_path)
            pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(
                pt_model, tf_checkpoint_path)
            pt_model = pt_model.to(torch_device)

        check_pt_tf_models(tf_model, pt_model, pt_inputs_dict)