def run_tensorflow(use_gpu, model_names, model_class, precision, num_threads,
                   batch_sizes, sequence_lengths, repeat_times, cache_dir,
                   verbose):
    """Benchmark TensorFlow inference latency for the given models.

    For every (model, batch_size, sequence_length) combination, builds random
    integer input ids, runs one warm-up inference, then times ``repeat_times``
    runs with ``timeit.repeat``.

    Args:
        use_gpu: Run on the first visible GPU instead of CPU.
        model_names: Iterable of HuggingFace model names to benchmark.
        model_class: Optional custom model class passed to the loader.
        precision: A ``Precision`` enum value; only FLOAT32 is supported here.
        num_threads: Intra-op parallelism thread count for TensorFlow.
        batch_sizes: Batch sizes to test (non-positive values are skipped).
        sequence_lengths: Sequence lengths to test (lengths above the
            tokenizer's max input size are skipped).
        repeat_times: Number of timed inference runs per configuration.
        cache_dir: HuggingFace cache directory for configs/models/tokenizers.
        verbose: Unused here; kept for a signature consistent with siblings.

    Returns:
        A list of result dicts (engine/device/shape metadata plus latency
        statistics from ``get_latency_result``). Empty if GPU was requested
        but TensorFlow has no CUDA support.

    Raises:
        NotImplementedError: If ``precision`` is FLOAT16 or INT8.
    """
    results = []

    import tensorflow as tf
    tf.config.threading.set_intra_op_parallelism_threads(num_threads)

    if not use_gpu:
        # Hide all GPUs so TF cannot silently place ops on one.
        tf.config.set_visible_devices([], 'GPU')

    if use_gpu and not tf.test.is_built_with_cuda():
        logger.error(
            "Please install Tensorflow-gpu, and use a machine with GPU for testing gpu performance."
        )
        return results

    if use_gpu:  # Restrict TensorFlow to only use the first GPU
        physical_devices = tf.config.list_physical_devices('GPU')
        try:
            tf.config.set_visible_devices(physical_devices[0], 'GPU')
        except RuntimeError as e:
            logger.exception(e)

    if precision in (Precision.FLOAT16, Precision.INT8):
        raise NotImplementedError(
            "Mixed precision is currently not supported.")

    # Hoisted out of the benchmark loops: one RNG serves every input shape.
    import random
    rng = random.Random()

    for model_name in model_names:
        config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
        model = load_pretrained_model(model_name,
                                      config=config,
                                      cache_dir=cache_dir,
                                      custom_model_class=model_class,
                                      is_tf_model=True)
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  cache_dir=cache_dir)

        max_input_size = tokenizer.max_model_input_sizes[
            model_name] if model_name in tokenizer.max_model_input_sizes else 1024

        for batch_size in batch_sizes:
            if batch_size <= 0:
                continue

            for sequence_length in sequence_lengths:
                if max_input_size is not None and sequence_length > max_input_size:
                    continue

                logger.info("Run Tensorflow on {} with input shape {}".format(
                    model_name, [batch_size, sequence_length]))

                # Random token ids in [0, vocab_size - 1] (inclusive).
                values = [
                    rng.randint(0, config.vocab_size - 1)
                    for i in range(batch_size * sequence_length)
                ]
                input_ids = tf.constant(values,
                                        shape=(batch_size, sequence_length),
                                        dtype=tf.int32)

                try:
                    def encoder_forward():
                        return model(input_ids, training=False)

                    def encoder_decoder_forward():
                        return model(input_ids,
                                     decoder_input_ids=input_ids,
                                     training=False)

                    inference = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward

                    # Warm-up run so graph tracing is excluded from timing.
                    inference()

                    # inference is already a zero-arg callable; no lambda needed.
                    runtimes = timeit.repeat(inference,
                                             repeat=repeat_times,
                                             number=1)

                    result = {
                        "engine": "tensorflow",
                        "version": tf.__version__,
                        "device": "cuda" if use_gpu else "cpu",
                        "optimizer": "",
                        "precision": precision,
                        "io_binding": "",
                        "model_name": model_name,
                        "inputs": 1,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "datetime": str(datetime.now()),
                    }
                    result.update(get_latency_result(runtimes, batch_size))
                    logger.info(result)
                    results.append(result)
                except RuntimeError as e:
                    logger.exception(e)
                    # Only attempt a CUDA device reset when we actually ran on
                    # GPU; on CPU-only machines numba/CUDA may be unavailable
                    # and the import or reset itself would raise.
                    if use_gpu:
                        from numba import cuda
                        device = cuda.get_current_device()
                        device.reset()

    return results
def run_pytorch(use_gpu, model_names, model_class, precision, num_threads,
                batch_sizes, sequence_lengths, repeat_times, torchscript,
                cache_dir, verbose):
    """Benchmark PyTorch (optionally TorchScript) inference latency.

    For every (model, batch_size, sequence_length) combination, builds random
    integer input ids, runs one warm-up inference, then times ``repeat_times``
    runs with ``timeit.repeat``.

    Args:
        use_gpu: Run on ``cuda:0`` instead of CPU.
        model_names: Iterable of HuggingFace model names to benchmark.
        model_class: Optional custom model class passed to the loader.
        precision: A ``Precision`` enum value; FLOAT16 uses ``model.half()``,
            INT8 uses ``QuantizeHelper.quantize_torch_model``.
        num_threads: Recorded in the result rows (threading is configured by
            the caller for the torch path).
        batch_sizes: Batch sizes to test (non-positive values are skipped).
        sequence_lengths: Sequence lengths to test (lengths above the
            tokenizer's max input size are skipped).
        repeat_times: Number of timed inference runs per configuration.
        torchscript: Trace the model with ``torch.jit.trace`` before timing.
        cache_dir: HuggingFace cache directory for configs/models/tokenizers.
        verbose: Unused here; kept for a signature consistent with siblings.

    Returns:
        A list of result dicts (engine/device/shape metadata plus latency
        statistics from ``get_latency_result``). Empty if GPU was requested
        but CUDA is unavailable.
    """
    results = []

    if use_gpu and not torch.cuda.is_available():
        logger.error(
            "Please install PyTorch with Cuda, and use a machine with GPU for testing gpu performance."
        )
        return results

    # Benchmark only; no autograd bookkeeping needed.
    torch.set_grad_enabled(False)

    for model_name in model_names:
        config = AutoConfig.from_pretrained(model_name,
                                            torchscript=torchscript,
                                            cache_dir=cache_dir)
        model = load_pretrained_model(model_name,
                                      config=config,
                                      cache_dir=cache_dir,
                                      custom_model_class=model_class)
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  cache_dir=cache_dir)

        max_input_size = tokenizer.max_model_input_sizes[
            model_name] if model_name in tokenizer.max_model_input_sizes else 1024

        logger.debug(f"Model {model}")
        logger.debug(f"Number of parameters {model.num_parameters()}")

        if precision == Precision.FLOAT16:
            model.half()

        device = torch.device("cuda:0" if use_gpu else "cpu")
        model.to(device)

        if precision == Precision.INT8:
            model = QuantizeHelper.quantize_torch_model(model)

        for batch_size in batch_sizes:
            if batch_size <= 0:
                continue

            for sequence_length in sequence_lengths:
                if max_input_size is not None and sequence_length > max_input_size:
                    continue

                logger.info("Run PyTorch on {} with input shape {}".format(
                    model_name, [batch_size, sequence_length]))

                # torch.randint's `high` bound is EXCLUSIVE, so use vocab_size
                # to sample the full id range [0, vocab_size - 1] — matching
                # the inclusive rng.randint(0, vocab_size - 1) in the
                # TensorFlow benchmark path.
                input_ids = torch.randint(low=0,
                                          high=config.vocab_size,
                                          size=(batch_size, sequence_length),
                                          dtype=torch.long,
                                          device=device)
                try:
                    inference = torch.jit.trace(
                        model, input_ids) if torchscript else model

                    # Warm-up run so lazy initialization is excluded from timing.
                    inference(input_ids)

                    runtimes = timeit.repeat(lambda: inference(input_ids),
                                             repeat=repeat_times,
                                             number=1)

                    result = {
                        "engine": "torchscript" if torchscript else "torch",
                        "version": torch.__version__,
                        "device": "cuda" if use_gpu else "cpu",
                        "optimizer": "",
                        "precision": precision,
                        "io_binding": "",
                        "model_name": model_name,
                        "inputs": 1,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "datetime": str(datetime.now()),
                    }
                    result.update(get_latency_result(runtimes, batch_size))
                    logger.info(result)
                    results.append(result)
                except RuntimeError as e:
                    logger.exception(e)
                    # No-op when CUDA was never initialized, safe on CPU runs.
                    torch.cuda.empty_cache()

    return results