class Temperature:
    """A wrapper, non-instance class to do one-time initialization of get_system()"""
    system = get_system()

    @classmethod
    def _get_core_temps(cls):
        if cls.system.arch == Architecture.Xavier:
            # Because we don't have nvidia-smi on Xavier, we need to use sysfs to read out the temperature.
            # The type of the thermal zone is in /sys/devices/virtual/thermal/thermal_zone<N>/type.
            # To avoid spawning a bunch of processes to check whether a given node is a GPU node, we hardcode the GPU_therm node:
            #     AGX_Xavier: thermal_zone1
            #     Xavier_NX:  thermal_zone1
            # NOTE: this may change in subsequent/previous submission models.
            try:
                out_text = run_command("cat /sys/devices/virtual/thermal/thermal_zone1/temp",
                                       get_output=True, tee=False)
                # The temperature is in units of milli-degC, so scale the result:
                temps = [int(str_temp) / 1000 for str_temp in out_text]
            except Exception as e:
                print("Bad temp reading")
                raise e
        else:
            # Non-Xavier branch
            try:
                out_text = run_command("nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader",
                                       get_output=True, tee=False)
                # Multi-GPU instances return a list of strings, one temperature per GPU core:
                temps = [int(str_temp) for str_temp in out_text]
            except Exception as e:
                print("Bad temp reading")
                raise e
        return temps

    @classmethod
    def logged_temp_wait(cls, temp, timeout=None):
        """
        Spinwait on GPU temperature with optional timeout. Returns last measured temperature.
        For multi-GPU systems, we use the mean temperature.
        """
        start_time = time.perf_counter()
        if timeout is None:
            # No timeout requested; wait indefinitely.
            timeout = float("inf")
        # Poll with timeout
        succ = False
        mean_temp = None
        while (time.perf_counter() - start_time) < timeout:
            temps = cls._get_core_temps()
            mean_temp = sum(temps) / len(temps)
            if mean_temp <= temp:
                print("GPU has finished cooling")
                succ = True
                break
            print(mean_temp)
            time.sleep(2)
        if not succ:
            print("GPU failed to fully cool")
        return mean_temp
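# Minimal usage sketch (hypothetical thresholds, not from the source): block until the GPUs
# cool to a mean of 55 degC, giving up after 10 minutes, then report the last measurement.
final_temp = Temperature.logged_temp_wait(55, timeout=600)
print("Proceeding at mean temperature {:.1f} degC".format(final_temp))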
def __init__(self, args):
    workspace_size = dict_get(args, "workspace_size", default=(5 << 30))
    logging.info("Using workspace size: {:,}".format(workspace_size))
    super().__init__(args, name=BENCHMARKS.BERT, workspace_size=workspace_size)

    self.bert_config_path = "code/bert/tensorrt/bert_config.json"
    self.seq_len = 384  # default sequence length
    self.batch_size = dict_get(args, "batch_size", default=1)

    self.num_profiles = 1
    if 'gpu_inference_streams' in args:
        # Use gpu_inference_streams to determine the number of duplicated profiles
        # in the engine when not using lwis mode
        self.num_profiles = args['gpu_inference_streams']

    self.is_int8 = args['precision'] == 'int8'

    if self.is_int8:
        self.model_path = dict_get(
            args, "model_path", default="build/models/bert/bert_large_v1_1_fake_quant.onnx")
    else:
        self.model_path = dict_get(
            args, "model_path", default="build/models/bert/bert_large_v1_1.onnx")

    self.bert_config = BertConfig(self.bert_config_path)

    self.enable_interleaved = False
    if self.is_int8 and 'enable_interleaved' in args:
        self.enable_interleaved = args['enable_interleaved']

    # Small-Tile GEMM Plugin
    # Since it doesn't support interleaved format, the two options are mutually exclusive
    self.use_small_tile_gemm_plugin = self.args.get("use_small_tile_gemm_plugin", False)
    self.gemm_plugin_fairshare_cache_size = self.args.get("gemm_plugin_fairshare_cache_size", -1)
    if self.enable_interleaved and self.use_small_tile_gemm_plugin:
        assert False, "Small-Tile GEMM Plugin doesn't support interleaved format."

    # Query system id for architecture
    self.system = get_system()
    self.gpu_arch = self.system.arch

    if self.batch_size > 512:
        # Tactic selection is limited at very large batch sizes
        self.builder_config.max_workspace_size = 7 << 30
    if 'nx' in self.system.gpu.lower():
        # Use 1GB only for Xavier NX
        self.builder_config.max_workspace_size = 1 << 30
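# Illustrative sketch: a hypothetical args dict for this constructor (the values are examples,
# not taken from any shipped config). With precision 'int8' and gpu_inference_streams=2, the
# constructor above selects the fake-quant ONNX model and builds two optimization profiles.
example_args = {
    "batch_size": 64,
    "precision": "int8",
    "gpu_inference_streams": 2,
    "workspace_size": 5 << 30,          # 5 GiB, same as the default above
    "enable_interleaved": False,        # must stay False if the Small-Tile GEMM plugin is enabled
    "use_small_tile_gemm_plugin": False,
}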
def initialize(self):
    """
    Parse input ONNX file to a TRT network. Apply layer optimizations and fusion plugins on network.
    """
    # Query system id for architecture
    self.system = get_system()
    self.gpu_arch = self.system.arch

    # Create network.
    self.network = self.builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))

    # Parse from onnx file.
    parser = trt.OnnxParser(self.network, self.logger)

    rn50_gs = RN50GraphSurgeon(self.model_path,
                               self.gpu_arch,
                               self.device_type,
                               self.precision,
                               self.cache_file,
                               self.need_calibration)
    model = rn50_gs.process_onnx()

    success = parser.parse(onnx._serialize(model))
    if not success:
        raise RuntimeError(
            "ResNet50 onnx model processing failed! Error: {:}".format(
                parser.get_error(0).desc()))

    # Unmark topk_layer_output_value, just leaving topk_layer_output_index
    assert self.network.num_outputs == 2, "Two outputs expected"
    assert self.network.get_output(0).name == "topk_layer_output_value",\
        "unexpected tensor: {}".format(self.network.get_output(0).name)
    assert self.network.get_output(1).name == "topk_layer_output_index",\
        "unexpected tensor: {}".format(self.network.get_output(1).name)
    logging.info("Unmarking output: {:}".format(self.network.get_output(0).name))
    self.network.unmark_output(self.network.get_output(0))

    # Set input dtype and format
    input_tensor = self.network.get_input(0)
    if self.input_dtype == "int8":
        input_tensor.dtype = trt.int8
        input_tensor.dynamic_range = (-128, 127)
    if self.input_format == "linear":
        input_tensor.allowed_formats = 1 << int(trt.TensorFormat.LINEAR)
    elif self.input_format == "chw4":
        input_tensor.allowed_formats = 1 << int(trt.TensorFormat.CHW4)

    self.initialized = True
def logged_temp_wait(cls, temp, timeout=None):
    """
    Spinwait on GPU temperature with optional timeout. Returns last measured temperature.
    For multi-GPU systems, we use the mean temperature.
    """
    if cls.system is None:
        cls.system = get_system()
    start_time = time.perf_counter()
    if timeout is None:
        # No timeout requested; wait indefinitely.
        timeout = float("inf")
    # Poll with timeout
    succ = False
    mean_temp = None
    while (time.perf_counter() - start_time) < timeout:
        temps = cls._get_core_temps()
        mean_temp = sum(temps) / len(temps)
        if mean_temp <= temp:
            print("GPU has finished cooling")
            succ = True
            break
        print(mean_temp)
        time.sleep(2)
    if not succ:
        print("GPU failed to fully cool")
    return mean_temp
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
sys.path.insert(0, os.getcwd())

import argparse

from code.common import get_system

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--gpu_architecture",
        help="Print GPU architecture class instead of system id",
        action="store_true"
    )
    args = parser.parse_args()
    system = get_system()
    if args.gpu_architecture:
        print(system.arch.name)
    else:
        print(system.get_id())
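# Usage sketch (hedged): the on-disk name of this script is not shown above, so the path below
# is an assumption purely for illustration; the example outputs are hypothetical as well.
#   python3 scripts/get_system_id.py                      # prints the system id from system.get_id()
#   python3 scripts/get_system_id.py --gpu_architecture   # prints the architecture class name, e.g. "Ampere"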
def initialize(self):
    """
    Parse input ONNX file to a TRT network. Apply layer optimizations and fusion plugins on network.
    """
    # Query system id for architecture
    self.system = get_system()
    self.gpu_arch = self.system.arch

    # Create network.
    self.network = self.builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))

    # Parse from onnx file.
    parser = trt.OnnxParser(self.network, self.logger)
    with open(self.model_path, "rb") as f:
        model = f.read()
    success = parser.parse(model)
    if not success:
        raise RuntimeError(
            "ofa_autosinian onnx model processing failed! Error: {:}".format(
                parser.get_error(0).desc()))

    # Set input dtype and format
    input_tensor = self.network.get_input(0)
    if self.input_dtype == "int8":
        input_tensor.dtype = trt.int8
        scale = struct.unpack('!f', bytes.fromhex('3caa5293'))[0]
        input_tensor.dynamic_range = (-scale * 127.0, scale * 127.0)
    if self.input_format == "linear":
        input_tensor.allowed_formats = 1 << int(trt.TensorFormat.LINEAR)
    elif self.input_format == "chw4":
        input_tensor.allowed_formats = 1 << int(trt.TensorFormat.CHW4)

    # Get the layers we care about.
    nb_layers = self.network.num_layers

    while self.network.num_outputs > 0:
        logging.info("Unmarking output: {:}".format(self.network.get_output(0).name))
        self.network.unmark_output(self.network.get_output(0))

    # Add top-k
    last_fc_layer = self.network.get_layer(nb_layers - 1)
    topk_layer = self.network.add_topk(last_fc_layer.get_output(0), trt.TopKOperation.MAX, 1, 2)
    topk_layer.name = "topk_layer"
    topk_layer.get_output(0).name = "topk_layer_output_value"
    topk_layer.get_output(1).name = "topk_layer_output_index"
    self.network.mark_output(topk_layer.get_output(1))

    if self.network.num_outputs != 1:
        logging.warning(
            "num outputs should be 1 after unmarking! Has {:}".format(
                self.network.num_outputs))
        raise Exception

    if self.precision == "int8" and self.batch_size > 1 and (not self.need_calibration):
        self.autosinian_optimize()

    self.initialized = True
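# Side note on the hard-coded scale above: '3caa5293' is the big-endian IEEE-754 encoding of
# roughly 0.0208, so the int8 dynamic range applied to the input tensor is about (-2.64, 2.64).
# Standalone check (re-imports struct so it runs on its own):
import struct
scale = struct.unpack('!f', bytes.fromhex('3caa5293'))[0]
print(scale)                              # ~0.020792
print(-scale * 127.0, scale * 127.0)      # ~(-2.64, 2.64)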
def __init__(self, bench, scen, config_dict, config_funcs=None):
    """
    Construct a ConfigGrid

    Args:
        bench (str): The benchmark requested (fuzzy match behavior using BENCHMARKS.alias)
        scen (str): The scenario requested (fuzzy match behavior using SCENARIOS.alias)
        config_dict (Dict[str, List]): A config dictionary. Refer to 'Config Schema' in the README for format
        config_funcs (Dict[str, Callable]): A dictionary of META* functions. Refer to 'Config Schema' in the README for requirements.
    """
    if args.spoof_system_id:
        self.system_id = args.spoof_system_id
    else:
        self.system = get_system()
        self.system_id = self.system.get_id()

    self.benchmark = BENCHMARKS.alias(bench)
    self.scenario = SCENARIOS.alias(scen)
    candidate_configs = find_config_files(benchmarks=[self.benchmark],
                                          scenarios=[self.scenario])
    configs = load_configs(candidate_configs)
    assert len(configs) == 1

    # To work with "extends" and "scales", we need to call into another config helper:
    self.base_config = get_system_benchmark_config(configs[0], self.system_id)
    self.default_config = configs[0]['default']
    griddict = config_dict
    self.no_rebuild_params = None
    # No-op
    self.is_config_valid = lambda x: True
    # No-op
    self.search_callback = None
    self.replay = None
    funcs_processed = set()
    if config_funcs:
        if config_funcs.get("META_search_callback"):
            funcs_processed.add("META_search_callback")
            self.search_callback = config_funcs['META_search_callback']
            if not args.use_cached:
                raise RuntimeError(f"META_search_callback must be used with --use_cached for reproducibility.")
        if config_funcs.get("META_get_no_rebuild_params"):
            funcs_processed.add("META_get_no_rebuild_params")
            norebuild_params = config_funcs.get("META_get_no_rebuild_params")()
            assert isinstance(norebuild_params, list)
            # Make sure these keys all exist in our grid params:
            # But we might not know grid params if a search_callback is being used:
            if self.search_callback is None:
                missing_keys = set(norebuild_params) - set(griddict.keys())
                if len(missing_keys) > 0:
                    raise RuntimeError(f"The keys: {missing_keys} were mentioned in META_get_no_rebuild_params, but are not a specified parameter in:\n{griddict.keys()}")
            else:
                print("WARNING: Not checking get_no_rebuild_params against grid parameters, be careful")
            # For use later, we're gonna turn this into a set:
            self.no_rebuild_params = set(norebuild_params)
        if config_funcs.get("META_is_config_valid"):
            funcs_processed.add("META_is_config_valid")
            self.is_config_valid = config_funcs["META_is_config_valid"]
        # Make sure we aren't scanning other params:
        # Other META handling goes here
        unmatched_funcs = set(config_funcs.keys()) - funcs_processed
        if len(unmatched_funcs) > 0:
            raise RuntimeError(f"Found the following META functions which haven't been implemented, refer to README for proper naming {unmatched_funcs}")

    # Make sure all keys we want to set exist in our config:
    if not args.no_check_keys:
        for grid_key in griddict.keys():
            if grid_key not in self.base_config:
                print(f"{grid_key} not found in base config")
                print(f"{self.base_config}")
                assert False
    # Make sure all values are non-empty lists of something that isn't a list or a dict
    # TODO: expand this to something reasonable to help META_bisect
    if "META_search_callback" in funcs_processed:
        print("WARNING: Skipping parameter validation because META_search_callback was provided")
    else:
        for val in griddict.values():
            assert isinstance(val, list)
            # assert len(val) >= 1
            assert all(not isinstance(el, list) and not isinstance(el, dict) for el in val)
    self.grid = griddict
def __init__(self, bench, scen, config_dict, config_funcs=None):
    """
    Construct a ConfigGrid

    Args:
        bench (str): The benchmark requested (fuzzy match behavior using BENCHMARKS.alias)
        scen (str): The scenario requested (fuzzy match behavior using SCENARIOS.alias)
        config_dict (Dict[str, List]): A config dictionary. Refer to 'Config Schema' in the README for format
        config_funcs (Dict[str, Callable]): A dictionary of META* functions. Refer to 'Config Schema' in the README for requirements.
    """
    self.system = get_system()
    self.system_id = self.system.get_id()
    self.benchmark = BENCHMARKS.alias(bench)
    self.scenario = SCENARIOS.alias(scen)
    candidate_configs = find_config_files(benchmarks=[self.benchmark],
                                          scenarios=[self.scenario])
    configs = load_configs(candidate_configs)
    assert len(configs) == 1

    # To work with "extends" and "scales", we need to call into another config helper:
    self.base_config = get_system_benchmark_config(configs[0], self.system_id)
    self.default_config = configs[0]['default']
    griddict = config_dict
    self.no_rebuild_params = None
    # No-op
    self.is_config_valid = lambda x: True
    funcs_processed = set()
    if config_funcs:
        if config_funcs.get("META_get_no_rebuild_params"):
            funcs_processed.add("META_get_no_rebuild_params")
            norebuild_params = config_funcs.get("META_get_no_rebuild_params")()
            assert isinstance(norebuild_params, list)
            # Make sure these keys all exist in our grid params:
            missing_keys = set(norebuild_params) - set(griddict.keys())
            if len(missing_keys) > 0:
                raise RuntimeError(
                    f"The keys: {missing_keys} were mentioned in META_get_no_rebuild_params, but are not a specified parameter in:\n{griddict.keys()}"
                )
            # For use later, we're gonna turn this into a set:
            self.no_rebuild_params = set(norebuild_params)
        if config_funcs.get("META_is_config_valid"):
            funcs_processed.add("META_is_config_valid")
            self.is_config_valid = config_funcs["META_is_config_valid"]
        # Other META handling goes here
        unmatched_funcs = set(config_funcs.keys()) - funcs_processed
        if len(unmatched_funcs) > 0:
            raise RuntimeError(
                f"Found the following META functions which haven't been implemented, refer to README for proper naming {unmatched_funcs}"
            )

    # Make sure all keys we want to set exist in our config:
    for grid_key in griddict.keys():
        if grid_key not in self.base_config:
            print(f"{grid_key} not found in base config")
            print(f"{self.base_config}")
            assert False
    # Make sure all values are non-empty lists of something that isn't a list or a dict
    for val in griddict.values():
        assert isinstance(val, list)
        assert len(val) >= 1
        assert all(not isinstance(el, list) and not isinstance(el, dict) for el in val)
    self.grid = griddict
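# Construction sketch (hedged): the benchmark/scenario names, grid keys, and the exact shape of
# the dict passed to META_is_config_valid are assumptions for illustration; a real run also needs
# matching config files on disk for find_config_files()/load_configs() to succeed.
grid = ConfigGrid(
    "bert",                                   # fuzzy-matched via BENCHMARKS.alias
    "offline",                                # fuzzy-matched via SCENARIOS.alias
    {
        "gpu_batch_size": [1024, 2048],       # each key must already exist in the base config
        "gpu_copy_streams": [1, 2],
    },
    config_funcs={
        # Prune one combination; every other grid point is kept.
        "META_is_config_valid": lambda cfg: not (cfg["gpu_batch_size"] == 2048 and cfg["gpu_copy_streams"] == 1),
    },
)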
def initialize(self):
    """Create DLRM network using TRT API and plugins and set the weights."""
    useConvForFC_bottom = (self.precision == "int8")
    useConvForFC_top = (self.precision == "int8")
    interactionsOutputInterleaved = False if self.need_calibration or self.input_dtype != "int8" else True

    # Turn off interleaved format if top_mlp uses non-interleaved format
    if not self.enable_interleaved_top_mlp:
        interactionsOutputInterleaved = False
    else:
        print("Using batch-interleaved format for top_mlp.")

    # Check if we should split the model into the binary file with embedding weights quantized and the model without embeddings
    if not (os.path.isfile(self.embedding_weights_binary_filepath)
            and os.path.isfile(self.model_without_embedding_weights_filepath)):
        logging.info("Loading checkpoint from " + self.model_filepath)
        self.weights = torch.load(self.model_filepath, map_location="cpu")["state_dict"]
        self.dump_embedding_weights_to_binary_file()
        logging.info("Writing model without embedding weights to " + self.model_without_embedding_weights_filepath)
        torch.save(self.weights, self.model_without_embedding_weights_filepath)
        del self.weights

    # Dump row frequencies to file in binary format
    if self.use_row_frequencies and not os.path.isfile(self.row_frequencies_binary_filepath):
        logging.info("Writing row frequencies to " + self.row_frequencies_binary_filepath)
        self.dump_row_frequencies_to_binary_file()

    # Load weights
    self.weights = torch.load(self.model_without_embedding_weights_filepath, map_location="cpu")

    # Create network.
    self.network = self.builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))

    # Numerical input
    numerical_input = self.network.add_input("numerical_input", trt.DataType.FLOAT,
                                             (-1, self.num_numerical_inputs, 1, 1))
    if not self.need_calibration:
        if self.input_dtype == "int8":
            numerical_input.dtype = trt.int8
        elif self.input_dtype == "fp16":
            numerical_input.dtype = trt.float16
        if self.input_format == "linear":
            numerical_input.allowed_formats = 1 << int(trt.TensorFormat.LINEAR)
        elif self.input_format == "chw4":
            numerical_input.allowed_formats = 1 << int(trt.TensorFormat.CHW4)
        elif self.input_format == "chw32":
            numerical_input.allowed_formats = 1 << int(trt.TensorFormat.CHW32)

    # Bottom MLP
    if self.need_calibration or self.input_dtype != "int8":
        bottom_mlp = self.add_mlp(numerical_input, self.num_numerical_inputs, self.bottom_mlp_channels,
                                  self.bottom_mlp_names, last_relu=True, useConvForFC=useConvForFC_bottom)
    else:
        bottom_mlp_plugin, output_tensor_name = self.add_fused_bottom_mlp("DLRM_BOTTOM_MLP_TRT", numerical_input,
                                                                          self.num_numerical_inputs,
                                                                          self.bottom_mlp_channels,
                                                                          self.bottom_mlp_names)
        bottom_mlp = self.network.add_plugin_v2([numerical_input], bottom_mlp_plugin)
        bottom_mlp.get_output(0).name = output_tensor_name
    bottom_mlp_shuffle = self.network.add_shuffle(bottom_mlp.get_output(0))
    bottom_mlp_shuffle.reshape_dims = trt.Dims((-1, 1, self.embedding_size))

    # Index input
    index_input = self.network.add_input("index_input", trt.DataType.INT32, (-1, self.num_features))

    # Embedding lookup and interactions
    dlrm_interactions_plugin = self.get_dlrm_interactions_plugin(
        "DLRM_INTERACTIONS_TRT",
        np.cumsum(np.array([0] + self.embedding_rows[:-1]).astype(np.int32)).astype(np.int32),
        interactionsOutputInterleaved)
    interaction_output_concat = self.network.add_plugin_v2([bottom_mlp.get_output(0), index_input],
                                                           dlrm_interactions_plugin)
    interaction_output_concat.name = "interaction_plugin"
    interaction_output_concat.get_output(0).name = "interaction_output_concat_output"

    if self.enable_interleaved_top_mlp and not interactionsOutputInterleaved:
        # Shuffle from [BS, C, 1, 1] to [BS//2, C, 2, 1] before top_mlp
        interleave_pre_top_mlp = self.network.add_shuffle(interaction_output_concat.get_output(0))
        interleave_pre_top_mlp.reshape_dims = trt.Dims((-1, 2, interaction_output_concat.get_output(0).shape[1], 0))
        interleave_pre_top_mlp.second_transpose = trt.Permutation([0, 2, 1, 3])
        interleave_pre_top_mlp.name = "interleave_pre_top_mlp"

        top_mlp_input = interleave_pre_top_mlp.get_output(0)
        top_mlp_input.name = "interleave_pre_top_mlp"
    else:
        top_mlp_input = interaction_output_concat.get_output(0)

    # Insert small-tile GEMM plugin. The plugin supports Ampere only.
    gpu_arch = get_system().arch
    system_id = get_system().gpu
    if self.use_small_tile_gemm_plugin:
        if gpu_arch != Architecture.Ampere:
            print("Small-Tile GEMM plugin does not support {}. Plugin disabled.".format(system_id))
            self.use_small_tile_gemm_plugin = False

    # Enabling the GEMM plugin with the interleaved format is not recommended.
    # Note (2/7/21): GEMM plugin doesn't perform well when H*W > 1
    if self.use_small_tile_gemm_plugin and self.enable_interleaved_top_mlp:
        print("Warning: small-Tile GEMM plugin performance will be "
              "significantly impacted by interleaved format. Turn off "
              "interleaved format for the best performance")

    tmp_mlp_input = top_mlp_input
    tmp_input_size = self.top_mlp_input_size

    # Helper function to check whether the provided shape is supported by
    # the Small-Tile GEMM plugin
    def support_small_tile_gemm_func(C, K):
        return (C >= 256) and (C <= 1280) and (C % 128 == 0) and (K % 128 == 0)

    # Split the top_mlp layers, and use the GEMM plugin for top_mlp.2/4/6
    # C, K for top_mlp.0,2,4,6,8: [480,1024],[1024,1024],[1024,512],[512,256],[256,1]
    for i in range(len(self.top_mlp_channels)):
        # Insert the plugin if the layer meets the restriction
        if support_small_tile_gemm_func(tmp_input_size, self.top_mlp_channels[i]) and \
                self.use_small_tile_gemm_plugin:
            print("Replacing {} with Small-Tile GEMM Plugin, with fairshare cache size {}".format(
                self.top_mlp_names[i], self.gemm_plugin_fairshare_cache_size))
            layer_top_mlp = self.add_small_tile_gemm_top_mlp(
                tmp_mlp_input, tmp_input_size,
                self.top_mlp_channels[i], self.top_mlp_names[i],
                self.gemm_plugin_fairshare_cache_size
            )
        else:
            layer_top_mlp = self.add_single_mlp(
                tmp_mlp_input, tmp_input_size,
                self.top_mlp_channels[i], self.top_mlp_names[i],
                useConvForFC=useConvForFC_top,
                add_relu=(i != len(self.top_mlp_channels) - 1))

        tmp_mlp_input = layer_top_mlp.get_output(0)
        tmp_input_size = self.top_mlp_channels[i]

    top_mlp = layer_top_mlp

    if self.enable_interleaved_top_mlp:
        # Shuffle [BS//2, 1, 2, 1] back to [BS, 1, 1, 1]
        interleave_post_top_mlp = self.network.add_shuffle(top_mlp.get_output(0))
        interleave_post_top_mlp.reshape_dims = trt.Dims((-1, 0, 1, 0))
        interleave_post_top_mlp.name = "interleave_post_top_mlp"

        sigmoid_input = interleave_post_top_mlp.get_output(0)
        sigmoid_input.name = "interleave_post_top_mlp"
    else:
        sigmoid_input = top_mlp.get_output(0)

    # Sigmoid
    sigmoid_layer = self.network.add_activation(sigmoid_input, trt.ActivationType.SIGMOID)
    sigmoid_layer.name = "sigmoid"
    sigmoid_layer.get_output(0).name = "sigmoid_output"

    # Output
    self.network.mark_output(sigmoid_layer.get_output(0))

    # Make sure we release the memory to the system
    del self.weights

    self.initialized = True
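# Sanity sketch of the shape check above, using the (C, K) pairs listed in the comment for
# top_mlp.0,2,4,6,8. Only the three middle layers qualify, matching the "plugin for 2/4/6" note.
def _support_small_tile_gemm(C, K):
    return (C >= 256) and (C <= 1280) and (C % 128 == 0) and (K % 128 == 0)

for C, K in [(480, 1024), (1024, 1024), (1024, 512), (512, 256), (256, 1)]:
    print((C, K), _support_small_tile_gemm(C, K))
# (480, 1024) False; (1024, 1024) True; (1024, 512) True; (512, 256) True; (256, 1) False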