def accelerator_split_module_inputs(
    trace: torch.jit.ScriptFunction, export_options: ExportConfig
):
    import torch_glow

    seq_padding_control, batch_padding_control = get_seq_and_batch_padding_control(
        trace, export_options
    )

    input_examples = []
    for seq_len in seq_padding_control:
        if seq_len <= 0:
            continue
        for batch_size in batch_padding_control:
            if batch_size <= 0:
                continue
            input1 = torch.randint(3, (batch_size, seq_len))
            input2 = torch.randint(3, (batch_size, seq_len))
            input3 = torch.rand(batch_size, seq_len).bool()
            input_specs = torch_glow.input_specs_from_tensors([input1, input2, input3])
            input_examples.append(input_specs)

    return input_examples

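# `get_seq_and_batch_padding_control` is not defined in this section. Below is
# a minimal sketch inferred from the inline padding-control logic of the
# `accelerator_transformerLayers_inputs` variant further down; the real helper
# may differ in detail.
def get_seq_and_batch_padding_control(trace, export_options):
    # Fall back to a default config when none is given (assumption).
    if export_options is None:
        export_options = ExportConfig()
    if export_options.seq_padding_control is None:
        raise RuntimeError("seq padding control not specified")
    if export_options.batch_padding_control is None:
        raise RuntimeError("batch padding control not specified")

    batch_padding_control = export_options.batch_padding_control

    # Clamp sequence padding sizes to the trace's maximum sequence length.
    max_seq_len = trace.get_max_seq_len()
    seq_padding_control = [
        pad for pad in export_options.seq_padding_control if pad < max_seq_len
    ]
    seq_padding_control.append(max_seq_len)

    return seq_padding_control, batch_padding_control
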
def test_save_preprocessed_module(self):
    with torch.no_grad():
        x = torch.randn([1, 4, 4, 4], dtype=torch.float32)
        model = Bar()
        model.eval()
        model = torch.jit.trace(model, x)

        spec = torch_glow.CompilationSpec()
        spec.get_settings().set_glow_backend("Interpreter")

        compilation_group = torch_glow.CompilationGroup()
        spec.compilation_groups_append(compilation_group)
        compilation_group.input_sets_append(
            torch_glow.input_specs_from_tensors([x])
        )

        torch_glow.disableFusionPass()
        torch_glow.enable_convert_to_fp16()

        glow_mod = torch_glow.to_glow(model, spec)
        reloaded = utils.save_and_reload_model(glow_mod)

        wrappername = "__loweredModule__"
        attrname = "__processed_module"
        wp = getattr(reloaded._c, wrappername)
        pp = getattr(wp, attrname)
        pt_model = torch.jit._recursive.wrap_cpp_module(pp)

        graph = pt_model.graph_for(x)
        found = False
        for node in graph.nodes():
            if node.kind() == "quantized::conv2d":
                found = True

        assert found

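# `utils.save_and_reload_model` is not shown in this section. A minimal sketch
# of what it presumably does (round-trip the module through torch.jit
# save/load), assuming an in-memory buffer rather than a file on disk:
import io

def save_and_reload_model(model):
    buffer = io.BytesIO()
    torch.jit.save(model, buffer)
    buffer.seek(0)
    return torch.jit.load(buffer)
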
def accelerator_transformerLayers_inputs(
    model: nn.Module,
    trace: torch.jit.ScriptFunction,
    export_options: ExportConfig,
    dataset_iterable: Iterable,
    module_path,
):
    import torch_glow

    seq_padding_control, batch_padding_control = get_seq_and_batch_padding_control(
        trace, export_options
    )

    # this should use a method, or module_path, instead of being hardcoded
    # embedding_dim = model.encoder.encoder.transformer.token_embedding.embedding_dim
    embedding_dim = accelerator.get_embedding_module_from_path(model, module_path)

    input_examples = []
    for seq_len in seq_padding_control:
        if seq_len <= 0:
            continue
        for batch_size in batch_padding_control:
            if batch_size <= 0:
                continue
            # TODO: generate inputs from dataset_iterable instead of random data
            input1 = torch.randn(
                [seq_len, batch_size, embedding_dim], dtype=torch.float32
            )
            input2 = torch.randn([batch_size, seq_len]).bool()
            input_specs = torch_glow.input_specs_from_tensors([input1, input2])
            input_examples.append(input_specs)

    return input_examples

def test_to_glow_multiple_groups_and_input_sets(self):
    x1 = torch.randn(1, 4)
    y1 = torch.randn(2, 4)
    x2 = torch.randn(1, 2)
    y2 = torch.randn(5, 2)
    x3 = torch.randn(7)
    y3 = torch.randn(3, 7)

    mod = Foo()
    scripted_mod = torch.jit.script(mod)

    x1_y1_set = torch_glow.input_specs_from_tensors([x1, y1])
    x2_y2_set = torch_glow.input_specs_from_tensors([x2, y2])
    x3_y3_set = torch_glow.input_specs_from_tensors([x3, y3])

    # Create two CompilationGroups: the first contains two input sets
    # and the second contains the third input set
    spec = torch_glow.CompilationSpec()
    spec.get_settings().set_glow_backend("Interpreter")

    compilation_group_1 = torch_glow.CompilationGroup()
    compilation_group_2 = torch_glow.CompilationGroup()

    spec.compilation_groups_append(compilation_group_1)
    spec.compilation_groups_append(compilation_group_2)

    compilation_group_1.input_sets_append(x1_y1_set)
    compilation_group_1.input_sets_append(x2_y2_set)
    compilation_group_2.input_sets_append(x3_y3_set)

    lowered_module = torch_glow.to_glow(scripted_mod, spec)

    torch_res1 = mod(x1, y1)
    torch_res2 = mod(x2, y2)
    torch_res3 = mod(x3, y3)

    glow_res1 = lowered_module(x1, y1)
    glow_res2 = lowered_module(x2, y2)
    glow_res3 = lowered_module(x3, y3)

    assert torch.allclose(torch_res1, glow_res1)
    assert torch.allclose(torch_res2, glow_res2)
    assert torch.allclose(torch_res3, glow_res3)

def get_compilation_spec(inputs):
    """Helper function to build the compilation spec for the submodule."""
    spec = torch_glow.CompilationSpec()
    spec.get_settings().set_glow_backend("Interpreter")

    compilation_group = torch_glow.CompilationGroup()
    spec.compilation_groups_append(compilation_group)
    compilation_group.input_sets_append(torch_glow.input_specs_from_tensors(inputs))
    return spec

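# A minimal usage sketch for `get_compilation_spec`, reusing the `Foo` module
# from the test above; the example tensors are illustrative placeholders.
mod = torch.jit.script(Foo())
x, y = torch.randn(1, 4), torch.randn(2, 4)
spec = get_compilation_spec([x, y])
lowered = torch_glow.to_glow(mod, spec)
out = lowered(x, y)
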
def accelerator_transformerLayers_inputs(
    model: nn.Module,
    trace: torch.jit.ScriptFunction,
    export_options: ExportConfig,
    dataset_iterable: Iterable,
    module_path,
):
    import torch_glow

    # we use the padding control from the Export Config:
    if export_options is None:
        export_options = ExportConfig()
    if export_options.seq_padding_control is None:
        raise RuntimeError("seq padding control not specified")
    if export_options.batch_padding_control is None:
        raise RuntimeError("batch padding control not specified")

    batch_padding_control = export_options.batch_padding_control

    # Restrict seq_padding_control to valid ranges
    seq_padding_control = []
    max_seq_len = trace.get_max_seq_len()
    for pad in export_options.seq_padding_control:
        if pad < max_seq_len:
            seq_padding_control.append(pad)
    seq_padding_control.append(max_seq_len)

    # this should use a method, or module_path, instead of being hardcoded
    # embedding_dim = model.encoder.encoder.transformer.token_embedding.embedding_dim
    embedding_dim = accelerator.get_embedding_module_from_path(model, module_path)

    input_examples = []
    for seq_len in seq_padding_control:
        if seq_len <= 0:
            continue
        for batch_size in batch_padding_control:
            if batch_size <= 0:
                continue
            # TODO: generate inputs from dataset_iterable instead of random data
            input1 = torch.randn(
                [seq_len, batch_size, embedding_dim], dtype=torch.float32
            )
            input2 = torch.randn([batch_size, seq_len]).bool()
            input_specs = torch_glow.input_specs_from_tensors([input1, input2])
            input_examples.append(input_specs)

    return input_examples

def build_compiliation_spec(self):
    compilation_spec = torch_glow.CompilationSpec()

    compilation_spec_settings = compilation_spec.get_settings()
    compilation_spec_settings.set_glow_backend("CPU")
    compilation_spec_settings.set_enable_fuser(True)

    fuser_settings = compilation_spec.get_fuser_settings()
    fuser_settings.set_min_fusion_group_size(3)
    fuser_settings.set_max_fusion_merge_size(4)
    fuser_settings.set_fusion_start_index(5)
    fuser_settings.set_fusion_end_index(6)
    fuser_settings.op_blacklist_append("aten::mean")
    fuser_settings.op_blacklist_append("aten::dropout")

    compilation_group = torch_glow.CompilationGroup()

    input1_spec = torch_glow.input_spec_from_tensor(torch.randn(2, 3, 224, 224))
    input2_spec = torch_glow.input_spec_from_tensor(
        torch.randn(3, 2).to(torch.float16)
    )
    compilation_group.input_sets_append([input1_spec, input2_spec])
    compilation_group.input_sets_append(
        torch_glow.input_specs_from_tensors(
            [torch.randn(1, 3, 224, 224), torch.randn(4, 1)]
        )
    )

    compilation_group_settings = compilation_group.get_settings()
    compilation_group_settings.set_convert_to_fp16(True)
    compilation_group_settings.set_num_devices_to_use(50)
    compilation_group_settings.set_replication_count(52)
    compilation_group_settings.backend_specific_opts_insert("apple", "orange")

    compilation_spec.compilation_groups_append(compilation_group)

    default_compilation_group_settings = (
        compilation_spec.get_default_compilation_group_settings()
    )
    default_compilation_group_settings.set_convert_to_fp16(False)
    default_compilation_group_settings.set_num_devices_to_use(89)
    default_compilation_group_settings.set_replication_count(90)
    default_compilation_group_settings.backend_specific_opts_insert(
        "hello", "goodbye"
    )

    return compilation_spec

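# A hedged usage sketch for the spec built above: compile a traced module
# against it with `to_glow`. `TwoInputModel` is a hypothetical module whose
# forward takes the two inputs described by the first input set; the real
# test may only serialize or inspect the spec rather than compile with it.
def compile_with_spec(self):
    a = torch.randn(2, 3, 224, 224)
    b = torch.randn(3, 2).to(torch.float16)
    traced = torch.jit.trace(TwoInputModel(), (a, b))  # hypothetical module
    return torch_glow.to_glow(traced, self.build_compiliation_spec())
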
def accelerator_lstm_inputs(
    model: nn.Module,
    trace: torch.jit.ScriptFunction,
    export_options: ExportConfig,
    dataset_iterable: Iterable,
    module_path,
):
    import torch_glow

    # we use the padding control from the Export Config:
    if export_options is None:
        export_options = ExportConfig()
    if export_options.seq_padding_control is None:
        raise RuntimeError("seq padding control not specified")
    if export_options.batch_padding_control is None:
        raise RuntimeError("batch padding control not specified")

    batch_padding_control = export_options.batch_padding_control
    seq_padding_control = export_options.seq_padding_control

    embedding_dim = trace.embedding.word_embedding.embedding_dim * 2
    lstm_num_layers = trace.lstm_num_layers
    lstm_dim = trace.lstm_dim

    input_examples = []
    for seq_len in seq_padding_control:
        if seq_len <= 0:
            continue
        for batch_size in batch_padding_control:
            if batch_size <= 0:
                continue
            # TODO: generate inputs from dataset_iterable instead of random data
            input_embedding = torch.randn(
                [batch_size, seq_len, embedding_dim], dtype=torch.float32
            )
            input_hidden = torch.randn(
                [batch_size, lstm_num_layers, lstm_dim], dtype=torch.float32
            )
            input_cell = torch.randn(
                [batch_size, lstm_num_layers, lstm_dim], dtype=torch.float32
            )
            input_specs = torch_glow.input_specs_from_tensors(
                [input_embedding, input_hidden, input_cell]
            )
            input_examples.append(input_specs)

    return input_examples

def lower_modules_to_accelerator(
    model, trace, seq_padding_control, batch_padding_control
):
    import torch_glow

    if hasattr(model, "encoder") and isinstance(model.encoder, RoBERTaEncoder):
        backend = "NNPI"
        submod_modelpath, compilation_spec_dict = accelerator.get_modules(
            model, backend
        )[0]
        submod_tracepath = accelerator.model2trace_path(submod_modelpath)
        embedding_dim = model.encoder.encoder.transformer.token_embedding.embedding_dim

        spec = torch_glow.CompilationSpec()
        spec.get_settings().set_glow_backend(backend)

        compilation_group = torch_glow.CompilationGroup()
        spec.compilation_groups_append(compilation_group)

        compilation_group_settings = compilation_group.get_settings()
        compilation_group_settings.set_convert_to_fp16(True)
        for k, v in compilation_spec_dict.items():
            compilation_group_settings.backend_specific_opts_insert(k, v)

        for seq_len in seq_padding_control:
            if seq_len <= 0:
                continue
            for batch_size in batch_padding_control:
                if batch_size <= 0:
                    continue
                input1 = torch.randn(
                    [seq_len, batch_size, embedding_dim], dtype=torch.float32
                )
                input2 = torch.randn([batch_size, seq_len]).bool()
                input_specs = torch_glow.input_specs_from_tensors([input1, input2])
                compilation_group.input_sets_append(input_specs)

        trace = torch_glow.to_glow_selective(
            trace,
            {submod_tracepath: spec},
            inplace=False,
        )
        return trace
    else:
        return trace

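# A minimal usage sketch for `lower_modules_to_accelerator`, assuming `model`
# is a PyText model with a RoBERTaEncoder and `trace` is its TorchScript
# trace; the padding lists are illustrative values, not recommendations.
trace = lower_modules_to_accelerator(
    model, trace, seq_padding_control=[32, 64], batch_padding_control=[1, 8]
)
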
def test_serialization(self):
    with torch.no_grad():
        x = torch.randn([1, 4, 4, 4], dtype=torch.float32)
        y = torch.randn([1, 4, 4, 4], dtype=torch.float32)
        model = Bar()
        model = torch.jit.trace(model, (x, y))

        spec = torch_glow.CompilationSpec()
        spec_settings = spec.get_settings()
        spec_settings.set_glow_backend("NNPI")
        # Enable serialization in this spec
        spec_settings.set_enable_serialize(True)

        compilation_group = torch_glow.CompilationGroup()
        compilation_group_settings = compilation_group.get_settings()
        compilation_group_settings.set_replication_count(1)
        compilation_group_settings.backend_specific_opts_insert("NNPI_IceCores", "1")

        compilation_group.input_sets_append(
            torch_glow.input_specs_from_tensors([x, y])
        )
        spec.compilation_groups_append(compilation_group)

        torch_glow.disableFusionPass()
        torch_glow.enable_convert_to_fp16()

        # Enable global serialization, then compile (serialize) the model
        # and save it
        torch_glow.enable_dump_serialized_model()
        glow_mod = torch_glow.to_glow(model, spec)
        res1 = glow_mod(x, y)
        torch.jit.save(glow_mod, "/tmp/serialize_to_glow.pt")

        # Enable global deserialization, disable serialization,
        # and load (deserialize) the model into loaded_glow_mod
        torch_glow.enable_deserialize()
        torch_glow.disable_dump_serialized_model()
        loaded_glow_mod = torch.jit.load("/tmp/serialize_to_glow.pt")
        res2 = loaded_glow_mod(x, y)

        assert torch.allclose(res1, res2, 1e-5, 1e-5)

def accelerator_transformerLayers_inputs(
    model: nn.Module,
    export_options: ExportConfig,
    dataset_iterable: Iterable,
    module_path,
):
    import torch_glow

    # we use the padding control from the Export Config:
    if export_options is None:
        export_options = ExportConfig()

    seq_padding_control = export_options.seq_padding_control
    batch_padding_control = export_options.batch_padding_control
    if seq_padding_control is None:
        raise RuntimeError("seq padding control not specified")
    if batch_padding_control is None:
        raise RuntimeError("batch padding control not specified")

    # Clamp padding sizes to the model's maximum sequence length
    max_seq_len = model.get_max_seq_len()
    seq_padding_control = [
        pad if pad <= max_seq_len else max_seq_len for pad in seq_padding_control
    ] + [max_seq_len]

    # this should use a method, or module_path, instead of being hardcoded
    embedding_dim = model.encoder.encoder.transformer.token_embedding.embedding_dim

    input_examples = []
    for seq_len in seq_padding_control:
        if seq_len <= 0:
            continue
        for batch_size in batch_padding_control:
            if batch_size <= 0:
                continue
            # TODO: generate inputs from dataset_iterable instead of random data
            input1 = torch.randn(
                [seq_len, batch_size, embedding_dim], dtype=torch.float32
            )
            input2 = torch.randn([batch_size, seq_len]).bool()
            input_specs = torch_glow.input_specs_from_tensors([input1, input2])
            input_examples.append(input_specs)

    return input_examples

def lower_modules_to_accelerator(
    model, trace, seq_padding_control, batch_padding_control
):
    import torch_glow

    if hasattr(model, "encoder") and isinstance(model.encoder, RoBERTaEncoder):
        embedding_dim = model.encoder.encoder.transformer.token_embedding.embedding_dim

        spec = torch_glow.CompilationSpec()
        spec.get_settings().set_glow_backend("NNPI")

        compilation_group = torch_glow.CompilationGroup()
        spec.compilation_groups_append(compilation_group)

        compilation_group_settings = compilation_group.get_settings()
        compilation_group_settings.set_convert_to_fp16(True)
        compilation_group_settings.backend_specific_opts_insert("NNPI_IceCores", "12")
        compilation_group_settings.backend_specific_opts_insert(
            "NNPINumParallelChunks", "12"
        )

        for seq_len in seq_padding_control:
            if seq_len <= 0:
                continue
            for batch_size in batch_padding_control:
                if batch_size <= 0:
                    continue
                input1 = torch.randn(
                    [seq_len, batch_size, embedding_dim], dtype=torch.float32
                )
                input2 = torch.randn([batch_size, seq_len]).bool()
                input_specs = torch_glow.input_specs_from_tensors([input1, input2])
                compilation_group.input_sets_append(input_specs)

        trace = torch_glow.to_glow_selective(
            trace,
            {"model.encoder.encoder.transformer.layers": spec},
            inplace=False,
        )
        return trace
    else:
        return trace