def load_model(weights_fpath: Path, device=None):
    """
    Loads the model in memory. If this function is not explicitly called, it will be run on the
    first call to embed_frames() with the default weights file.

    :param weights_fpath: the path to saved model weights.
    :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda").
    The model will be loaded and will run on this device. Outputs will however always be on
    the cpu. If None, will default to your GPU if it's available, otherwise your CPU.
    """
    # TODO: I think the slow loading of the encoder might have something to do with the device it
    # was saved on. Worth investigating.
    global _model, _device
    if device is None:
        _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    elif isinstance(device, str):
        _device = torch.device(device)
    else:
        _device = device
    checkpoint = torch.load(weights_fpath)
    if checkpoint.get("quantized", False):
        # Quantized models can only run on the CPU
        _device = torch.device("cpu")
        _model = SpeakerEncoder(_device, torch.device("cpu"))
        _model = torch.quantization.quantize_dynamic(_model, {nn.LSTM, nn.Linear},
                                                     dtype=torch.qint8)
        _model.load_state_dict(checkpoint["model_state"])
    else:
        _model = SpeakerEncoder(_device, torch.device("cpu"))
        _model.load_state_dict(checkpoint["model_state"])
    _model.eval()
    print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
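# A minimal usage sketch for load_model() above, assuming the same module-level imports the
# function relies on (torch, nn, SpeakerEncoder). The checkpoint paths are hypothetical
# placeholders, not files shipped with the project.
from pathlib import Path

load_model(Path("saved_model/quantized.pt"))                  # quantized: forced onto the CPU
load_model(Path("saved_model/pretrained.pt"), device="cuda")  # float checkpoint on the GPU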
def main():
    saved_model_dir = './saved_model/'
    float_model_file = 'pretrained.pt'

    # Create the output directory if it does not exist yet
    if not os.path.exists(saved_model_dir):
        try:
            os.makedirs(saved_model_dir)
        except OSError:
            raise Exception("Could not create directory {0:}. Please check file system "
                            "permissions.".format(saved_model_dir))
    if not os.path.exists(saved_model_dir + float_model_file):
        raise Exception("Cannot perform quantization without a trained model. "
                        "Please provide a weights file.")

    # Set the device to cpu, since pytorch only supports quantization on the CPU
    _device = torch.device("cpu")

    myModel = SpeakerEncoder(_device, torch.device("cpu"))
    checkpoint = torch.load(saved_model_dir + float_model_file, map_location="cpu")
    myModel.load_state_dict(checkpoint["model_state"])
    myModel.to(_device)
    myModel.eval()

    # Static post-training quantization (module fusion, observer insertion, calibration with
    # num_calibration_batches batches) was scaffolded here but is not used; the model is
    # quantized dynamically instead: LSTM and Linear weights are converted to int8.
    myModel = torch.quantization.quantize_dynamic(myModel, {nn.LSTM, nn.Linear},
                                                  dtype=torch.qint8)
    print('Post Training Quantization: Convert done')

    print("Size of model before quantization: ", end="")
    print('{0:.2f} (MB)'.format(os.path.getsize(saved_model_dir + float_model_file) / 1e6))

    step = checkpoint['step']
    store_file = {'step': step, 'model_state': myModel.state_dict(), 'quantized': True}
    torch.save(store_file, saved_model_dir + "quantized.pt")
    print("Size of model after quantization: ", end="")
    print('{0:.2f} (MB)'.format(os.path.getsize(saved_model_dir + "quantized.pt") / 1e6))
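# Hedged round-trip sketch: after main() writes ./saved_model/quantized.pt, the
# quantization-aware load_model() from the earlier snippet can reload it. The 'quantized'
# flag stored in the checkpoint is what routes loading through quantize_dynamic(). This
# assumes both functions live in the same module; adapt the import otherwise.
from pathlib import Path

if __name__ == "__main__":
    main()                                          # writes ./saved_model/quantized.pt
    load_model(Path("./saved_model/quantized.pt"))  # reloads via the quantized branch, on CPU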
def load(self, weights_fpath: Path, device=None, model_name='default'):
    """
    Loads the model in memory. If this function is not explicitly called, it will be run on the
    first call to embed_frames() with the default weights file.

    :param weights_fpath: the path to saved model weights.
    :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda").
    The model will be loaded and will run on this device. Outputs will however always be on
    the cpu. If None, will default to your GPU if it's available, otherwise your CPU.
    :param model_name: an identifier that uniquely identifies a model, so that the same model
    is not loaded more than once
    """
    if model_name in self.models:
        return

    # TODO: I think the slow loading of the encoder might have something to do with the device it
    # was saved on. Worth investigating.
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    elif isinstance(device, str):
        device = torch.device(device)

    model = SpeakerEncoder(device, torch.device("cpu"))
    checkpoint = torch.load(
        weights_fpath, map_location=None if torch.cuda.is_available() else 'cpu')
    model.load_state_dict(checkpoint["model_state"])
    model.eval()
    logging.info('Loaded encoder {} trained to step {}'.format(
        weights_fpath.name, checkpoint["step"]))
    self.models[model_name] = model
    self.devices[model_name] = device
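# load() above is a method of a model-registry class that is not shown in this snippet. A
# minimal sketch of what the enclosing class might look like; the name EncoderRegistry and
# the get() helper are assumptions, not part of the original code:
class EncoderRegistry:
    def __init__(self):
        self.models = {}   # model_name -> loaded SpeakerEncoder
        self.devices = {}  # model_name -> torch.device the model runs on

    # def load(self, weights_fpath, device=None, model_name='default'): as defined above

    def get(self, model_name='default'):
        return self.models[model_name], self.devices[model_name]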
def load_model(weights_fpath: Path, device=None):
    """
    Loads the model in memory. If this function is not explicitly called, it will be run on the
    first call to embed_frames() with the default weights file.

    :param weights_fpath: the path to saved model weights.
    :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda").
    The model will be loaded and will run on this device. Outputs will however always be on
    the cpu. If None, will default to your GPU if it's available, otherwise your CPU.
    """
    # TODO: I think the slow loading of the encoder might have something to do with the device it
    # was saved on. Worth investigating.
    global _model, _device
    if device is None:
        _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    elif isinstance(device, str):
        _device = torch.device(device)
    else:
        _device = device
    _model = SpeakerEncoder(_device, torch.device("cpu"))
    weights_fpath = Path(weights_fpath)
    # The hardcoded override below had an error and is masked out:
    # weights_fpath = "/home/project/zhrtvc/models-gmw/models/encoder/saved_models/ge2e_pretrained.pt"
    checkpoint = torch.load(weights_fpath, map_location=_device.type)
    _model.load_state_dict(checkpoint["model_state"])
    _model.eval()
    print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
def load_model(weights_fpath: Path):
    """
    Loads the model in memory. If this function is not explicitly called, it will be run on the
    first call to embed_frames() with the default weights file.

    :param weights_fpath: the path to saved model weights.
    """
    # TODO: I think the slow loading of the encoder might have something to do with the device it
    # was saved on. Worth investigating.
    global _model, _device
    _device = torch.device("cpu")
    _model = SpeakerEncoder(_device, torch.device("cpu"))
    checkpoint = torch.load(weights_fpath, map_location=torch.device('cpu'))
    _model.load_state_dict(checkpoint["model_state"])
    _model.eval()
def load_model(weights_fpath: Path, device=None):
    '''
    Loads the model.

    PARAMS:
        weights_fpath: the path to the weights of the pretrained model (must be a Path object)
        device: either a torch device or the name of a torch device (e.g. "cpu", "cuda");
        if None, defaults to the GPU when available, otherwise the CPU
    '''
    global _model, _device
    if device is None:
        _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    elif isinstance(device, str):
        _device = torch.device(device)
    else:
        _device = device
    _model = SpeakerEncoder(_device, torch.device("cpu"))
    checkpoint = torch.load(weights_fpath)
    _model.load_state_dict(checkpoint["model_state"])
    _model.eval()
def get_model(weights_fpath: Path, device=None):
    """
    Loads the model in memory. If this function is not explicitly called, it will be run on the
    first call to embed_frames() with the default weights file.

    :param weights_fpath: the path to saved model weights.
    :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda").
    The model will be loaded and will run on this device. Outputs will however always be on
    the cpu. If None, will default to your GPU if it's available, otherwise your CPU.
    """
    # TODO: I think the slow loading of the encoder might have something to do with the device it
    # was saved on. Worth investigating.
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    elif isinstance(device, str):
        device = torch.device(device)
    model = SpeakerEncoder(device, torch.device("cpu"))
    checkpoint = torch.load(weights_fpath)
    model.load_state_dict(checkpoint["model_state"])
    model.eval()
    print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
    return model
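# Unlike the global-state variants, get_model() returns the model, so several encoders can
# coexist in one process. A brief usage sketch; the weight paths are hypothetical:
from pathlib import Path

encoder_a = get_model(Path("saved_models/encoder_a.pt"))
encoder_b = get_model(Path("saved_models/encoder_b.pt"), device="cpu")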
def load_model(weights_fpath: Path, device=None):
    """
    Loads the model into memory. If this function is not explicitly called, it will be run on
    the first call to embed_frames() with the default weights file.

    :param weights_fpath: the path to the saved model weights
    :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda").
    The model will be loaded and will run on this device. Outputs however will always be on
    the cpu. If None, will default to your GPU if available, otherwise your CPU.
    """
    global _model, _device
    if device is None:
        _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    elif isinstance(device, str):
        _device = torch.device(device)
    else:
        _device = device
    _model = SpeakerEncoder(_device, torch.device("cpu"))
    checkpoint = torch.load(weights_fpath)
    _model.load_state_dict(checkpoint["model_state"])
    _model.eval()
    print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int,
          backup_every: int, vis_every: int, force_restart: bool, visdom_server: str,
          no_visdom: bool):
    # Create a dataset and a dataloader
    dataset = SpeakerVerificationDataset(clean_data_root)
    loader = SpeakerVerificationDataLoader(
        dataset,
        speakers_per_batch,
        utterances_per_speaker,
        num_workers=8,
    )

    # Setup the device on which to run the forward pass and the loss. These can be different,
    # because the forward pass is faster on the GPU whereas the loss is often (depending on your
    # hyperparameters) faster on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # FIXME: currently, the gradient is None if loss_device is cuda
    loss_device = torch.device("cpu")

    # Create the model and the optimizer
    model = SpeakerEncoder(device, loss_device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init)
    init_step = 1

    # Configure file path for the model
    state_fpath = models_dir.joinpath(run_id + ".pt")
    backup_dir = models_dir.joinpath(run_id + "_backups")

    # Load any existing model
    if not force_restart:
        if state_fpath.exists():
            print("Found existing model \"%s\", loading it and resuming training." % run_id)
            checkpoint = torch.load(state_fpath)
            init_step = checkpoint["step"]
            model.load_state_dict(checkpoint["model_state"])
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            optimizer.param_groups[0]["lr"] = learning_rate_init
        else:
            print("No model \"%s\" found, starting training from scratch." % run_id)
    else:
        print("Starting the training from scratch.")
    model.train()

    # Initialize the visualization environment
    vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom)
    vis.log_dataset(dataset)
    vis.log_params()
    device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
    vis.log_implementation({"Device": device_name})

    # Training loop
    profiler = Profiler(summarize_every=10, disabled=False)
    for step, speaker_batch in enumerate(loader, init_step):
        profiler.tick("Blocking, waiting for batch (threaded)")

        # Forward pass
        inputs = torch.from_numpy(speaker_batch.data).to(device)
        sync(device)
        profiler.tick("Data to %s" % device)
        embeds = model(inputs)
        sync(device)
        profiler.tick("Forward pass")
        embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device)
        loss, eer = model.loss(embeds_loss)
        sync(loss_device)
        profiler.tick("Loss")

        # Backward pass
        model.zero_grad()
        loss.backward()
        profiler.tick("Backward pass")
        model.do_gradient_ops()
        optimizer.step()
        profiler.tick("Parameter update")

        # Update visualizations
        # learning_rate = optimizer.param_groups[0]["lr"]
        vis.update(loss.item(), eer, step)

        # Draw projections and save them to the backup folder
        if umap_every != 0 and step % umap_every == 0:
            print("Drawing and saving projections (step %d)" % step)
            backup_dir.mkdir(exist_ok=True)
            projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step))
            embeds = embeds.detach().cpu().numpy()
            vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath)
            vis.save()

        # Overwrite the latest version of the model
        if save_every != 0 and step % save_every == 0:
            print("Saving the model (step %d)" % step)
            torch.save({
                "step": step + 1,
                "model_state": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
            }, state_fpath)

        # Make a backup
        if backup_every != 0 and step % backup_every == 0:
            print("Making a backup (step %d)" % step)
            backup_dir.mkdir(exist_ok=True)
            backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step))
            torch.save({
                "step": step + 1,
                "model_state": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
            }, backup_fpath)

        profiler.tick("Extras (visualizations, saving)")
def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int,
          backup_every: int, vis_every: int, force_restart: bool, visdom_server: str,
          no_visdom: bool):
    dataset = SpeakerVerificationDataset(clean_data_root)
    loader = SpeakerVerificationDataLoader(
        dataset,
        speakers_per_batch,
        utterances_per_speaker,
        num_workers=8,
    )

    # cuda
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loss_device = torch.device("cpu")

    # Create the model and the optimizer
    model = SpeakerEncoder(device, loss_device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init)
    init_step = 1

    # Configure the file paths for the model
    state_fpath = models_dir.joinpath(run_id + ".pt")
    backup_dir = models_dir.joinpath(run_id + "_backups")
    model.train()

    # Initialize the visualization environment (visdom)
    vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom)
    device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

    # Start training
    profiler = Profiler(summarize_every=10, disabled=False)
    for step, speaker_batch in enumerate(loader, init_step):
        profiler.tick("Blocking, waiting for batch (threaded)")

        # Forward pass
        inputs = torch.from_numpy(speaker_batch.data).to(device)
        sync(device)
        profiler.tick("Data to %s" % device)
        embeds = model(inputs)
        sync(device)
        profiler.tick("Forward pass")
        embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device)
        loss, eer = model.loss(embeds_loss)
        sync(loss_device)
        profiler.tick("Loss")

        # Backward pass
        model.zero_grad()
        loss.backward()
        profiler.tick("Backward pass")
        model.do_gradient_ops()
        optimizer.step()
        profiler.tick("Parameter update")

        vis.update(loss.item(), eer, step)

        # Draw a UMAP projection and save the image
        if umap_every != 0 and step % umap_every == 0:
            # print("Drawing and saving projections (step %d)" % step)
            backup_dir.mkdir(exist_ok=True)
            projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step))
            embeds = embeds.detach().cpu().numpy()
            vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath)
            vis.save()

        # Overwrite the latest version of the model
        if save_every != 0 and step % save_every == 0:
            # print("Saving the model (step %d)" % step)
            torch.save({
                "step": step + 1,
                "model_state": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
            }, state_fpath)

        # Make a backup
        if backup_every != 0 and step % backup_every == 0:
            # print("Making a backup (step %d)" % step)
            backup_dir.mkdir(exist_ok=True)
            backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step))
            torch.save({
                "step": step + 1,
                "model_state": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
            }, backup_fpath)

        profiler.tick("Extras (visualizations, saving)")
def train(run_id: str, train_data_root: Path, test_data_root: Path, models_dir: Path,
          save_every: int, backup_every: int, vis_every: int, force_restart: bool,
          visdom_server: str, no_visdom: bool):
    # Create a dataset and a dataloader
    dataset = SpeakerVerificationDataset(train_data_root)
    loader = SpeakerVerificationDataLoader(
        dataset,
        speakers_per_batch,
        utterances_per_speaker,
        num_workers=dataloader_workers,
        # pin_memory=True,
    )
    test_dataset = SpeakerVerificationDataset(test_data_root)
    testdata_loader = SpeakerVerificationDataLoader(
        test_dataset,
        speakers_per_batch,
        utterances_per_speaker,
        num_workers=dataloader_workers,
        # pin_memory=True,
    )

    # Setup the device on which to run the forward pass and the loss. These can be different,
    # because the forward pass is faster on the GPU whereas the loss is often (depending on your
    # hyperparameters) faster on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Create the model and the optimizer
    model = SpeakerEncoder(device)
    raw_model = model
    if torch.cuda.device_count() > 1:
        print("Use", torch.cuda.device_count(), "GPUs.")
        # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
        model = torch.nn.DataParallel(model)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init)
    init_step = 1

    # Configure file path for the model
    state_fpath = models_dir.joinpath(run_id + ".pt")
    backup_dir = models_dir.joinpath(run_id + "_backups")

    # Load any existing model
    if not force_restart:
        if state_fpath.exists():
            print("Found existing model \"%s\", loading it and resuming training." % run_id)
            checkpoint = torch.load(str(state_fpath))
            init_step = checkpoint["step"]
            raw_model.load_state_dict(checkpoint["model_state"])
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            optimizer.param_groups[0]["lr"] = learning_rate_init
        else:
            print("No model \"%s\" found, starting training from scratch." % run_id)
    else:
        print("Starting the training from scratch.")
    model.train()

    save_interval_s_time = time.time()
    prt_interval_s_time = time.time()
    total_loss, total_eer = 0, 0

    # Training loop
    profiler = Profiler(summarize_every=1, disabled=True)
    for step, speaker_batch in enumerate(loader, init_step):
        sync(device)
        profiler.tick("Blocking, waiting for batch (threaded)")

        # Forward pass
        inputs = torch.from_numpy(speaker_batch.data).to(device)
        sync(device)
        profiler.tick("Data to %s" % device)
        embeds = model(inputs)
        sync(device)
        profiler.tick("Forward pass")
        embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1))
        loss, eer = raw_model.loss(embeds_loss)
        total_loss += loss.item()
        total_eer += eer
        sync(device)
        profiler.tick("Loss")

        # Backward pass
        model.zero_grad()
        loss.backward()
        profiler.tick("Backward pass")
        raw_model.do_gradient_ops()
        optimizer.step()
        sync(device)
        profiler.tick("Parameter update")

        # Periodically report the averaged training metrics
        if step % vis_every == 0:
            learning_rate = optimizer.param_groups[0]["lr"]
            prt_interval_e_time = time.time()
            cost_time = prt_interval_e_time - prt_interval_s_time
            prt_interval_s_time = prt_interval_e_time
            print(
                "    Step %06d> %d steps cost %d seconds, lr:%.4f, Avg_loss:%.4f, Avg_eer:%.4f."
                % (step, vis_every, cost_time, learning_rate,
                   total_loss / vis_every, total_eer / vis_every),
                flush=True)
            total_loss, total_eer = 0, 0

        # Overwrite the latest version of the model and evaluate on the test set
        if save_every != 0 and step % save_every == 0:
            # Save
            torch.save({
                "step": step + 1,
                "model_state": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
            }, str(state_fpath))

            # Test
            test_total_loss, test_total_eer = 0.0, 0.0
            for test_step, test_batch in enumerate(testdata_loader, 1):
                testinputs = torch.from_numpy(test_batch.data).to(device)
                with torch.no_grad():
                    test_embeds = model(testinputs)
                test_embeds_loss = test_embeds.view(
                    (speakers_per_batch, utterances_per_speaker, -1))
                test_loss, test_eer = raw_model.loss(test_embeds_loss)
                test_total_loss += test_loss.item()
                test_total_eer += test_eer
                test_prt_interval = 10
                if test_step % test_prt_interval == 0:
                    print(
                        "    |--Test Step %06d> Avg_loss:%.4f, Avg_eer:%.4f."
                        % (test_step, test_total_loss / test_step,
                           test_total_eer / test_step),
                        flush=True)
                if test_step == 50:
                    break

            # Print log
            save_interval_e_time = time.time()
            cost_time = save_interval_e_time - save_interval_s_time
            print(
                "\n"
                "++++Step %06d> Saving the model, %d steps cost %d seconds."
                % (step, save_every, cost_time),
                flush=True)
            save_interval_s_time = save_interval_e_time

        # Make a backup
        if backup_every != 0 and step % backup_every == 0:
            print("Making a backup (step %d)" % step)
            backup_dir.mkdir(exist_ok=True)
            backup_fpath = str(backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step)))
            torch.save({
                "step": step + 1,
                "model_state": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
            }, backup_fpath)

        sync(device)
        profiler.tick("Extras (visualizations, saving)")
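# Design note on the variant above, as a sketch: torch.nn.DataParallel wraps the module, so
# custom methods such as loss() and do_gradient_ops() are only reachable on the underlying
# module. Keeping a raw_model reference is one way to get at them; unwrapping through the
# wrapper's .module attribute is the equivalent alternative:
def unwrap(model):
    # Return the underlying module whether or not DataParallel is in use.
    return model.module if isinstance(model, torch.nn.DataParallel) else model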
encoder_model_fpath = Path("/data/real-time/test/encoder_2_lstm_80_mel_channel.pt")
encoder_out_dir = Path("/data/test/")

dataset = SpeakerVerificationDataset(datasets_root)
loader = SpeakerVerificationDataLoader(
    dataset,
    speakers_per_batch,
    utterances_per_speaker,
    num_workers=2,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# FIXME: currently, the gradient is None if loss_device is cuda
loss_device = torch.device("cpu")

# Load the model
model = SpeakerEncoder(device, loss_device)
state_fpath = encoder_model_fpath
checkpoint = torch.load(state_fpath)
init_step = checkpoint["step"]
model.load_state_dict(checkpoint["model_state"])
model.eval()

vis = Visualizations("encoder_model")
for step, speaker_batch in enumerate(loader, init_step):
    print(speaker_batch)
    projection_fpath = encoder_out_dir.joinpath("%s_umap_%06d.png" % ("encoder_model", step))
    inputs = torch.from_numpy(speaker_batch.data).to(device)
    name = speaker_batch.names
    embeds = model(inputs)
    embeds = embeds.detach().cpu().numpy()
    vis.draw_projections(embeds, utterances_per_speaker, step=step,
                         out_fpath=projection_fpath, name=name)