def main():
    """Train a Tacotron model.

    Builds the model/optimizer/scheduler, prepares (or resumes from) the
    checkpoint directory under ``args.logdir``, wires up train/valid data
    loaders, and hands everything to ``train()``.
    """
    model = Tacotron().to(DEVICE)
    print('Model {} is working...'.format(model.name))
    print('{} threads are used...'.format(torch.get_num_threads()))
    ckpt_dir = os.path.join(args.logdir, model.name)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # StepLR with gamma=0.933 every (lr_decay_step // 10) steps ≈ halves the LR
    # once per full lr_decay_step (0.933**10 ≈ 0.5).
    scheduler = StepLR(optimizer, step_size=args.lr_decay_step // 10, gamma=0.933)  # around 1/2 per decay step
    if not os.path.exists(ckpt_dir):
        # Fresh run: create the attention-plot subdirectory tree.
        os.makedirs(os.path.join(ckpt_dir, 'A', 'train'))
    elif not os.path.exists(os.path.join(ckpt_dir, 'ckpt.csv')):
        # Directory exists but holds no checkpoint index: treat it as stale
        # and start over.
        shutil.rmtree(ckpt_dir)
        os.makedirs(os.path.join(ckpt_dir, 'A', 'train'))
    else:
        # Resume: pick the checkpoint with the lowest recorded loss.
        print('Already exists. Retrain the model.')
        ckpt = pd.read_csv(os.path.join(ckpt_dir, 'ckpt.csv'), sep=',', header=None)
        ckpt.columns = ['models', 'loss']
        ckpt = ckpt.sort_values(by='loss', ascending=True)
        state = torch.load(os.path.join(ckpt_dir, ckpt.models.loc[0]))
        model.load_state_dict(state['model'])
        args.global_step = state['global_step']
        optimizer.load_state_dict(state['optimizer'])
        scheduler.load_state_dict(state['scheduler'])
    # model = torch.nn.DataParallel(model, device_ids=list(range(args.no_gpu))).to(DEVICE)
    dataset = SpeechDataset(args.data_path, args.meta_train, model.name, mem_mode=args.mem_mode)
    validset = SpeechDataset(args.data_path, args.meta_eval, model.name, mem_mode=args.mem_mode)
    data_loader = DataLoader(dataset=dataset, batch_size=args.batch_size,
                             shuffle=True, collate_fn=collate_fn,
                             drop_last=True, pin_memory=True)
    valid_loader = DataLoader(dataset=validset, batch_size=args.test_batch,
                              shuffle=False, collate_fn=collate_fn,
                              pin_memory=True)
    writer = SummaryWriter(ckpt_dir)
    train(model, data_loader, valid_loader, optimizer, scheduler,
          batch_size=args.batch_size, ckpt_dir=ckpt_dir, writer=writer)
    return None
def main():
    """Adversarially train the SSRN generator against a multi-scale
    discriminator.

    Builds G/D and their optimizers, resumes the generator from its latest
    checkpoint when the log directory already exists, then runs ``train()``.

    Fix: removed a leftover ``import pdb; pdb.set_trace()`` in the resume
    branch — it dropped every resumed run into the debugger.
    """
    G = SSRN().to(DEVICE)
    D = MultiScaleDiscriminator().to(DEVICE)
    print('{} threads are used...'.format(torch.get_num_threads()))
    ckpt_dir = os.path.join(args.logdir, type(G).__name__)
    G_optim = torch.optim.Adam(G.parameters(), lr=args.lr)
    D_optim = torch.optim.Adam(D.parameters(), lr=args.lr)
    # scheduler = MultiStepLR(optimizer, milestones=[100000, 200000], gamma=0.5)
    if not os.path.exists(ckpt_dir):
        os.makedirs(os.path.join(ckpt_dir, 'A', 'train'))
    else:
        print('Already exists. Retrain the model.')
        # Resume the generator from the most recent (lexicographically last)
        # checkpoint file.
        ckpt = sorted(
            glob.glob(
                os.path.join(ckpt_dir, '{}-*k.pth.tar'.format(type(G).__name__))))
        state = torch.load(ckpt[-1])
        args.global_step = state['global_step']
        G.load_state_dict(state['model'])
        G_optim.load_state_dict(state['optimizer'])
        # Discriminator restore intentionally disabled:
        # ckpt = sorted(glob.glob(os.path.join(ckpt_dir, '{}-*k.pth'.format(type(D).__name__))))
        # state = torch.load(ckpt[-1])
        # D.load_state_dict(state['model'])
        # D_optim.load_state_dict(state['optimizer'])
    dataset = SpeechDataset(args.data_path, args.meta_train, type(G).__name__,
                            mem_mode=args.mem_mode)
    validset = SpeechDataset(args.data_path, args.meta_eval, type(G).__name__,
                             mem_mode=args.mem_mode)
    data_loader = DataLoader(dataset=dataset, batch_size=args.batch_size,
                             shuffle=True, collate_fn=collate_fn,
                             drop_last=True, pin_memory=True,
                             num_workers=args.n_workers)
    valid_loader = DataLoader(dataset=validset, batch_size=args.test_batch,
                              shuffle=False, collate_fn=collate_fn)
    writer = SummaryWriter(ckpt_dir)
    train(G, D, data_loader, valid_loader, G_optim, D_optim,
          batch_size=args.batch_size, ckpt_dir=ckpt_dir, writer=writer)
    return None
def main(network):
    """Train either the Text2Mel or the SSRN network.

    :param network: 'text2mel' or 'ssrn'; anything else prints an error and
        returns without training.
    """
    if network == 'text2mel':
        model = Text2Mel().to(DEVICE)
    elif network == 'ssrn':
        model = SSRN().to(DEVICE)
    else:
        print('Wrong network. {text2mel, ssrn}')
        return
    print('Model {} is working...'.format(type(model).__name__))
    print('{} threads are used...'.format(torch.get_num_threads()))
    ckpt_dir = os.path.join(args.logdir, type(model).__name__)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # Halve the learning rate at each milestone step.
    scheduler = MultiStepLR(optimizer, milestones=[50000, 150000, 300000], gamma=0.5)
    if not os.path.exists(ckpt_dir):
        os.makedirs(os.path.join(ckpt_dir, 'A', 'train'))
    else:
        # Resume from the latest checkpoint (lexicographic sort on '*k' names).
        print('Already exists. Retrain the model.')
        ckpt = sorted(glob.glob(os.path.join(ckpt_dir, '*k.pth.tar')))[-1]
        state = torch.load(ckpt)
        model.load_state_dict(state['model'])
        args.global_step = state['global_step']
        optimizer.load_state_dict(state['optimizer'])
        # scheduler.load_state_dict(state['scheduler'])
    # model = torch.nn.DataParallel(model, device_ids=list(range(args.no_gpu))).to(DEVICE)
    # Text2Mel may use guided-attention collation for training; evaluation
    # always uses the plain t2m collate function.
    if type(model).__name__ == 'Text2Mel':
        if args.ga_mode:
            cfn_train, cfn_eval = t2m_ga_collate_fn, t2m_collate_fn
        else:
            cfn_train, cfn_eval = t2m_collate_fn, t2m_collate_fn
    else:
        cfn_train, cfn_eval = collate_fn, collate_fn
    dataset = SpeechDataset(args.data_path, args.meta_train,
                            type(model).__name__, mem_mode=args.mem_mode,
                            ga_mode=args.ga_mode)
    validset = SpeechDataset(args.data_path, args.meta_eval,
                             type(model).__name__, mem_mode=args.mem_mode)
    data_loader = DataLoader(dataset=dataset, batch_size=args.batch_size,
                             shuffle=True, collate_fn=cfn_train,
                             drop_last=True, pin_memory=True)
    valid_loader = DataLoader(dataset=validset, batch_size=args.test_batch,
                              shuffle=False, collate_fn=cfn_eval,
                              pin_memory=True)
    writer = SummaryWriter(ckpt_dir)
    train(model, data_loader, valid_loader, optimizer, scheduler,
          batch_size=args.batch_size, ckpt_dir=ckpt_dir, writer=writer)
    return None
def main(mode):
    """Load the best Text2Mel and SSRN checkpoints and compute the MSE over
    the requested data split.

    :param mode: 'train', 'test', or 'eval' — selects which metadata file
        backs the dataset. Any other value aborts with a nonzero exit code.

    Fix: the invalid-mode branch used ``exit(0)``, reporting success to the
    shell on an error; it now exits with status 1.
    """
    t2m = Text2Mel().to(DEVICE)
    ssrn = SSRN().to(DEVICE)
    if mode == "train":
        dataset = SpeechDataset(args.data_path, args.meta_train, "Text2Mel", mem_mode=args.mem_mode)
    elif mode == "test":
        dataset = SpeechDataset(args.data_path, args.meta_test, "Text2Mel", mem_mode=args.mem_mode)
    elif mode == "eval":
        dataset = SpeechDataset(args.data_path, args.meta_eval, "Text2Mel", mem_mode=args.mem_mode)
    else:
        print('[ERROR] Please set correct type: TRAIN or TEST!')
        exit(1)  # nonzero status: this is an error path
    data_loader = DataLoader(dataset=dataset, batch_size=args.mse_batch,
                             shuffle=False, collate_fn=t2m_collate_fn,
                             pin_memory=True)
    # Restore each model from the checkpoint with the lowest recorded loss.
    ckpt = pd.read_csv(os.path.join(args.logdir, t2m.name, 'ckpt.csv'), sep=',', header=None)
    ckpt.columns = ['models', 'loss']
    ckpt = ckpt.sort_values(by='loss', ascending=True)
    state = torch.load(os.path.join(args.logdir, t2m.name, ckpt.models.loc[0]))
    t2m.load_state_dict(state['model'])
    args.global_step = state['global_step']
    ckpt = pd.read_csv(os.path.join(args.logdir, ssrn.name, 'ckpt.csv'), sep=',', header=None)
    ckpt.columns = ['models', 'loss']
    ckpt = ckpt.sort_values(by='loss', ascending=True)
    state = torch.load(os.path.join(args.logdir, ssrn.name, ckpt.models.loc[0]))
    ssrn.load_state_dict(state['model'])
    print('All of models are loaded.')
    t2m.eval()
    ssrn.eval()
    if not os.path.exists(os.path.join(args.sampledir, 'A')):
        os.makedirs(os.path.join(args.sampledir, 'A'))
    return calculate_MSE(t2m=t2m, ssrn=ssrn, data_loader=data_loader,
                         batch_size=args.mse_batch)
def main():
    """Train a DCTTS model, resuming from the latest checkpoint or from an
    optional pretrained model on a fresh run.

    Fix: ``scheduler`` was passed to ``train()`` but never defined — both
    scheduler definitions were commented out, so the call raised NameError.
    The MultiStepLR variant is restored (same milestones as the sibling
    trainer in this project); imported locally to keep the fix self-contained.
    """
    model = DCTTS(args).to(DEVICE)
    print('Model {} is working...'.format(args.model_name))
    print('{} threads are used...'.format(torch.get_num_threads()))
    ckpt_dir = os.path.join(args.logdir, args.model_name)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # NOTE(review): restored from the commented-out options below; confirm
    # this matches the schedule train() expects.
    from torch.optim.lr_scheduler import MultiStepLR
    scheduler = MultiStepLR(optimizer, milestones=[50000, 150000, 300000], gamma=0.5)
    # scheduler = LambdaLR(optimizer, lr_policy)
    if not os.path.exists(ckpt_dir):
        os.makedirs(os.path.join(ckpt_dir, 'A', 'train'))
        if args.pretrained_path is not None:
            # Fresh run seeded from a pretrained model.
            print('Train with pretrained model {}'.format(args.pretrained_path))
            state = torch.load(args.pretrained_path)
            model.custom_load_state_dict(state['model'])
    else:
        # Resume from the latest checkpoint in the existing log directory.
        print('Already exists. Retrain the model.')
        ckpt = sorted(glob.glob(os.path.join(ckpt_dir, '*k.pth.tar')))[-1]
        state = torch.load(ckpt)
        model.load_state_dict(state['model'])
        args.global_step = state['global_step']
        optimizer.load_state_dict(state['optimizer'])
        # scheduler.load_state_dict(state['scheduler'])
    # model = torch.nn.DataParallel(model, device_ids=list(range(args.no_gpu))).to(DEVICE)
    dataset = SpeechDataset(args.data_path, args.meta_train, mem_mode=args.mem_mode)
    validset = SpeechDataset(args.data_path, args.meta_eval, mem_mode=args.mem_mode)
    data_loader = DataLoader(dataset=dataset, batch_size=args.batch_size,
                             shuffle=True, collate_fn=t2m_ga_collate_fn,
                             drop_last=True, pin_memory=True)
    valid_loader = DataLoader(dataset=validset, batch_size=args.test_batch,
                              shuffle=False, collate_fn=t2m_ga_collate_fn,
                              pin_memory=True)
    writer = SummaryWriter(ckpt_dir)
    train(model, data_loader, valid_loader, optimizer, scheduler,
          batch_size=args.batch_size, ckpt_dir=ckpt_dir, writer=writer)
    return None
def main():
    """Evaluate the SSRN model restored from its latest checkpoint."""
    net = SSRN().to(DEVICE)
    model_name = type(net).__name__
    # Latest checkpoint = last entry of the lexicographically sorted matches.
    pattern = '{}-*k.pth'.format(model_name)
    checkpoints = sorted(glob.glob(os.path.join(args.logdir, model_name, pattern)))
    restored = torch.load(checkpoints[-1])
    net.load_state_dict(restored['model'])
    if not os.path.exists(args.testdir):
        os.makedirs(args.testdir)
    eval_set = SpeechDataset(args.data_path, args.meta_eval,
                             type(net).__name__, mem_mode=args.mem_mode)
    eval_loader = DataLoader(dataset=eval_set,
                             batch_size=args.test_batch,
                             shuffle=False,
                             collate_fn=collate_fn)
    evaluate(net, eval_loader, args.test_batch)
    return None
def main(DEVICE):
    """
    main function

    Train a TPGST model: build model/optimizer/scheduler, resume from the
    latest checkpoint if the log directory already exists, then run the
    training loop.

    :param DEVICE: 'cpu' or 'gpu'
    """
    model = TPGST().to(DEVICE)
    print('Model {} is working...'.format(type(model).__name__))
    ckpt_dir = os.path.join(args.logdir, type(model).__name__)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = LambdaLR(optimizer, lr_policy)
    if not os.path.exists(ckpt_dir):
        os.makedirs(os.path.join(ckpt_dir, 'A', 'train'))
    else:
        print('Already exists. Retrain the model.')
        model_path = sorted(glob.glob(os.path.join(
            ckpt_dir, 'model-*.tar')))[-1]  # latest model
        state = torch.load(model_path)
        model.load_state_dict(state['model'])
        args.global_step = state['global_step']
        optimizer.load_state_dict(state['optimizer'])
        # Restore only the scheduler fields needed by LambdaLR (the lambda
        # itself is not serializable).
        scheduler.last_epoch = state['scheduler']['last_epoch']
        scheduler.base_lrs = state['scheduler']['base_lrs']
    # Same metadata file for both splits; the `training` flag selects the
    # partition inside SpeechDataset.
    dataset = SpeechDataset(args.data_path, args.meta, mem_mode=args.mem_mode, training=True)
    validset = SpeechDataset(args.data_path, args.meta, mem_mode=args.mem_mode, training=False)
    data_loader = DataLoader(dataset=dataset, batch_size=args.batch_size,
                             shuffle=True, collate_fn=collate_fn,
                             drop_last=True, pin_memory=True,
                             num_workers=args.n_workers)
    valid_loader = DataLoader(dataset=validset, batch_size=args.test_batch,
                              shuffle=False, collate_fn=collate_fn,
                              pin_memory=True)
    # torch.set_num_threads(4)
    print('{} threads are used...'.format(torch.get_num_threads()))
    writer = SummaryWriter(ckpt_dir)
    train(model, data_loader, valid_loader, optimizer, scheduler,
          batch_size=args.batch_size, ckpt_dir=ckpt_dir, writer=writer,
          DEVICE=DEVICE)
    return None
def main():
    """Run a trained LSTM acoustic model over an evaluation set and write
    per-utterance log-probabilities to a Kaldi archive.

    Fixes:
    - ``if (log_prior):`` truth-tested a multi-element tensor, which raises
      ``RuntimeError: Boolean value of Tensor ... is ambiguous`` whenever a
      prior is supplied; replaced with an explicit ``is not None`` check.
    - Typo in the assert message ("does not exit" -> "does not exist").
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-model_path")
    parser.add_argument("-data")
    parser.add_argument("-data_path", default='', type=str, help="path of data files")
    parser.add_argument("-prior_path", default=None, help="the path to load the final.occs file")
    parser.add_argument("-transform", help="feature transformation matrix or mvn statistics")
    parser.add_argument("-out_file", help="write out the log-probs to this file")
    parser.add_argument("-batch_size", default=32, type=int, help="Override the batch size in the config")
    parser.add_argument("-sweep_size", default=200, type=float, help="process n hours of data per sweep (default:60)")
    parser.add_argument("-frame_subsampling_factor", default=1, type=int, help="the factor to subsample the features")
    parser.add_argument("-data_loader_threads", default=4, type=int, help="number of workers for data loading")
    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)
    config["sweep_size"] = args.sweep_size
    config["source_paths"] = list()
    data_config = dict()
    data_config["type"] = "Eval"
    data_config["wav"] = args.data
    config["source_paths"].append(data_config)
    config["data_path"] = args.data_path
    print("job starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    # Optional global feature transform (pickled matrix / mvn statistics).
    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)

    dataset = SpeechDataset(config)
    print(transform)
    test_dataloader = SeqDataloader(dataset,
                                    batch_size=args.batch_size,
                                    test_only=True,
                                    global_mvn=True,
                                    transform=transform)
    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(test_dataloader)))

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])
    # NOTE(review): `device` is computed but the model is moved with
    # .cuda() below ("cuda:1" vs map_location 'cuda:0' look inconsistent —
    # confirm intended GPU placement).
    device = th.device("cuda:1" if th.cuda.is_available() else "cpu")
    model.cuda()

    assert os.path.isfile(
        args.model_path), "ERROR: model file {} does not exist!".format(
            args.model_path)
    checkpoint = th.load(args.model_path, map_location='cuda:0')
    state_dict = checkpoint['model']
    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        header = k[:7]
        name = k[7:]  # remove 'module.' of dataparallel
        new_state_dict[name] = v
    # Decide based on the last key whether the checkpoint came from
    # DataParallel (keys prefixed with 'module.').
    if header == "module.":
        model.load_state_dict(new_state_dict)
    else:
        model.load_state_dict(state_dict)
    print("=> loaded checkpoint '{}' ".format(args.model_path))

    # Optional log-prior for converting posteriors to pseudo log-likelihoods.
    log_prior = None
    if (args.prior_path):
        prior = read_matrix(args.prior_path).numpy()
        log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])), dtype=th.float)

    model.eval()
    with th.no_grad():
        with MatrixWriter("ark:" + args.out_file) as llout:
            for i, data in enumerate(test_dataloader):
                feat = data["x"]
                num_frs = data["num_frs"]
                utt_ids = data["utt_ids"]
                x = feat.to(th.float32)
                if (args.frame_subsampling_factor > 1):
                    x = x.unfold(1, 1, args.frame_subsampling_factor).squeeze(-1)
                x = x.cuda()
                prediction = model(x)
                # save only unpadded part for each utt in batch
                for j in range(len(num_frs)):
                    loglikes = prediction[j, :, :].data.cpu()
                    loglikes_j = loglikes[:num_frs[j], :]
                    if log_prior is not None:
                        loglikes_j = loglikes_j - log_prior
                    llout[utt_ids[j][0]] = loglikes_j
                print("Process batch [{}/{}]".format(i + 1, len(test_dataloader)))
def main():
    """Decode an evaluation set with a trained LSTM acoustic model and a
    Kaldi HCLG graph, writing compact lattices to a Kaldi archive.

    Fixes:
    - The three missing-file error paths exited with ``sys.exit(0)``
      (success status); they now exit with status 1.
    - Typo in the assert message ("does not exit" -> "does not exist").
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-model_path")
    parser.add_argument("-data_path")
    parser.add_argument("-prior_path", help="the path to load the final.occs file")
    parser.add_argument("-out_file", help="write out the log-probs to this file")
    parser.add_argument("-transform", help="feature transformation matrix or mvn statistics")
    parser.add_argument(
        "-trans_model",
        help="the HMM transistion model, used for lattice generation")
    parser.add_argument("-graph_dir", help="the decoding graph directory")
    parser.add_argument("-batch_size", default=32, type=int, help="Override the batch size in the config")
    parser.add_argument("-sweep_size", default=200, type=float, help="process n hours of data per sweep (default:60)")
    parser.add_argument("-data_loader_threads", default=4, type=int, help="number of workers for data loading")
    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)
    config["sweep_size"] = args.sweep_size
    config["source_paths"] = list()
    data_config = dict()
    data_config["type"] = "Eval"
    data_config["wav"] = args.data_path
    config["source_paths"].append(data_config)
    print("job starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    # Optional global feature transform (pickled matrix / mvn statistics).
    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)

    dataset = SpeechDataset(config)
    #data = trainset.__getitem__(0)
    test_dataloader = SeqDataloader(dataset,
                                    batch_size=args.batch_size,
                                    test_only=True,
                                    global_mvn=True,
                                    transform=transform)
    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(test_dataloader)))

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])
    device = th.device("cuda" if th.cuda.is_available() else "cpu")
    model.cuda()

    assert os.path.isfile(
        args.model_path), "ERROR: model file {} does not exist!".format(
            args.model_path)
    checkpoint = th.load(args.model_path, map_location='cuda:0')
    state_dict = checkpoint['model']
    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        header = k[:7]
        name = k[7:]  # remove 'module.' of dataparallel
        new_state_dict[name] = v
    # Decide based on the last key whether the checkpoint came from
    # DataParallel (keys prefixed with 'module.').
    if header == "module.":
        model.load_state_dict(new_state_dict)
    else:
        model.load_state_dict(state_dict)
    print("=> loaded checkpoint '{}' ".format(args.model_path))

    # Validate the decoding resources before building the decoder.
    HCLG = args.graph_dir + "/HCLG.fst"
    words_txt = args.graph_dir + "/words.txt"
    if not os.path.isfile(HCLG):
        sys.stderr.write('ERROR: The HCLG file %s does not exist!\n' % (HCLG))
        sys.exit(1)
    if not os.path.isfile(words_txt):
        sys.stderr.write('ERROR: The words.txt file %s does not exist!\n' % (words_txt))
        sys.exit(1)
    if os.path.isfile(args.trans_model):
        trans_model = kaldi_hmm.TransitionModel()
        with kaldi_util.io.xopen(args.trans_model) as ki:
            trans_model.read(ki.stream(), ki.binary)
    else:
        sys.stderr.write('ERROR: The trans_model %s does not exist!\n' % (args.trans_model))
        sys.exit(1)

    # Log-prior from state occupancies: converts posteriors to pseudo
    # log-likelihoods for decoding.
    prior = read_matrix(args.prior_path).numpy()
    log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])), dtype=th.float)

    # now we can setup the decoder
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = config["decoder_config"]["beam"]
    decoder_opts.lattice_beam = config["decoder_config"]["lattice_beam"]
    decoder_opts.max_active = config["decoder_config"]["max_active"]
    acoustic_scale = config["decoder_config"]["acoustic_scale"]
    decoder_opts.determinize_lattice = True  # To produce compact lattice
    asr_decoder = MappedLatticeFasterRecognizer.from_files(
        args.trans_model,
        HCLG,
        words_txt,
        acoustic_scale=acoustic_scale,
        decoder_opts=decoder_opts)

    model.eval()
    with th.no_grad():
        with kaldi_util.table.CompactLatticeWriter("ark:" + args.out_file) as lat_out:
            for data in test_dataloader:
                feat = data["x"]
                num_frs = data["num_frs"]
                utt_ids = data["utt_ids"]
                x = feat.to(th.float32)
                x = x.cuda()
                prediction = model(x)
                # Decode each utterance's unpadded frames.
                for j in range(len(num_frs)):
                    loglikes = prediction[j, :, :].data.cpu()
                    loglikes_j = loglikes[:num_frs[j], :]
                    loglikes_j = loglikes_j - log_prior
                    decoder_out = asr_decoder.decode(
                        kaldi_matrix.Matrix(loglikes_j.numpy()))
                    key = utt_ids[j][0]
                    print(key, decoder_out["text"])
                    print("Log-like per-frame for utterance {} is {}".format(
                        key, decoder_out["likelihood"] / num_frs[j]))
                    # save lattice
                    lat_out[key] = decoder_out["lattice"]
# Qualitative check: run the trained model on a handful of training utterances
# and print true vs. predicted sentences with the loss.
eos_token = char_to_token['<eos>']
# #test_dataset = SpeechDataset(test_df, dataset_dir)
# #test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
model = load_model(args.load_dir, args.epoch, device=DEVICE)
num_sent = 10
model.eval()
model.tf_ratio = 0.9
# Hoisted out of the loop: the dataset is built from loop-invariant arguments,
# so constructing it once avoids repeating the setup work every iteration.
trial_dataset = SpeechDataset(train_df, root_dir, char_to_token)
for i in range(num_sent):
    if args.first_ten:
        idx = i
    else:
        # FIX: random.randint is inclusive on both ends, so the upper bound
        # must be shape[0] - 1 to avoid an out-of-range index.
        idx = random.randint(0, train_df.shape[0] - 1)
    x, y = trial_dataset[idx]
    # plt.imshow(x[0,:,:].detach())
    # Model output
    target = y.unsqueeze(dim=0).to(DEVICE)
    data = x.permute(0, 2, 1).to(DEVICE)
    loss, output = model(data, target)
    print("True sent : ", decode_true_sent(y))
    print("Pred sent : ", decode_pred_sent(output))
    print("Loss :", loss.item())
    print("\n")
def main():
    """Distributed (Horovod) LF-MMI "chain" training of an LSTM acoustic
    model, with on-the-fly alignment via Kaldi.

    Fixes:
    - The missing-trans-model error branch referenced an undefined name
      ``trans_model`` (NameError); it now reports ``chain_model_path``.
    - That branch also exited with ``sys.exit(0)`` (success status); it now
      exits with status 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-data", help="data yaml file")
    parser.add_argument("-dataPath", default='', type=str, help="path of data files")
    parser.add_argument("-seed_model", default='', help="the seed nerual network model")
    parser.add_argument("-exp_dir", help="the directory to save the outputs")
    parser.add_argument("-transform", help="feature transformation matrix or mvn statistics")
    parser.add_argument(
        "-ali_dir",
        help="the directory to load trans_model and tree used for alignments")
    parser.add_argument("-lang_dir", help="the lexicon directory to load L.fst")
    parser.add_argument(
        "-chain_dir",
        help="the directory to load trans_model, tree and den.fst for chain model")
    parser.add_argument("-lr", type=float, help="set the base learning rate")
    parser.add_argument(
        "-warmup_steps",
        default=4000,
        type=int,
        help="the number of warmup steps to adjust the learning rate")
    parser.add_argument("-xent_regularize", default=0, type=float, help="cross-entropy regularization weight")
    parser.add_argument("-momentum", default=0, type=float, help="set the momentum")
    parser.add_argument("-weight_decay", default=1e-4, type=float, help="set the L2 regularization weight")
    parser.add_argument("-batch_size", default=32, type=int, help="Override the batch size in the config")
    parser.add_argument("-data_loader_threads", default=0, type=int, help="number of workers for data loading")
    parser.add_argument("-max_grad_norm", default=5, type=float, help="max_grad_norm for gradient clipping")
    parser.add_argument("-sweep_size", default=100, type=float, help="process n hours of data per sweep (default:100)")
    parser.add_argument("-num_epochs", default=1, type=int, help="number of training epochs (default:1)")
    parser.add_argument(
        "-anneal_lr_epoch",
        default=2,
        type=int,
        help="start to anneal the learning rate from this epoch")
    parser.add_argument("-anneal_lr_ratio", default=0.5, type=float, help="the ratio to anneal the learning rate ratio")
    parser.add_argument('-print_freq', default=10, type=int, metavar='N', help='print frequency (default: 10)')
    parser.add_argument('-save_freq', default=1000, type=int, metavar='N', help='save model frequency (default: 1000)')
    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)
    config["sweep_size"] = args.sweep_size
    print("pytorch version:{}".format(th.__version__))

    with open(args.data) as f:
        data = yaml.safe_load(f)
    config["source_paths"] = [j for i, j in data['clean_source'].items()]
    if 'dir_noise' in data:
        config["dir_noise_paths"] = [j for i, j in data['dir_noise'].items()]
    if 'rir' in data:
        config["rir_paths"] = [j for i, j in data['rir'].items()]
    config['data_path'] = args.dataPath
    print("Experiment starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    # Initialize Horovod: one process per GPU.
    hvd.init()
    th.cuda.set_device(hvd.local_rank())
    print("Run experiments with world size {}".format(hvd.size()))

    dataset = SpeechDataset(config)
    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)
        # NOTE(review): transform attached only when loaded; confirm
        # SpeechDataset tolerates a missing/unset transform otherwise.
        dataset.transform = transform

    train_dataloader = SeqDataloader(dataset,
                                     batch_size=args.batch_size,
                                     num_workers=args.data_loader_threads,
                                     distributed=True,
                                     test_only=False)
    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(train_dataloader)))

    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    # create model
    model_config = config["model_config"]
    model = lstm.LSTMAM(model_config["feat_dim"], model_config["label_size"],
                        model_config["hidden_size"],
                        model_config["num_layers"], model_config["dropout"],
                        True)
    model.cuda()

    # setup the optimizer
    optimizer = th.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)

    # Broadcast parameters and optimizer state from rank 0 to all other
    # processes, then wrap the optimizer for distributed gradient averaging.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())

    if os.path.isfile(args.seed_model):
        checkpoint = th.load(args.seed_model)
        state_dict = checkpoint['model']
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            header = k[:7]
            name = k[7:]  # remove 'module.' of dataparallel
            new_state_dict[name] = v
        # Decide based on the last key whether the checkpoint came from
        # DataParallel (keys prefixed with 'module.').
        if header == "module.":
            model.load_state_dict(new_state_dict)
        else:
            model.load_state_dict(state_dict)
        print("=> loaded checkpoint '{}' ".format(args.seed_model))

    # Kaldi resources for alignment and the chain denominator graph.
    ali_model = args.ali_dir + "/final.mdl"
    ali_tree = args.ali_dir + "/tree"
    L_fst = args.lang_dir + "/L.fst"
    disambig = args.lang_dir + "/phones/disambig.int"
    den_fst = kaldi_fst.StdVectorFst.read(args.chain_dir + "/den.fst")
    chain_model_path = args.chain_dir + "/0.trans_mdl"
    chain_tree_path = args.chain_dir + "/tree"
    if os.path.isfile(chain_model_path):
        chain_trans_model = kaldi_hmm.TransitionModel()
        with kaldi_util.io.xopen(chain_model_path) as ki:
            chain_trans_model.read(ki.stream(), ki.binary)
    else:
        sys.stderr.write('ERROR: The trans_model %s does not exist!\n' % (chain_model_path))
        sys.exit(1)
    chain_tree = kaldi_tree.ContextDependency()
    with kaldi_util.io.xopen(chain_tree_path) as ki:
        chain_tree.read(ki.stream(), ki.binary)

    # chain supervision options
    supervision_opts = kaldi_chain.SupervisionOptions()
    supervision_opts.convert_to_pdfs = True
    supervision_opts.frame_subsampling_factor = 3
    supervision_opts.left_tolerance = 5
    supervision_opts.right_tolerance = 5

    # chain training options
    chain_opts = kaldi_chain.ChainTrainingOptions()
    chain_opts.leaky_hmm_coefficient = 1e-4
    chain_opts.xent_regularize = args.xent_regularize

    # setup the aligner
    aligner = kaldi_align.MappedAligner.from_files(ali_model,
                                                   ali_tree,
                                                   L_fst,
                                                   None,
                                                   disambig,
                                                   None,
                                                   beam=10,
                                                   transition_scale=1.0,
                                                   self_loop_scale=0.1,
                                                   acoustic_scale=0.1)
    den_graph = kaldi_chain.DenominatorGraph(den_fst, model_config["label_size"])

    #encoder_layer = nn.TransformerEncoderLayer(512, 8)
    #print(encoder_layer)
    model.train()
    for epoch in range(args.num_epochs):
        # anneal learning rate
        if epoch > args.anneal_lr_epoch:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= args.anneal_lr_ratio
        run_train_epoch(model, optimizer, train_dataloader, epoch,
                        chain_trans_model, chain_tree, supervision_opts,
                        aligner, den_graph, chain_opts, args)
        # save model (rank 0 only, once per epoch)
        if hvd.rank() == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            checkpoint['epoch'] = epoch
            output_file = args.exp_dir + '/chain.model.' + str(epoch) + '.tar'
            th.save(checkpoint, output_file)
# Script setup: load train/test metadata CSVs, build datasets/loaders with a
# user-supplied batch size, and instantiate the ASR model on the best device.
# The first CSV row is skipped and replaced by explicit column names.
train_data_df = pd.read_csv(os.path.join(train_dir, 'train_data.csv'),
                            skiprows=[0], header=None,
                            names=['index', 'clip', 'sentence'])
test_data_df = pd.read_csv(os.path.join(test_dir, 'test_data.csv'),
                           skiprows=[0], header=None,
                           names=['index', 'clip', 'sentence'])
# Padding/truncation limits for audio frames and sentence length
# (presumably what SpeechDataset pads/clips to — TODO confirm).
max_data_len = 2500
max_sent_len = 100
bs = int(input("Enter batch_size:"))
train_dataset = SpeechDataset(train_data_df, train_dir, max_data_len, max_sent_len)
train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True)
test_dataset = SpeechDataset(test_data_df, test_dir, max_data_len, max_sent_len)
test_loader = DataLoader(test_dataset, batch_size=bs)
device = torch.device("cuda") if torch.cuda.is_available() else 'cpu'
print('device:', device)
# Model hyperparameters: 201 spectrogram bins in, 28 output classes
# (presumably 26 letters + space + blank — TODO confirm).
input_len = 201
hidden_size = 50
num_layers = 3
output_shape = 28
bidirectional = True
model = BasicASR(input_len, hidden_size, num_layers, output_shape, bidirectional).to(device)
args = get_args() pkg = torch.load(args.model_file) model_config = pkg['model_config'] vocab = load_vocab(args.vocab_file) id2token = [None] * len(vocab) for k, v in vocab.items(): id2token[v] = k collate = Collate(model_config["left_context"], model_config["right_context"], model_config["skip_frame"], model_config["norm_mean"], model_config["norm_var"]) testset = SpeechDataset(args.data_file) test_loader = torch.utils.data.DataLoader(testset, collate_fn=collate, shuffle=False) # check dim match if model_config["feat_dim"] != testset[0]["feat"]["dim"]: raise ValueError(("Dim mismatch: " + "model {} vs. feat {}.").format( model_config["feat_dim"], testset[0]["feat"]["dim"])) model_load_timer = Timer() model_load_timer.tic() # build encoder and decoder if model_config["encoder"]["type"] == "BiRNN": encoder = BiRNN(model_config["encoder"]) elif model_config["encoder"]["type"] == "BiRNN_Torch":
def train_model(model_class, preprocess_fun, is_1d, reshape_size, BATCH_SIZE,
                epochs, CODER, preprocess_param={}, bagging_num=1,
                semi_train_path=None, pretrained=None, pretraining=False,
                MGPU=False):
    """Train (optionally bagged) speech-command models, then predict the test
    set and write Kaggle submission / per-model score files.

    :param model_class: model class. e.g. vgg, resnet, senet
    :param preprocess_fun: preprocess function. e.g. mel, mfcc, raw wave
    :param is_1d: boolean. True for conv1d models and false for conv2d
    :param reshape_size: int. only for conv2d, reshape the image size
    :param BATCH_SIZE: batch size.
    :param epochs: number of epochs
    :param CODER: string for saving and loading model/files
    :param preprocess_param: parameters for preprocessing function
    :param bagging_num: number of training per model, aka bagging models
    :param semi_train_path: path to semi supervised learning file.
    :param pretrained: path to pretrained model
    :param pretraining: boolean. if this is pretraining
    :param MGPU: whether using multiple gpus
    """

    # get_model() builds the model used for training (and later prediction).
    def get_model(model=model_class, m=MGPU, pretrained=pretrained):
        # With multiple GPUs, wrap the model in DataParallel.
        mdl = torch.nn.DataParallel(model()) if m else model()
        if not pretrained:
            return mdl
        else:
            print("load pretrained model here...")
            # Load the pretrained weights with torch.load().
            mdl.load_state_dict(torch.load(pretrained))
            if 'vgg' in pretrained:
                # For VGG models, freeze every parameter except the top
                # layers by setting requires_grad=False.
                fixed_layers = list(mdl.features)
                for l in fixed_layers:
                    for p in l.parameters():
                        p.requires_grad = False
            return mdl

    label_to_int, int_to_label = get_label_dict()

    # Repeat training bagging_num times (one bagged model per iteration).
    for b in range(bagging_num):
        print("training model # ", b)

        # Loss function used for training.
        loss_fn = torch.nn.CrossEntropyLoss()

        # Build the model and move it to the GPU via .cuda().
        speechmodel = get_model()
        speechmodel = speechmodel.cuda()

        # Running counters for progress reporting during training.
        total_correct = 0
        num_labels = 0
        start_time = time()

        # Train for the requested number of epochs.
        for e in range(epochs):
            print("training epoch ", e)
            # After 10 epochs, reduce the learning rate to 1/10.
            learning_rate = 0.01 if e < 10 else 0.001
            # SGD + momentum optimizer over the trainable parameters only.
            optimizer = torch.optim.SGD(
                filter(lambda p: p.requires_grad, speechmodel.parameters()),
                lr=learning_rate, momentum=0.9, weight_decay=0.00001)

            # Switch the model's modules to training mode.
            speechmodel.train()

            if semi_train_path:
                # Semi-supervised training builds its file list differently:
                # test data is mixed into the training list train_list.
                train_list, label_list = get_semi_list(
                    words=label_to_int.keys(), sub_path=semi_train_path,
                    test_ratio=choice([0.2, 0.25, 0.3, 0.35]))
                print("semi training list length: ", len(train_list))
            else:
                # Supervised training: fetch the plain training file list.
                train_list, label_list, _ = get_wav_list(words=label_to_int.keys())

            if pretraining:
                traindataset = PreDataset(label_words_dict=label_to_int,
                                          add_noise=True,
                                          preprocess_fun=preprocess_fun,
                                          preprocess_param=preprocess_param,
                                          resize_shape=reshape_size,
                                          is_1d=is_1d)
            else:
                traindataset = SpeechDataset(mode='train',
                                             label_words_dict=label_to_int,
                                             wav_list=(train_list, label_list),
                                             add_noise=True,
                                             preprocess_fun=preprocess_fun,
                                             preprocess_param=preprocess_param,
                                             resize_shape=reshape_size,
                                             is_1d=is_1d)

            # DataLoader queues batches; shuffle=True re-randomizes the data
            # order every epoch.
            trainloader = DataLoader(traindataset, BATCH_SIZE, shuffle=True)

            # Pull batch_size training examples at a time from trainloader.
            for batch_idx, batch_data in enumerate(trainloader):
                # 'spec' is the spectrogram (audio features); 'label' is the
                # ground-truth class.
                spec = batch_data['spec']
                label = batch_data['label']
                spec, label = Variable(spec.cuda()), Variable(label.cuda())
                # Forward pass: feed spec into the current model to get
                # predictions y_pred.
                y_pred = speechmodel(spec)
                # Loss between predictions and ground truth.
                loss = loss_fn(y_pred, label)
                optimizer.zero_grad()
                # Backpropagate to get the parameter-update direction that
                # reduces the loss.
                loss.backward()
                # optimizer.step() applies the parameter update.
                optimizer.step()

                # Take the argmax of y_pred to track the running accuracy.
                _, pred_labels = torch.max(y_pred.data, 1)
                correct = (pred_labels == label.data).sum()
                total_correct += correct
                num_labels += len(label)

            # Progress report on the training data.
            # NOTE(review): despite the "training loss:" label, this prints
            # the running accuracy percentage, not the loss.
            print("training loss:", 100. * total_correct / num_labels, time() - start_time)

        # Save the trained model parameters.
        create_directory("model")
        torch.save(speechmodel.state_dict(), "model/model_%s_%s.pth" % (CODER, b))

    if not pretraining:
        print("doing prediction...")
        softmax = Softmax()

        # Paths of the saved models — one per bagging iteration.
        trained_models = ["model/model_%s_%s.pth" % (CODER, b) for b in range(bagging_num)]

        # Build the test dataset and its data queue.
        _, _, test_list = get_wav_list(words=label_to_int.keys())
        testdataset = SpeechDataset(mode='test',
                                    label_words_dict=label_to_int,
                                    wav_list=(test_list, []),
                                    add_noise=False,
                                    preprocess_fun=preprocess_fun,
                                    preprocess_param=preprocess_param,
                                    resize_shape=reshape_size,
                                    is_1d=is_1d)
        testloader = DataLoader(testdataset, BATCH_SIZE, shuffle=False)

        for e, m in enumerate(trained_models):
            print("predicting ", m)
            speechmodel = get_model(m=MGPU)
            # Load the finished model weights with torch.load().
            speechmodel.load_state_dict(torch.load(m))
            # Move to CUDA and switch to evaluation mode.
            speechmodel = speechmodel.cuda()
            speechmodel.eval()

            test_fnames, test_labels = [], []
            pred_scores = []

            # Generate predictions for the test data batch by batch.
            for batch_idx, batch_data in enumerate(testloader):
                spec = Variable(batch_data['spec'].cuda())
                fname = batch_data['id']
                # y_pred: softmax class probabilities for this batch.
                y_pred = softmax(speechmodel(spec))
                pred_scores.append(y_pred.data.cpu().numpy())
                test_fnames += fname

            # Sum the probability outputs of all bagged models into an
            # ensemble prediction.
            if e == 0:
                final_pred = np.vstack(pred_scores)
                final_test_fnames = test_fnames
            else:
                final_pred += np.vstack(pred_scores)
                assert final_test_fnames == test_fnames

        # Divide by the number of models; final_labels are the argmax classes
        # of the averaged probabilities final_pred.
        final_pred /= len(trained_models)
        final_labels = [int_to_label[x] for x in np.argmax(final_pred, 1)]

        # File names (test_fnames) for the Kaggle submission file.
        test_fnames = [x.split("/")[-1] for x in final_test_fnames]
        labels = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off',
                  'stop', 'go', 'unknown', 'silence']
        # Write the Kaggle submission file (file name + final prediction).
        create_directory("sub")
        pd.DataFrame({'fname': test_fnames,
                      'label': final_labels}).to_csv("sub/%s.csv" % CODER, index=False)

        # Also save the ensemble's probability scores to a separate file, for
        # cross-model ensembling and further tuning.
        pred_scores = pd.DataFrame(np.vstack(final_pred), columns=labels)
        pred_scores['fname'] = test_fnames
        create_directory("pred_scores")
        pred_scores.to_csv("pred_scores/%s.csv" % CODER, index=False)
def main():
    """Train a CTC acoustic model (TIMIT LSTM-CTC) driven by an INI config file.

    Reads model/data/training sections from ``args.conf``, trains with an
    adaptive learning-rate schedule based on dev loss, plots curves to visdom,
    and saves the best checkpoint path back into the config file.
    """
    args = parser.parse_args()
    cf = ConfigParser.ConfigParser()
    try:
        cf.read(args.conf)
    except:
        # NOTE(review): bare except; ConfigParser.read() rarely raises for a
        # missing file (it returns the list of parsed files), so this guard
        # may never fire — TODO confirm.
        print("conf file not exists")
        sys.exit(1)
    USE_CUDA = cf.getboolean('Training', 'use_cuda')
    try:
        # `long` is Python-2 only — this module appears to target Python 2.
        seed = long(cf.get('Training', 'seed'))
    except:
        # No seed stored yet: take CUDA's initial seed and persist it so the
        # run is reproducible next time.
        seed = torch.cuda.initial_seed()
        cf.set('Training', 'seed', seed)
        cf.write(open(args.conf, 'w'))
    torch.manual_seed(seed)
    if USE_CUDA:
        torch.cuda.manual_seed(seed)
    log_dir = cf.get('Data', 'log_dir')
    log_file = os.path.join(log_dir, cf.get('Data', 'log_file'))
    logger = init_logger(log_file)

    # Define Model
    rnn_input_size = cf.getint('Model', 'rnn_input_size')
    rnn_hidden_size = cf.getint('Model', 'rnn_hidden_size')
    rnn_layers = cf.getint('Model', 'rnn_layers')
    rnn_type = RNN[cf.get('Model', 'rnn_type')]
    bidirectional = cf.getboolean('Model', 'bidirectional')
    batch_norm = cf.getboolean('Model', 'batch_norm')
    rnn_param = {
        "rnn_input_size": rnn_input_size,
        "rnn_hidden_size": rnn_hidden_size,
        "rnn_layers": rnn_layers,
        "rnn_type": rnn_type,
        "bidirectional": bidirectional,
        "batch_norm": batch_norm
    }
    num_class = cf.getint('Model', 'num_class')
    drop_out = cf.getfloat('Model', 'drop_out')
    model = CTC_Model(rnn_param=rnn_param, num_class=num_class,
                      drop_out=drop_out)
    print("Model Structure:")
    logger.info("Model Structure:")
    for idx, m in enumerate(model.children()):
        print(idx, m)
        logger.info(str(idx) + "->" + str(m))
    data_dir = cf.get('Data', 'data_dir')
    batch_size = cf.getint("Training", 'batch_size')

    # Data Loader
    train_dataset = SpeechDataset(data_dir, data_set='train')
    dev_dataset = SpeechDataset(data_dir, data_set="dev")
    train_loader = SpeechDataLoader(train_dataset, batch_size=batch_size,
                                    shuffle=True, num_workers=4,
                                    pin_memory=False)
    dev_loader = SpeechDataLoader(dev_dataset, batch_size=batch_size,
                                  shuffle=False, num_workers=4,
                                  pin_memory=False)
    # ensure the feats is equal to the rnn_input_Size
    assert train_dataset.n_feats == rnn_input_size
    # decoder for dev set
    decoder = GreedyDecoder(int2char, space_idx=len(int2char) - 1,
                            blank_index=0)

    # Training
    init_lr = cf.getfloat('Training', 'init_lr')
    num_epoches = cf.getint('Training', 'num_epoches')
    end_adjust_acc = cf.getfloat('Training', 'end_adjust_acc')
    decay = cf.getfloat("Training", 'lr_decay')
    weight_decay = cf.getfloat("Training", 'weight_decay')
    params = {
        'num_epoches': num_epoches,
        'end_adjust_acc': end_adjust_acc,
        'seed': seed,
        'decay': decay,
        'learning_rate': init_lr,
        'weight_decay': weight_decay,
        'batch_size': batch_size,
        'n_feats': train_dataset.n_feats
    }
    print(params)
    if USE_CUDA:
        model = model.cuda()
    loss_fn = CTCLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=init_lr,
                                 weight_decay=weight_decay)

    # visualization for training
    from visdom import Visdom
    viz = Visdom()
    title = 'TIMIT LSTM_CTC Acoustic Model'
    opts = [
        dict(title=title + " Loss", ylabel='Loss', xlabel='Epoch'),
        dict(title=title + " Loss on Dev", ylabel='DEV Loss', xlabel='Epoch'),
        dict(title=title + ' CER on DEV', ylabel='DEV CER', xlabel='Epoch')
    ]
    viz_window = [None, None, None]

    count = 0                     # epoch counter
    learning_rate = init_lr
    loss_best = 1000              # best dev loss seen, LR-adjustment reference
    loss_best_true = 1000         # true best dev loss (for final report)
    adjust_rate_flag = False      # decay LR at the start of the next epoch
    stop_train = False
    adjust_time = 0               # how many times the LR has been decayed
    acc_best = 0
    start_time = time.time()
    loss_results = []
    dev_loss_results = []
    dev_cer_results = []
    while not stop_train:
        if count >= num_epoches:
            break
        count += 1
        if adjust_rate_flag:
            learning_rate *= decay
            adjust_rate_flag = False
            for param in optimizer.param_groups:
                param['lr'] *= decay
        print("Start training epoch: %d, learning_rate: %.5f" %
              (count, learning_rate))
        logger.info("Start training epoch: %d, learning_rate: %.5f" %
                    (count, learning_rate))
        loss = train(model, train_loader, loss_fn, optimizer, logger,
                     print_every=20, USE_CUDA=USE_CUDA)
        loss_results.append(loss)
        acc, dev_loss = dev(model, dev_loader, loss_fn, decoder, logger,
                            USE_CUDA=USE_CUDA)
        print("loss on dev set is %.4f" % dev_loss)
        logger.info("loss on dev set is %.4f" % dev_loss)
        dev_loss_results.append(dev_loss)
        dev_cer_results.append(acc)

        # adjust learning rate by dev_loss
        # adjust_rate_count: the loss is considered stable once more than
        # `count` consecutive epochs stay inside the end_adjust_acc band.
        # NOTE(review): if the very first dev_loss lands in the elif band,
        # adjust_rate_count (and best_model_state) would be unbound — in
        # practice loss_best starts at 1000 so the first branch runs first;
        # verify for unusual loss scales.
        if dev_loss < (loss_best - end_adjust_acc):
            # Clear improvement: reset the stability counter and snapshot.
            loss_best = dev_loss
            loss_best_true = dev_loss
            adjust_rate_count = 0
            acc_best = acc
            best_model_state = copy.deepcopy(model.state_dict())
            best_op_state = copy.deepcopy(optimizer.state_dict())
        elif (dev_loss < loss_best + end_adjust_acc):
            # Plateau: count epochs; still track a true best inside the band.
            adjust_rate_count += 1
            if dev_loss < loss_best and dev_loss < loss_best_true:
                loss_best_true = dev_loss
                acc_best = acc
                best_model_state = copy.deepcopy(model.state_dict())
                best_op_state = copy.deepcopy(optimizer.state_dict())
        else:
            # Clear regression: force an immediate LR decay below.
            adjust_rate_count = 10
        print("adjust_rate_count: %d" % adjust_rate_count)
        print('adjust_time: %d' % adjust_time)
        logger.info("adjust_rate_count: %d" % adjust_rate_count)
        logger.info('adjust_time: %d' % adjust_time)
        if adjust_rate_count == 10:
            # Decay LR next epoch and roll back to the best snapshot so far.
            adjust_rate_flag = True
            adjust_time += 1
            adjust_rate_count = 0
            if loss_best > loss_best_true:
                loss_best = loss_best_true
            model.load_state_dict(best_model_state)
            optimizer.load_state_dict(best_op_state)
        if adjust_time == 8:
            # Stop after 8 decays — LR has shrunk by decay**8.
            stop_train = True
        time_used = (time.time() - start_time) / 60
        print("epoch %d done, dev acc is: %.4f, time_used: %.4f minutes" %
              (count, acc, time_used))
        logger.info(
            "epoch %d done, dev acc is: %.4f, time_used: %.4f minutes" %
            (count, acc, time_used))

        # Update the three visdom curves (train loss, dev loss, dev CER).
        x_axis = range(count)
        y_axis = [
            loss_results[0:count], dev_loss_results[0:count],
            dev_cer_results[0:count]
        ]
        for x in range(len(viz_window)):
            if viz_window[x] is None:
                viz_window[x] = viz.line(
                    X=np.array(x_axis),
                    Y=np.array(y_axis[x]),
                    opts=opts[x],
                )
            else:
                viz.line(
                    X=np.array(x_axis),
                    Y=np.array(y_axis[x]),
                    win=viz_window[x],
                    update='replace',
                )
    print("End training, best dev loss is: %.4f, acc is: %.4f" %
          (loss_best_true, acc_best))
    logger.info("End training, best dev loss acc is: %.4f, acc is: %.4f" %
                (loss_best_true, acc_best))
    # Restore the best snapshot and persist it; record its path in the conf.
    model.load_state_dict(best_model_state)
    optimizer.load_state_dict(best_op_state)
    best_path = os.path.join(log_dir, 'best_model' + '_dev' + str(acc_best) +
                             '.pkl')
    cf.set('Model', 'model_file', best_path)
    cf.write(open(args.conf, 'w'))
    params['epoch'] = count
    torch.save(
        CTC_Model.save_package(model,
                               optimizer=optimizer,
                               epoch=params,
                               loss_results=loss_results,
                               dev_loss_results=dev_loss_results,
                               dev_cer_results=dev_cer_results), best_path)
'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go' ] label_to_int = dict(zip(labels, range(len(labels)))) int_to_label = dict(zip(range(len(labels)), labels)) int_to_label.update({len(labels): 'unknown', len(labels) + 1: 'silence'}) # 모드에 따라 학습 및 검증에 사용할 파일을 선택한다 trn = 'input/trn.txt' if mode == 'cv' else 'input/trn_all.txt' tst = 'input/val.txt' if mode == 'cv' else 'input/tst.txt' trn = [line.strip() for line in open(trn, 'r').readlines()] wav_list = [line.split(',')[-1] for line in trn] label_list = [line.split(',')[0] for line in trn] # 학습용 SpeechDataset을 불러온다 traindataset = SpeechDataset(mode='train', label_to_int=label_to_int, wav_list=wav_list, label_list=label_list) start_time = time() for e in range(epochs): print("training epoch ", e) # learning_rate를 epoch마다 다르게 지정한다 learning_rate = 0.01 if e < 10 else 0.001 optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, speechmodel.parameters()), lr=learning_rate, momentum=0.9, weight_decay=0.00001) # 모델을 학습하기 위하여 .train() 함수를 실행한다 speechmodel.train()
if utils.TENSORBOARD_LOGGING == 1: utils.visulizer.set_writer( os.path.join(trainer_config["exp_dir"], 'log')) collate = Collate(model_config["left_context"], model_config["right_context"], model_config["skip_frame"], model_config["norm_mean"], model_config["norm_var"]) batch_frames = trainer_config["batch_frames"] valid_batch_size = 20 if "multi_gpu" in trainer_config and trainer_config["multi_gpu"] == True: batch_frames *= torch.cuda.device_count() valid_batch_size *= torch.cuda.device_count() trainset = SpeechDataset(os.path.join(args.data_dir, "train.json")) validset = SpeechDataset(os.path.join(args.data_dir, "dev.json")) logger.info("Loaded {} utterances for training.".format(len(trainset))) logger.info("Loaded {} utterances for validation.".format(len(validset))) trainsampler = FrameBasedSampler(trainset, frame_num=batch_frames) tr_loader = torch.utils.data.DataLoader(trainset, batch_sampler=trainsampler, collate_fn=collate, shuffle=False, num_workers=16, pin_memory=True) cv_loader = torch.utils.data.DataLoader(validset, collate_fn=collate, batch_size=valid_batch_size, num_workers=16,
# NOTE(review): the indented lines below are the tail of a forward() method
# whose beginning lies before this chunk; code is unchanged, comments only.
                                                  lengths=output_lengths,
                                                  batch_first=True)
        output_padded = nn.utils.rnn.pad_packed_sequence(output,
                                                         batch_first=True)
        output, output_lengths = output_padded
        # Undo the length-sort applied before packing so rows line up with the
        # caller's original batch order (idx_unsort computed earlier).
        output = output.index_select(0, idx_unsort.to(output.device))
        output_lengths = output_lengths.index_select(
            0, idx_unsort.to(output_lengths.device))
        return output, output_lengths


if __name__ == "__main__":
    # Smoke test: pull one batch from a local TIMIT json and run it through
    # PyramidBiRNN. Depends on a hard-coded absolute path — developer-only.
    from data import FrameBasedSampler, Collate, SpeechDataset
    fn = "/home/baiye/Speech/las/egs/timit/data/test.json"
    dataset = SpeechDataset(fn)
    sampler = FrameBasedSampler(dataset)
    collate = Collate(left=0, right=0)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_sampler=sampler,
                                             collate_fn=collate,
                                             shuffle=False)
    dataiter = iter(dataloader)
    feats, feat_lengths, targets, target_lengths = next(dataiter)
    config = {
        "input_dim": 40,
        "hidden_size": 256,
        "num_layers": 3,
    }
    rnn = PyramidBiRNN(config)
    output, output_lengths = rnn(feats, feat_lengths)
def train_model(model_class, preprocess_fun, is_1d, reshape_size, BATCH_SIZE,
                epochs, CODER, preprocess_param={}, bagging_num=1,
                semi_train_path=None, pretrained=None, pretraining=False,
                MGPU=False):
    """Train `bagging_num` models and ensemble their test-set predictions.

    :param model_class: model class. e.g. vgg, resnet, senet
    :param preprocess_fun: preprocess function. e.g. mel, mfcc, raw wave
    :param is_1d: boolean. True for conv1d models and false for conv2d
    :param reshape_size: int. only for conv2d, reshape the image size
    :param BATCH_SIZE: batch size.
    :param epochs: number of epochs
    :param CODER: string for saving and loading model/files
    :param preprocess_param: parameters for preprocessing function
        (NOTE: mutable default dict — shared across calls if mutated)
    :param bagging_num: number of training per model, aka bagging models
    :param semi_train_path: path to semi supervised learning file.
    :param pretrained: path to pretrained model
    :param pretraining: boolean. if this is pretraining
    :param MGPU: whether using multiple gpus
    """

    def get_model(model=model_class, m=MGPU, pretrained=pretrained):
        # Build a fresh model (optionally DataParallel-wrapped); when a
        # pretrained path is given, load its weights and, for VGG-style nets,
        # freeze the convolutional feature layers.
        mdl = torch.nn.DataParallel(model()) if m else model()
        if not pretrained:
            return mdl
        else:
            print("load pretrained model here...")
            mdl.load_state_dict(torch.load(pretrained))
            if 'vgg' in pretrained:
                fixed_layers = list(mdl.features)
                for l in fixed_layers:
                    for p in l.parameters():
                        p.requires_grad = False
            return mdl

    label_to_int, int_to_label = get_label_dict()
    for b in range(bagging_num):
        print("training model # ", b)
        loss_fn = torch.nn.CrossEntropyLoss()
        speechmodel = get_model()
        speechmodel = speechmodel.cuda()
        # Running accuracy counters accumulate across ALL epochs of this
        # bagging round (never reset per epoch).
        total_correct = 0
        num_labels = 0
        start_time = time()
        for e in range(epochs):
            print("training epoch ", e)
            # Step schedule: 0.01 for the first 10 epochs, then 0.001.
            # A fresh SGD optimizer is created every epoch (momentum resets).
            learning_rate = 0.01 if e < 10 else 0.001
            optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                               speechmodel.parameters()),
                                        lr=learning_rate,
                                        momentum=0.9,
                                        weight_decay=0.00001)
            speechmodel.train()
            if semi_train_path:
                # Semi-supervised: mix in pseudo-labelled test data with a
                # randomly chosen ratio each epoch.
                train_list = get_semi_list(words=label_to_int.keys(),
                                           sub_path=semi_train_path,
                                           test_ratio=choice(
                                               [0.2, 0.25, 0.3, 0.35]))
                print("semi training list length: ", len(train_list))
            else:
                train_list, _ = get_wav_list(words=label_to_int.keys())
            if pretraining:
                traindataset = PreDataset(label_words_dict=label_to_int,
                                          add_noise=True,
                                          preprocess_fun=preprocess_fun,
                                          preprocess_param=preprocess_param,
                                          resize_shape=reshape_size,
                                          is_1d=is_1d)
            else:
                traindataset = SpeechDataset(mode='train',
                                             label_words_dict=label_to_int,
                                             wav_list=train_list,
                                             add_noise=True,
                                             preprocess_fun=preprocess_fun,
                                             preprocess_param=preprocess_param,
                                             resize_shape=reshape_size,
                                             is_1d=is_1d)
            trainloader = DataLoader(traindataset, BATCH_SIZE, shuffle=True)
            for batch_idx, batch_data in enumerate(trainloader):
                spec = batch_data['spec']
                label = batch_data['label']
                spec, label = Variable(spec.cuda()), Variable(label.cuda())
                y_pred = speechmodel(spec)
                _, pred_labels = torch.max(y_pred.data, 1)
                correct = (pred_labels == label.data).sum()
                loss = loss_fn(y_pred, label)
                total_correct += correct
                num_labels += len(label)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            # NOTE(review): despite the label, this prints the running
            # training ACCURACY (%) and elapsed seconds, not the loss.
            print("training loss:", 100. * total_correct / num_labels,
                  time() - start_time)
        # save model
        create_directory("model")
        torch.save(speechmodel.state_dict(),
                   "model/model_%s_%s.pth" % (CODER, b))

    if not pretraining:
        print("doing prediction...")
        softmax = Softmax()
        trained_models = [
            "model/model_%s_%s.pth" % (CODER, b) for b in range(bagging_num)
        ]
        # prediction
        _, test_list = get_wav_list(words=label_to_int.keys())
        testdataset = SpeechDataset(mode='test',
                                    label_words_dict=label_to_int,
                                    wav_list=test_list,
                                    add_noise=False,
                                    preprocess_fun=preprocess_fun,
                                    preprocess_param=preprocess_param,
                                    resize_shape=reshape_size,
                                    is_1d=is_1d)
        testloader = DataLoader(testdataset, BATCH_SIZE, shuffle=False)
        # Average the softmax scores of all bagged models (simple ensemble).
        for e, m in enumerate(trained_models):
            print("predicting ", m)
            speechmodel = get_model(m=MGPU)
            speechmodel.load_state_dict(torch.load(m))
            speechmodel = speechmodel.cuda()
            speechmodel.eval()
            test_fnames, test_labels = [], []
            pred_scores = []
            # do prediction and make a submission file
            for batch_idx, batch_data in enumerate(testloader):
                spec = Variable(batch_data['spec'].cuda())
                fname = batch_data['id']
                y_pred = softmax(speechmodel(spec))
                pred_scores.append(y_pred.data.cpu().numpy())
                test_fnames += fname
            if e == 0:
                final_pred = np.vstack(pred_scores)
                final_test_fnames = test_fnames
            else:
                final_pred += np.vstack(pred_scores)
                # Every model must have seen the files in the same order.
                assert final_test_fnames == test_fnames
        final_pred /= len(trained_models)
        final_labels = [int_to_label[x] for x in np.argmax(final_pred, 1)]
        test_fnames = [x.split("/")[-1] for x in final_test_fnames]
        labels = [
            'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop',
            'go', 'unknown', 'silence'
        ]
        # Save per-class ensemble scores (for later model blending) and the
        # Kaggle submission file (fname -> predicted label).
        pred_scores = pd.DataFrame(np.vstack(final_pred), columns=labels)
        pred_scores['fname'] = test_fnames
        create_directory("pred_scores")
        pred_scores.to_csv("pred_scores/%s.csv" % CODER, index=False)
        create_directory("sub")
        pd.DataFrame({
            'fname': test_fnames,
            'label': final_labels
        }).to_csv("sub/%s.csv" % CODER, index=False)
def main():
    """Sequence-discriminative training (MMI/MPFE/sMBR) of an LSTM acoustic
    model with Horovod data parallelism and PyKaldi lattice generation.

    Loads a CE-trained seed model, sets up a lattice decoder from the
    denominator graph, and runs `run_train_epoch` for each epoch, saving a
    checkpoint per epoch on rank 0.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-data", help="data yaml file")
    parser.add_argument("-data_path",
                        default='',
                        type=str,
                        help="path of data files")
    parser.add_argument("-seed_model", help="the seed nerual network model")
    parser.add_argument("-exp_dir", help="the directory to save the outputs")
    parser.add_argument("-transform",
                        help="feature transformation matrix or mvn statistics")
    parser.add_argument("-criterion",
                        type=str,
                        choices=["mmi", "mpfe", "smbr"],
                        help="set the sequence training crtierion")
    parser.add_argument(
        "-trans_model",
        help="the HMM transistion model, used for lattice generation")
    parser.add_argument(
        "-prior_path",
        help="the prior for decoder, usually named as final.occs in kaldi setup"
    )
    parser.add_argument(
        "-den_dir",
        help="the decoding graph directory to find HCLG and words.txt files")
    parser.add_argument("-lr", type=float, help="set the learning rate")
    parser.add_argument("-ce_ratio",
                        default=0.1,
                        type=float,
                        help="the ratio for ce regularization")
    parser.add_argument("-momentum",
                        default=0,
                        type=float,
                        help="set the momentum")
    parser.add_argument("-batch_size",
                        default=32,
                        type=int,
                        help="Override the batch size in the config")
    parser.add_argument("-data_loader_threads",
                        default=0,
                        type=int,
                        help="number of workers for data loading")
    parser.add_argument("-max_grad_norm",
                        default=5,
                        type=float,
                        help="max_grad_norm for gradient clipping")
    parser.add_argument("-sweep_size",
                        default=100,
                        type=float,
                        help="process n hours of data per sweep (default:60)")
    parser.add_argument("-num_epochs",
                        default=1,
                        type=int,
                        help="number of training epochs (default:1)")
    parser.add_argument('-print_freq',
                        default=10,
                        type=int,
                        metavar='N',
                        help='print frequency (default: 10)')
    parser.add_argument('-save_freq',
                        default=1000,
                        type=int,
                        metavar='N',
                        help='save model frequency (default: 1000)')

    args = parser.parse_args()

    # Config: YAML training config, overridden by CLI, plus data source list.
    with open(args.config) as f:
        config = yaml.safe_load(f)
    config['data_path'] = args.data_path
    config["sweep_size"] = args.sweep_size

    print("pytorch version:{}".format(th.__version__))

    with open(args.data) as f:
        data = yaml.safe_load(f)
    config["source_paths"] = [j for i, j in data['clean_source'].items()]

    print("Experiment starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    # Initialize Horovod: one GPU per local rank.
    hvd.init()
    th.cuda.set_device(hvd.local_rank())
    print("Run experiments with world size {}".format(hvd.size()))

    dataset = SpeechDataset(config)
    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        # Optional pre-learned feature transform (e.g. global MVN), pickled.
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)
        dataset.transform = transform

    train_dataloader = SeqDataloader(dataset,
                                     batch_size=args.batch_size,
                                     num_workers=args.data_loader_threads,
                                     distributed=True,
                                     test_only=False)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(train_dataloader)))

    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])
    model.cuda()

    # setup the optimizer
    optimizer = th.optim.SGD(model.parameters(),
                             lr=args.lr,
                             momentum=args.momentum)

    # Broadcast parameters and opterimizer state from rank 0 to all other processes.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())

    if os.path.isfile(args.seed_model):
        checkpoint = th.load(args.seed_model)
        state_dict = checkpoint['model']
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            # Strip the 'module.' prefix DataParallel adds to parameter names.
            name = k[7:]
            new_state_dict[name] = v
        model.load_state_dict(new_state_dict)
        print("=> loaded checkpoint '{}' ".format(args.seed_model))
    else:
        # BUG FIX: was `model_file`, an undefined name that raised NameError
        # instead of printing this message.
        sys.stderr.write('ERROR: The model file %s does not exist!\n' %
                         (args.seed_model))
        sys.exit(0)

    # Denominator graph artifacts produced by the Kaldi recipe.
    HCLG = args.den_dir + "/HCLG.fst"
    words_txt = args.den_dir + "/words.txt"
    silence_phones = args.den_dir + "/phones/silence.csl"

    if not os.path.isfile(HCLG):
        sys.stderr.write('ERROR: The HCLG file %s does not exist!\n' % (HCLG))
        sys.exit(0)
    if not os.path.isfile(words_txt):
        sys.stderr.write('ERROR: The words.txt file %s does not exist!\n' %
                         (words_txt))
        sys.exit(0)
    if not os.path.isfile(silence_phones):
        sys.stderr.write('ERROR: The silence phone file %s does not exist!\n' %
                         (silence_phones))
        sys.exit(0)
    with open(silence_phones) as f:
        # silence.csl is a single colon-separated line of phone ids.
        silence_ids = [int(i) for i in f.readline().strip().split(':')]

    if os.path.isfile(args.trans_model):
        trans_model = kaldi_hmm.TransitionModel()
        with kaldi_util.io.xopen(args.trans_model) as ki:
            trans_model.read(ki.stream(), ki.binary)
    else:
        sys.stderr.write('ERROR: The trans_model %s does not exist!\n' %
                         (args.trans_model))
        sys.exit(0)

    # now we can setup the decoder
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = config["decoder_config"]["beam"]
    decoder_opts.lattice_beam = config["decoder_config"]["lattice_beam"]
    decoder_opts.max_active = config["decoder_config"]["max_active"]
    acoustic_scale = config["decoder_config"]["acoustic_scale"]
    # Produce raw state-level lattices instead of compact (determinized) ones.
    decoder_opts.determinize_lattice = False
    asr_decoder = MappedLatticeFasterRecognizer.from_files(
        args.trans_model,
        HCLG,
        words_txt,
        acoustic_scale=acoustic_scale,
        decoder_opts=decoder_opts)

    # Log-prior over pdf-ids (from final.occs), subtracted to turn posteriors
    # into scaled likelihoods for decoding.
    prior = kaldi_util.io.read_matrix(args.prior_path).numpy()
    log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])), dtype=th.float)

    model.train()
    for epoch in range(args.num_epochs):
        run_train_epoch(model, optimizer, log_prior.cuda(), train_dataloader,
                        epoch, asr_decoder, trans_model, silence_ids, args)
        # save model (rank 0 only, one checkpoint per epoch)
        if hvd.rank() == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            checkpoint['epoch'] = epoch
            output_file = args.exp_dir + '/model.se.' + str(epoch) + '.tar'
            th.save(checkpoint, output_file)
train_df = pd.read_csv(os.path.join(dataset_dir, 'train_df.csv'), names=['path', 'sent']) train_df = train_df.dropna(how='any') print(train_df.head()) # test_df = pd.read_csv('test_df.csv', names=['id', 'sent']) save_file = os.path.join('save', 'chars') chars = get_chars('chinese', save_file, train_df) char_to_token = {c: i for i, c in enumerate(chars)} token_to_char = {i: c for c, i in char_to_token.items()} sos_token = char_to_token['<sos>'] eos_token = char_to_token['<eos>'] pad_token = char_to_token['<pad>'] train_dataset = SpeechDataset(train_df, dataset_dir, char_to_token) train_loader = AudioDataLoader(pad_token, train_dataset, batch_size=32, shuffle=True, drop_last=True) # #test_dataset = SpeechDataset(test_df, dataset_dir) # #test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn) input_size = 128 # num rows in instagram hidden_dim = 64 # 256*2 nodes in each LSTM num_layers = 3 dropout = 0.1 layer_norm = False encoder = Listener(input_size,
def main():
    """Cross-entropy training of an LSTM acoustic model, optionally
    distributed with Horovod (`-hvd`).

    Parses CLI + YAML configs, optionally learns a global mean/variance
    transform, trains for `-num_epochs` epochs with LR annealing after
    `-aneal_lr_epoch`, and saves a checkpoint per epoch (rank 0 only when
    running under Horovod).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-exp_dir")
    parser.add_argument("-dataPath",
                        default='',
                        type=str,
                        help="path of data files")
    parser.add_argument("-train_config")
    parser.add_argument("-data_config")
    parser.add_argument("-lr",
                        default=0.0001,
                        type=float,
                        help="Override the LR in the config")
    parser.add_argument("-batch_size",
                        default=32,
                        type=int,
                        help="Override the batch size in the config")
    parser.add_argument("-data_loader_threads",
                        default=0,
                        type=int,
                        help="number of workers for data loading")
    parser.add_argument("-max_grad_norm",
                        default=5,
                        type=float,
                        help="max_grad_norm for gradient clipping")
    parser.add_argument("-sweep_size",
                        default=200,
                        type=float,
                        help="process n hours of data per sweep (default:200)")
    parser.add_argument("-num_epochs",
                        default=1,
                        type=int,
                        help="number of training epochs (default:1)")
    # NOTE(review): `type=bool` on a flag is an argparse pitfall — any
    # non-empty string (even "False") parses as True. Left as-is to keep the
    # CLI unchanged; consider action='store_true'.
    parser.add_argument("-global_mvn",
                        default=False,
                        type=bool,
                        help="if apply global mean and variance normalization")
    parser.add_argument(
        "-resume_from_model",
        type=str,
        help="the model from which you want to resume training")
    parser.add_argument("-dropout", type=float, help="set the dropout ratio")
    parser.add_argument("-aneal_lr_epoch",
                        default=2,
                        type=int,
                        help="start to aneal the learning rate from this epoch"
                        )  # aneal -> anneal?
    parser.add_argument("-aneal_lr_ratio",
                        default=0.5,
                        type=float,
                        help="the ratio to aneal the learning rate")
    parser.add_argument('-p',
                        '--print-freq',
                        default=100,
                        type=int,
                        metavar='N',
                        help='print frequency (default: 100)')
    parser.add_argument('-hvd',
                        default=False,
                        type=bool,
                        help="whether to use horovod for training")

    args = parser.parse_args()

    with open(args.train_config) as f:
        config = yaml.safe_load(f)
    config["sweep_size"] = args.sweep_size

    with open(args.data_config) as f:
        data = yaml.safe_load(f)
    config["source_paths"] = [j for i, j in data['clean_source'].items()]
    # Optional augmentation sources: directional noise and room impulses.
    if 'dir_noise' in data:
        config["dir_noise_paths"] = [j for i, j in data['dir_noise'].items()]
    if 'rir' in data:
        config["rir_paths"] = [j for i, j in data['rir'].items()]
    config['data_path'] = args.dataPath

    print("Experiment starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    # Initialize Horovod (imported lazily so single-GPU runs don't need it).
    if args.hvd:
        import horovod.torch as hvd
        hvd.init()
        th.cuda.set_device(hvd.local_rank())
        print("Run experiments with world size {}".format(hvd.size()))

    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    trainset = SpeechDataset(config)
    # BUG FIX: was `distributed=args.multi_gpu` — the parser never defines
    # `multi_gpu`, so this line always raised AttributeError. The dataloader
    # should be distributed exactly when Horovod is enabled.
    train_dataloader = ChunkDataloader(trainset,
                                       batch_size=args.batch_size,
                                       distributed=args.hvd,
                                       num_workers=args.data_loader_threads)

    if args.global_mvn:
        transform = GlobalMeanVarianceNormalization()
        print("Estimating global mean and variance of feature vectors...")
        transform.learn_mean_and_variance_from_train_loader(
            trainset, trainset.stream_idx_for_transform, n_sample_to_use=2000)
        trainset.transform = transform
        print("Global mean and variance transform trained successfully!")
        # Persist the transform so decoding/eval can apply the same stats.
        with open(args.exp_dir + "/transform.pkl", 'wb') as f:
            pickle.dump(transform, f, pickle.HIGHEST_PROTOCOL)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(train_dataloader)))

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])

    # Start training
    th.backends.cudnn.enabled = True
    if th.cuda.is_available():
        model.cuda()

    # optimizer
    optimizer = th.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)

    if args.hvd:
        # Broadcast parameters and opterimizer state from rank 0 to all other processes.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        # Add Horovod Distributed Optimizer
        optimizer = hvd.DistributedOptimizer(
            optimizer, named_parameters=model.named_parameters())

    # criterion: frames labelled -100 (padding) are ignored.
    criterion = nn.CrossEntropyLoss(ignore_index=-100)

    start_epoch = 0
    if args.resume_from_model:
        assert os.path.isfile(args.resume_from_model
                              ), "ERROR: model file {} does not exit!".format(
                                  args.resume_from_model)
        checkpoint = th.load(args.resume_from_model)
        state_dict = checkpoint['model']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(state_dict)
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' ".format(args.resume_from_model))

    model.train()
    for epoch in range(start_epoch, args.num_epochs):
        # aneal learning rate: multiply by the ratio each epoch past the knee.
        if epoch > args.aneal_lr_epoch:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= args.aneal_lr_ratio

        run_train_epoch(model, optimizer, criterion, train_dataloader, epoch,
                        args)

        # save model (rank 0 only under Horovod)
        if not args.hvd or hvd.rank() == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            checkpoint['epoch'] = epoch
            output_file = args.exp_dir + '/model.' + str(epoch) + '.tar'
            th.save(checkpoint, output_file)
def test():
    """Decode the eval dataset with a trained CTC model and report CER/WER.

    Loads the checkpoint path and decode options from the INI config, runs
    greedy or beam decoding per the `Decode/decoder_type` setting, prints
    each reference/hypothesis pair, and finally prints character and word
    error rates plus total decode time.
    """
    args = parser.parse_args()
    cf = ConfigParser.ConfigParser()
    cf.read(args.conf)
    USE_CUDA = cf.getboolean('Training', 'USE_CUDA')
    model_path = cf.get('Model', 'model_file')
    data_dir = cf.get('Data', 'data_dir')
    beam_width = cf.getint('Decode', 'beam_width')
    # Checkpoint package created by CTC_Model.save_package during training.
    package = torch.load(model_path)
    rnn_param = package["rnn_param"]
    num_class = package["num_class"]
    n_feats = package['epoch']['n_feats']
    drop_out = package['_drop_out']
    decoder_type = cf.get('Decode', 'decoder_type')
    data_set = cf.get('Decode', 'eval_dataset')

    test_dataset = SpeechDataset(data_dir, data_set=data_set)
    model = CTC_Model(rnn_param=rnn_param,
                      num_class=num_class,
                      drop_out=drop_out)
    test_loader = SpeechDataLoader(test_dataset,
                                   batch_size=8,
                                   shuffle=False,
                                   num_workers=4,
                                   pin_memory=False)
    model.load_state_dict(package['state_dict'])
    model.eval()
    if USE_CUDA:
        model = model.cuda()

    # Last symbol of int2char is the space; CTC blank is index 0.
    if decoder_type == 'Greedy':
        decoder = GreedyDecoder(int2char,
                                space_idx=len(int2char) - 1,
                                blank_index = 0)
    else:
        decoder = BeamDecoder(int2char,
                              beam_width=beam_width,
                              blank_index = 0,
                              space_idx = len(int2char) - 1)

    total_wer = 0
    total_cer = 0
    start = time.time()
    for data in test_loader:
        inputs, target, input_sizes, input_size_list, target_sizes = data
        # (batch, time, feat) -> (time, batch, feat) for the packed RNN input.
        inputs = inputs.transpose(0,1)
        # NOTE(review): Variable(volatile=True) is pre-0.4 PyTorch; on modern
        # versions volatile is a no-op and torch.no_grad() is the replacement.
        inputs = Variable(inputs, volatile=True, requires_grad=False)
        if USE_CUDA:
            inputs = inputs.cuda()
        inputs = nn.utils.rnn.pack_padded_sequence(inputs, input_size_list)
        probs = model(inputs)
        probs = probs.data.cpu()

        decoded = decoder.decode(probs, input_size_list)
        # Targets arrive flattened; split per utterance then map to strings.
        targets = decoder._unflatten_targets(target, target_sizes)
        labels = decoder._process_strings(decoder._convert_to_strings(targets))

        for x in range(len(labels)):
            print("origin : " + labels[x])
            print("decoded: " + decoded[x])
        cer = 0
        wer = 0
        for x in range(len(labels)):
            # Accumulate edit distances; the decoder also tracks the running
            # reference word/char totals used as denominators below.
            cer += decoder.cer(decoded[x], labels[x])
            wer += decoder.wer(decoded[x], labels[x])
            decoder.num_word += len(labels[x].split())
            decoder.num_char += len(labels[x])
        total_cer += cer
        total_wer += wer
    # NOTE(review): these formulas print 100*(1 - distance/total), i.e. an
    # accuracy-style figure, although the labels say "error rate" — confirm
    # the intended convention.
    CER = (1 - float(total_cer) / decoder.num_char)*100
    WER = (1 - float(total_wer) / decoder.num_word)*100
    print("Character error rate on test set: %.4f" % CER)
    print("Word error rate on test set: %.4f" % WER)
    end = time.time()
    time_used = (end - start) / 60.0
    print("time used for decode %d sentences: %.4f minutes." %
          (len(test_dataset), time_used))