def smoke_test():
    """Smoke test that trains the Wav2Letter model on randomly generated data.

    This is used only to quickly verify that the model can run without errors;
    the model is expected to perform poorly.
    """
    # 26 letters in the English alphabet + blank token
    grapheme_count = 26 + 1
    in_frame_len = 500  # arbitrary frame length
    sample_size = 50  # arbitrary sample size
    mfcc_features = 13  # keep the first 13 MFCC features, discard 13 - 29
    batch_size = 25  # arbitrary batch size
    seq_length = 20  # arbitrary max sequence length

    print("Randomly generating input and output data...")

    # create dummy X input data
    inputs = torch.randn(sample_size, in_frame_len, mfcc_features)

    # create dummy Y target data of class labels
    # from 1 - 26 (0 reserved for blank)
    targets = torch.randint(1, grapheme_count, (sample_size, seq_length))

    print("inputs shape", inputs.shape)
    print("target shape", targets.shape)

    model = Wav2Letter(mfcc_features, grapheme_count)
    print(model.layers)

    ctc_loss = nn.CTCLoss()
    optimizer = optim.Adam(model.parameters())

    # Each mfcc feature is a channel
    # https://pytorch.org/docs/stable/nn.html#torch.nn.Conv1d
    # transpose (sample_size, in_frame_len, mfcc_features)
    # to (sample_size, mfcc_features, in_frame_len)
    inputs = inputs.transpose(1, 2)

    model.fit(inputs, targets, optimizer, ctc_loss, batch_size, epoch=1, print_every=1)

    log_probs = model.eval(inputs[0])
    output = GreedyDecoder(log_probs)

    # print class labels per time step
    print("output labels", output)
    # print true labels
    print("true", targets[0])
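# The smoke test above trains on random tensors shaped (sample, time, n_mfcc).
# The sketch below shows one way real 13-dimensional MFCC features with that
# layout could be produced for a single wav file. It uses torchaudio, which is
# not part of this repo; the helper name, the wav path argument, and the
# single-channel assumption are illustrative only.
def example_mfcc_features(wav_path, n_mfcc=13):
    import torchaudio

    waveform, sample_rate = torchaudio.load(wav_path)  # (channels, samples)
    mfcc = torchaudio.transforms.MFCC(sample_rate=sample_rate, n_mfcc=n_mfcc)
    features = mfcc(waveform)  # (channels, n_mfcc, time)
    # keep the first channel and move time to the leading dimension,
    # matching the (time, n_mfcc) layout of a single dummy sample above
    return features[0].transpose(0, 1)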
def train(batch_size, epochs, data_dir):
    # load saved numpy arrays for google speech command
    gs = ImageCommand(data_dir)
    _inputs, _targets = gs.load_vectors()

    # parameters
    mfcc_features = 3
    grapheme_count = gs.intencode.grapheme_count

    print("training google speech dataset")
    print("data size", len(_inputs))
    print("batch_size", batch_size)
    print("epochs", epochs)
    print("num_mfcc_features", mfcc_features)
    print("grapheme_count", grapheme_count)

    # torch tensors
    inputs = torch.Tensor(_inputs)
    targets = torch.IntTensor(_targets)

    print("input shape", inputs.shape)
    print("target shape", targets.shape)

    # Initialize model, loss, optimizer
    model = Wav2Letter(mfcc_features, grapheme_count)
    print(model.layers)

    ctc_loss = nn.CTCLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-4)

    # Each mfcc feature is a channel
    # https://pytorch.org/docs/stable/nn.html#torch.nn.Conv1d
    # transpose (sample_size, in_frame_len, mfcc_features)
    # to (sample_size, mfcc_features, in_frame_len)
    inputs = inputs.transpose(1, 2)
    print("transposed input", inputs.shape)

    model.fit(inputs, targets, optimizer, ctc_loss, batch_size, epoch=epochs)

    sample = inputs[0]
    sample_target = targets[0]

    log_probs = model.eval(sample)
    output = GreedyDecoder(log_probs)

    print("sample target", sample_target)
    print("predicted", output)
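# A minimal command-line wrapper for the train() function above, sketched with
# argparse. The flag names and default values are assumptions and not part of
# the repo; only the train(batch_size, epochs, data_dir) signature is taken
# from the code above.
def example_train_cli():
    import argparse

    parser = argparse.ArgumentParser(description="Train Wav2Letter on preprocessed vectors")
    parser.add_argument("--batch_size", type=int, default=256)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--data_dir", type=str, default="./speech_data")
    args = parser.parse_args()

    train(args.batch_size, args.epochs, args.data_dir)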
def infer(opt):
    mfcc_features = opt.mfcc_features
    datasets_path = opt.datasets_path
    models_path = opt.output_path

    # load saved numpy arrays for google speech command
    gs = GoogleSpeechCommand()
    _inputs, _targets = gs.load_vectors(datasets_path)
    grapheme_count = gs.intencode.grapheme_count

    inputs = flow.Tensor(_inputs).to("cuda")
    targets = flow.tensor(_targets, dtype=flow.int).to("cuda")

    model = Wav2Letter(mfcc_features, grapheme_count)
    model.to("cuda")
    model.load_state_dict(flow.load(os.path.join(models_path, "model.pth")))

    int_encoder = opt.int_encoder
    with open(int_encoder, "rb") as f:
        int_to_char = pickle.load(f)["index2char"]
    decoder = GreedyDecoder(int_to_char)

    # Each mfcc feature is a channel: (sample, time, features) -> (sample, features, time)
    inputs = inputs.transpose(1, 2)

    # run inference on the last 1000 samples
    sample = inputs[-1000:]
    sample_target = targets[-1000:]

    log_probs = model(sample)
    output = decoder.decode(log_probs)

    pred_strings, output = decoder.convert_to_strings(output)
    sample_target_strings = decoder.convert_to_strings(
        sample_target, remove_repetitions=False, return_offsets=False)
    wer = decoder.wer(sample_target_strings, pred_strings)
    print("wer", wer)
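# For reference, the word error rate reported above is conventionally computed
# as the word-level edit distance between reference and hypothesis divided by
# the number of reference words. This generic sketch is independent of the
# repo's GreedyDecoder.wer implementation and only illustrates the metric.
def example_word_error_rate(reference, hypothesis):
    ref_words = reference.split()
    hyp_words = hypothesis.split()

    # dynamic-programming edit distance over words
    d = [[0] * (len(hyp_words) + 1) for _ in range(len(ref_words) + 1)]
    for i in range(len(ref_words) + 1):
        d[i][0] = i
    for j in range(len(hyp_words) + 1):
        d[0][j] = j
    for i in range(1, len(ref_words) + 1):
        for j in range(1, len(hyp_words) + 1):
            cost = 0 if ref_words[i - 1] == hyp_words[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,          # deletion
                          d[i][j - 1] + 1,          # insertion
                          d[i - 1][j - 1] + cost)   # substitution

    return d[len(ref_words)][len(hyp_words)] / max(len(ref_words), 1)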
def train(batch_size, epochs):
    gs = GoogleSpeechCommand()
    inputs, targets, input_lengths = gs.load_vectors("./speech_data")

    mfcc_features = 13
    grapheme_count = gs.intencode.grapheme_count
    index2char = gs.intencode.index2char

    # print training parameters
    print("training google speech dataset")
    print("data size", len(inputs))
    print("batch_size", batch_size)
    print("epochs", epochs)
    print("num_mfcc_features", mfcc_features)
    print("grapheme_count", grapheme_count)
    print("index2char", index2char)
    print("input shape", inputs.shape)
    print("target shape", targets.shape)

    model = Wav2Letter(mfcc_features, grapheme_count)
    model.fit(inputs, targets, input_lengths, batch_size, epoch=epochs)
    model.save("./model")
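# For reference, a generic greedy CTC collapse using the index2char mapping
# printed above. This is NOT the repo's GreedyDecoder; it assumes index 0 is
# the CTC blank (as in the smoke test) and that log_probs for a single sample
# has shape (grapheme_count, time).
def example_greedy_ctc_decode(log_probs, index2char, blank=0):
    import torch

    best_path = torch.argmax(log_probs, dim=0).tolist()  # best class per time step
    decoded = []
    prev = None
    for idx in best_path:
        # collapse repeated labels, then drop blanks
        if idx != prev and idx != blank:
            decoded.append(index2char[idx])
        prev = idx
    return "".join(decoded)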
def train(opt):
    batch_size = opt.batch_size
    epochs = opt.epochs
    mfcc_features = opt.mfcc_features
    rate = opt.rate
    datasets_path = opt.datasets_path

    # load saved numpy arrays for google speech command
    gs = GoogleSpeechCommand()
    _inputs, _targets = gs.load_vectors(datasets_path)
    grapheme_count = gs.intencode.grapheme_count

    print("training google speech dataset")
    print("data size", len(_inputs))
    print("batch_size", batch_size)
    print("epochs", epochs)
    print("num_mfcc_features", mfcc_features)
    print("grapheme_count", grapheme_count)

    inputs = flow.Tensor(_inputs).to("cuda")
    targets = flow.tensor(_targets, dtype=flow.int).to("cuda")

    # split train, eval, test
    data_size = len(_inputs)
    train_inputs = inputs[0:int(rate * data_size)]
    train_targets = targets[0:int(rate * data_size)]
    eval_inputs = inputs[int(rate * data_size):-1000]
    eval_targets = targets[int(rate * data_size):-1000]

    # Initialize model, loss, optimizer
    model = Wav2Letter(mfcc_features, grapheme_count)
    model.to("cuda")
    ctc_loss = nn.CTCLoss()
    optimizer = optim.Adam(model.parameters(), lr=opt.lr)

    # load pretrained model
    if opt.pretrained_model is not None:
        model.load_state_dict(flow.load(opt.pretrained_model))

    train_total_steps = int(train_inputs.size(0) // batch_size)
    eval_total_steps = int(eval_inputs.size(0) // batch_size)

    for epoch in range(epochs):
        samples_processed = 0
        avg_epoch_loss = 0

        for step in range(train_total_steps):
            train_data_batch = train_inputs[
                samples_processed:batch_size + samples_processed].transpose(1, 2)
            log_probs = model(train_data_batch)
            # (batch, classes, time) -> (time, batch, classes) as expected by CTCLoss
            log_probs = log_probs.transpose(1, 2).transpose(0, 1)

            targets_batch = train_targets[samples_processed:batch_size + samples_processed]
            input_lengths = flow.tensor(
                np.full((batch_size,), log_probs.shape[0]), dtype=flow.int).to("cuda")
            target_lengths = flow.tensor(
                [target.shape[0] for target in targets_batch], dtype=flow.int).to("cuda")

            loss = ctc_loss(log_probs, targets_batch, input_lengths, target_lengths)
            avg_epoch_loss += loss.numpy().item()

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            samples_processed += batch_size

        # evaluate
        int_encoder = opt.int_encoder
        with open(int_encoder, "rb") as f:
            int_to_char = pickle.load(f)["index2char"]
        decoder = GreedyDecoder(int_to_char)

        wer = 0
        start_index = 0
        for step in range(eval_total_steps):
            eval_data_batch = eval_inputs[start_index:batch_size + start_index].transpose(1, 2)
            eval_targets_batch = eval_targets[start_index:batch_size + start_index]

            eval_log_probs = model(eval_data_batch)
            output = decoder.decode(eval_log_probs)
            pred_strings, output = decoder.convert_to_strings(output)
            eval_target_strings = decoder.convert_to_strings(
                eval_targets_batch, remove_repetitions=False, return_offsets=False)
            wer += decoder.wer(eval_target_strings, pred_strings)

            start_index += batch_size

        print(
            "epoch", epoch + 1,
            "average epoch loss", avg_epoch_loss / train_total_steps,
            "wer", wer / eval_total_steps,
        )

        # save a checkpoint every 100 epochs
        if (epoch + 1) % 100 == 0:
            flow.save(
                model.state_dict(),
                os.path.join(opt.output_path, "model_{}.pth".format(epoch + 1)),
            )
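# Sketch of how the `opt` namespace consumed by train(opt) and infer(opt)
# might be built. Only the attribute names are taken from the functions above;
# the flag spellings and default values are assumptions.
def example_parse_opt():
    import argparse

    parser = argparse.ArgumentParser(description="Wav2Letter training/inference options")
    parser.add_argument("--batch_size", type=int, default=256)
    parser.add_argument("--epochs", type=int, default=1000)
    parser.add_argument("--mfcc_features", type=int, default=13)
    parser.add_argument("--rate", type=float, default=0.8, help="train/eval split ratio")
    parser.add_argument("--datasets_path", type=str, default="./speech_data")
    parser.add_argument("--lr", type=float, default=1e-4)
    parser.add_argument("--pretrained_model", type=str, default=None)
    parser.add_argument("--int_encoder", type=str, default="./int_encoder.pkl")
    parser.add_argument("--output_path", type=str, default="./checkpoints")
    return parser.parse_args()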