def predict(self, path):
    self.eval()
    # Load the audio file and apply the short-time Fourier transform (spectrogram)
    wav = data.load_audio(path)
    spec = data.spectrogram(wav)
    spec.unsqueeze_(0)  # add a batch dimension: (1, freq, time)
    out = self.cnn(spec)
    out_len = torch.tensor([out.size(-1)])
    # Decode the network output into text
    text = self.decode(out, out_len)
    self.train()
    return text[0]
def predict(wav_path):
    wav = data.load_audio(wav_path)
    spec = data.spectrogram(wav)
    spec.unsqueeze_(0)  # add a batch dimension
    with torch.no_grad():
        spec = spec.cuda()
        y = model.cnn(spec)
        y = F.softmax(y, 1)  # per-frame class probabilities
        y_len = torch.tensor([y.size(-1)])
        y = y.permute(0, 2, 1)  # (batch, time, classes) as expected by the decoder
        out, score, offset, out_len = decoder.decode(y, y_len)
    # Map the best beam of the first (and only) batch item back to characters
    return translate(model.vocabulary, out[0][0], out_len[0][0])
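# Usage sketch (an assumption, not part of the original source): `model`, `decoder`,
# and `data` are presumed to be the module-level objects used by predict() above,
# and the WAV path below is hypothetical.
if __name__ == '__main__':
    transcript = predict('samples/test_utterance.wav')  # hypothetical path
    print(transcript)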
def test(model, test_dir, save_dir, image_size):
    model.eval()
    test_dirs = utils.listdir_nohidden(test_dir)
    for sub_folder in test_dirs:
        save_test_dir = os.path.join(save_dir, os.path.basename(sub_folder))
        audio_feature_files = glob.glob(
            os.path.join(sub_folder, 'audio_sample/*.mat'))
        audio_feature_files = utils.sort_filename(audio_feature_files)
        image_test_file = os.path.join(sub_folder, 'image_sample.jpg')
        audio_test_file = os.path.join(sub_folder, 'audio_sample.wav')
        audio_duration = utils.get_wav_duration(audio_test_file)

        input_image = data.load_image(image_test_file, image_size)
        input_audios = [
            data.load_audio(audio_feature_file)
            for audio_feature_file in audio_feature_files
        ]
        input_images = [input_image] * len(input_audios)

        # convert to tensor
        input_images = torch.from_numpy(
            np.array(input_images).transpose(
                (0, 3, 1, 2))).cuda()  # (seq_len, c, h, w)
        input_audios = torch.from_numpy(
            np.array(input_audios).transpose((0, 3, 1, 2))).cuda()

        model_type = model.module.model_type() if isinstance(
            model, torch.nn.DataParallel) else model.model_type()
        with torch.no_grad():
            if model_type == 'RNN':
                input_images = input_images.unsqueeze(0)  # (1, seq_len, c, h, w)
                input_audios = input_audios.unsqueeze(0)
                G_images = model(input_images,
                                 input_audios,
                                 valid_len=torch.tensor(
                                     [input_audios.shape[1]],
                                     dtype=torch.int32).cuda(),
                                 teacher_forcing_ratio=0)
                G_images = G_images.squeeze(0)
            else:
                G_images = model(input_images, input_audios)

        utils.save_video(audio_duration, audio_test_file,
                         G_images.cpu().detach().numpy(), save_test_dir)
    model.train()