def generate(mode='random'):
    """Collect model-dynamics training data from the VTL environment and pickle it.

    Builds a VTL environment and an MFCC preprocessor, runs the data-generation
    routine selected by ``mode``, and dumps the filled replay buffer to disk.

    Args:
        mode: One of 'random', 'linear_transition', 'sigmoid_transition' —
            selects the exploration policy used to generate transitions.

    Raises:
        ValueError: If ``mode`` is not a known generation mode.  (Previously an
            unknown mode silently pickled an empty replay buffer.)
    """
    speaker_fname = os.path.join(r'C:\Study\SpeechAcquisitionModel\src\VTL', 'JD2.speaker')
    lib_path = os.path.join(r'C:\Study\SpeechAcquisitionModel\src\VTL', 'VocalTractLab2.dll')
    max_ep_duration = 5000  # ms
    timestep = 20           # ms per environment step
    episode_length = 50
    env = VTLEnv(lib_path, speaker_fname, timestep, max_episode_duration=max_ep_duration)
    # winlen is in seconds, hence the /1000 conversion from ms
    preproc = AudioPreprocessor(numcep=12, winlen=timestep / 1000)
    replay_buffer = ReplayBuffer(1000000)
    num_samples = 500000

    # Timestamped output directory keeps runs from clobbering each other.
    dt = str(datetime.datetime.now().strftime("%m_%d_%Y_%I_%M_%p_%S"))
    video_dir = r'C:\Study\SpeechAcquisitionModel\data\raw\VTL_model_dynamics_' + mode + '_' + dt + r'\Videos'
    buffer_fname = r'C:\Study\SpeechAcquisitionModel\data\raw\VTL_model_dynamics_' + mode + '_' + dt + r'\replay_buffer.pkl'
    # Creating video_dir also creates its parent, where the pickle is written.
    os.makedirs(video_dir, exist_ok=True)

    # Dispatch table replaces the original if/elif chain; fail fast on typos.
    generators = {
        'random': generate_model_dynamics_training_data_random_policy,
        'linear_transition': generate_model_dynamics_training_data_linear_transition,
        'sigmoid_transition': generate_model_dynamics_training_data_sigmoid_transition,
    }
    try:
        generator = generators[mode]
    except KeyError:
        raise ValueError("unknown generation mode: {!r}".format(mode))
    generator(env, preproc, replay_buffer, num_samples, episode_length, video_dir=video_dir)

    with open(buffer_fname, mode='wb') as f:
        pickle.dump(replay_buffer, f)
def main():
    """Build a target (action, goal, state) trajectory from a reference
    articulatory trajectory and launch training.

    Loads pickled tract/glottis parameter trajectories, replays them through
    the VTL environment to synthesize audio, converts each step's audio to an
    MFCC goal vector, and passes the assembled target trajectory to train().
    """
    speaker_fname = os.path.join(r'C:\Study\SpeechAcquisitionModel\src\VTL', 'JD2.speaker')
    lib_path = os.path.join(r'C:\Study\SpeechAcquisitionModel\src\VTL', 'VocalTractLab2.dll')
    ep_duration = 5000  # ms
    timestep = 20       # ms per environment step
    env = VTLEnv(lib_path, speaker_fname, timestep, max_episode_duration=ep_duration)
    # winlen in seconds (timestep is in ms)
    preproc = AudioPreprocessor(numcep=13, winlen=timestep / 1000)
    settings = {
        'state_dim': env.state_dim,
        'action_dim': env.action_dim,
        'state_bound': env.state_bound,
        # action range deliberately shrunk to 1/5 of env.action_bound
        'action_bound': [(p[0] / 5, p[1] / 5) for p in env.action_bound],  # env.action_bound,
        'goal_dim': preproc.get_dim(),
        'goal_bound': [(-50, 50) for _ in range(preproc.get_dim())],
        'episode_length': 40,
        'minibatch_size': 512,
        'max_train_per_simulation': 50,
        'save_video_step': 200,
        'actor_tau': 0.01,
        'actor_learning_rate': 0.000001,
        'model_dynamics_learning_rate': 0.05,
        'summary_dir': r'C:\Study\SpeechAcquisitionModel\reports\summaries',
        'videos_dir': r'C:\Study\SpeechAcquisitionModel\reports\videos'
    }
    replay_buffer = ReplayBuffer(100000)
    reference_fname = r'C:\Study\SpeechAcquisitionModel\src\VTL\references\a_i.pkl'
    with open(reference_fname, 'rb') as f:
        (tract_params, glottis_params) = pickle.load(f)
    # Full articulatory state = tract params concatenated with glottis params.
    target_trajectory = np.hstack((np.array(tract_params), np.array(glottis_params)))
    # Generate audio and then the goal target trajectory based on the given
    # state-space target trajectory.
    s0 = env.reset(target_trajectory[0])
    g0 = np.zeros(preproc.get_dim())
    target_actions = []
    target_goals = []
    target_goals.append(g0)
    target_states = []
    target_states.append(s0)
    # NOTE(review): range stops at len-2, so the final trajectory frame is
    # never replayed — presumably intentional; confirm against train()'s
    # expected episode length.
    for i in range(1, len(target_trajectory) - 1):
        # Action is the state delta needed to reach the next reference frame.
        action = np.subtract(target_trajectory[i], s0)
        s1, audio = env.step(action)
        # Scale float audio in [-1, 1] to int16 PCM range for the preprocessor.
        wav_audio = np.int16(audio * (2 ** 15 - 1))
        mfcc = preproc(wav_audio, env.audio_sampling_rate)
        isnans = np.isnan(mfcc)
        if isnans.any():
            print(mfcc)
            print("NAN OCCURED")
            raise TypeError("NAN in target")
        g1 = np.reshape(mfcc, (preproc.get_dim()))
        target_actions.append(action)
        target_goals.append(g1)
        target_states.append(s1)
        s0 = s1
        g0 = g1
    # NOTE(review): overwriting the first real goal with the second looks like
    # a workaround for a transient in the first audio frame — TODO confirm.
    target_goals[1] = target_goals[2]
    target_trajectory = (target_actions, target_goals, target_states)
    train(settings, env, replay_buffer, preproc, target_trajectory)
    return
# NOTE(review): orphaned `return x` — the enclosing function's `def` is not
# visible in this chunk, so the statement (and where the following setup code
# actually sits) cannot be verified from here.
return x
# Script-level setup: environment, preprocessor, and a previously generated
# replay buffer loaded from disk.
speaker_fname = os.path.join(r'C:\Study\SpeechAcquisitionModel\src\VTL', 'JD2.speaker')
lib_path = os.path.join(r'C:\Study\SpeechAcquisitionModel\src\VTL', 'VocalTractLab2.dll')
ep_duration = 5000  # ms
timestep = 20       # ms per environment step
episode_length = 40
env = VTLEnv(lib_path, speaker_fname, timestep, max_episode_duration=ep_duration)
# NOTE(review): timestep is in ms and audio_sampling_rate presumably in Hz,
# so this is samples*1000, not a window length in samples — TODO confirm
# (win_len also appears unused in the visible code).
win_len = int(timestep * env.audio_sampling_rate)
preproc = AudioPreprocessor(numcep=13, winlen=timestep / 1000)
dir_name = r'C:\Study\SpeechAcquisitionModel\data\raw\VTL_model_dynamics_simple_transition_08_23_2018_10_52_AM_54'
video_dir = dir_name + r'\Videos'
buffer_fname = dir_name + r'\replay_buffer.pkl'
with open(buffer_fname, mode='rb') as f:
    replay_buffer = pickle.load(f)
# Dimensions and bounds for state (s), action (a) and goal (g) spaces.
s_dim = env.state_dim
a_dim = env.action_dim
g_dim = preproc.get_dim()
s_bound = env.state_bound
a_bound = env.action_bound
# Immediately overwritten: actions restricted to 1/5 of the env's range.
a_bound = [(p[0] / 5, p[1] / 5) for p in env.action_bound]
g_bound = [(-40, 40) for _ in range(g_dim)]
def train(*args, **kwargs):
    """Train a stochastic LSTM model-dynamics network on rollouts collected
    with a stochastic policy in the VTL articulatory environment.

    Rollouts are produced by SimpleStochasticPolicy tracking an acoustic
    reference embedding (from a pretrained preprocessing net), stored in a
    replay buffer, and used to fit StochasticLstmModelDynamics by maximizing
    the log-likelihood of next states and next goal embeddings.  The trained
    model is saved to ``kwargs['save_dir']``.

    All configuration comes through ``kwargs``: 'train' (device, learning
    rates, num_steps, minibatch_size, updates_per_episode),
    'preprocessing_params', 'preproc_net_fname', 'model_dynamics_params',
    'policy_params', 'vtl_dir', 'reference_fname', 'buffer_size', 'save_dir'.
    """
    print(kwargs)
    torch.random.manual_seed(0)  # reproducible rollouts/initialization
    device = kwargs['train']['device']

    # 1. Init audio preprocessing
    preproc = AudioPreprocessor(**kwargs['preprocessing_params'])
    sr = kwargs['preprocessing_params']['sample_rate']

    # 2. Load preprocessing net (maps MFCC sequences to goal embeddings)
    preproc_net = torch.load(kwargs['preproc_net_fname']).to(device)

    # 3. Init model dynamics net
    md_net = StochasticLstmModelDynamics(**kwargs['model_dynamics_params']).to(device)
    optim = torch.optim.Adam(md_net.parameters(),
                             lr=kwargs['train']['learning_rate'],
                             eps=kwargs['train']['learning_rate_eps'])

    # 4. Init Policy
    policy = SimpleStochasticPolicy(**kwargs['policy_params']).to(device)

    # 5. Init environment
    speaker_fname = os.path.join(kwargs['vtl_dir'], 'JD2.speaker')
    lib_path = os.path.join(kwargs['vtl_dir'], 'VocalTractLab2.dll')
    ep_duration = 1000  # ms
    timestep = 20       # ms per environment step
    num_steps_per_ep = ep_duration // timestep
    env = VTLEnv(lib_path, speaker_fname, timestep, max_episode_duration=ep_duration)

    # 6. Load reference for policy: embed the reference wav once up front.
    reference_wav_fname = kwargs['reference_fname']
    reference_preproc = torch.from_numpy(preproc(reference_wav_fname)[np.newaxis]).float().to(device)
    _, _, reference = preproc_net(reference_preproc,
                                  seq_lens=np.array([reference_preproc.shape[1]]))
    reference = reference.detach().cpu().numpy().squeeze()

    # 7. Init replay buffer
    replay_buffer = ReplayBuffer(kwargs['buffer_size'])

    # 8. Train loop
    params = kwargs['train']
    policy.eval()   # policy is only used for data collection here
    md_net.train()
    for i in range(params['num_steps']):
        # ---- collect one episode ----
        state = env.reset()
        goal_state = np.zeros(kwargs['model_dynamics_params']['goal_dim'])
        states = [state]
        actions = []
        goal_states = [goal_state]
        hidden = None  # preproc_net's recurrent state, threaded through the episode
        for step in range(num_steps_per_ep):
            policy_input = np.concatenate((state, reference[step, :]))[np.newaxis]
            policy_input = torch.from_numpy(policy_input).float().to(device)
            action, _, _ = policy(policy_input)
            # action = (np.random.rand(action_space)) * 100.
            action = action.detach().cpu().numpy().squeeze()
            # zero out glottis-parameter actions; only vocal tract moves
            action[env.number_vocal_tract_parameters:] = 0.
            action = action * 0.1  # reduce amplitude for now
            new_state, audio = env.step(action, True)
            preproc_audio = preproc(audio, sr)[np.newaxis]
            preproc_audio = torch.from_numpy(preproc_audio).float().to(device)
            _, hidden, new_goal_state = preproc_net(preproc_audio,
                                                    seq_lens=np.array([preproc_audio.shape[1]]),
                                                    hidden=hidden)
            new_goal_state = new_goal_state.detach().cpu().numpy().squeeze()
            states.append(new_state)
            goal_states.append(new_goal_state)
            actions.append(action)
            state = new_state
            goal_state = new_goal_state
            env.render()
        replay_buffer.add((states, goal_states, actions, None, None))

        # ---- fit the dynamics model on replayed minibatches ----
        minibatch_size = kwargs['train']['minibatch_size']
        if replay_buffer.size() > minibatch_size:
            num_updates_per_epoch = kwargs['train']['updates_per_episode']
            for k in range(num_updates_per_epoch):
                # sample minibatch
                s0, g0, a, _, _ = replay_buffer.sample_batch(minibatch_size)
                # train
                seq_len = a.shape[1]
                goal_dim = kwargs['model_dynamics_params']["goal_dim"]
                s_bound = env.state_bound
                a_bound = env.action_bound
                s = torch.from_numpy(normalize(s0, s_bound)).float().to(device)
                g = torch.from_numpy(g0).float().to(device)
                a = torch.from_numpy(normalize(a, a_bound)).float().to(device)
                # forward prop: predict step t+1 from steps 0..t-1 plus actions
                s_pred, g_pred, s_prob, g_prob, state_dists, goal_dists = md_net(
                    s[:, :-1, :], g[:, :-1, :], a)
                # compute error; MSE terms are diagnostics only — the trained
                # objective is the negative log-likelihood under the predicted
                # distributions.
                mse_loss = MSELoss(reduction='sum')(g_pred, g[:, 1:, :]) / (seq_len * kwargs['train']['minibatch_size'])
                loss = -goal_dists.log_prob(g[:, 1:, :]).sum(dim=-1, keepdim=True).mean()
                state_mse_loss = MSELoss(reduction='sum')(s_pred, s[:, 1:, :]) / (seq_len * kwargs['train']['minibatch_size'])
                state_loss = -state_dists.log_prob(s[:, 1:, :]).sum(dim=-1, keepdim=True).mean()
                total_loss = loss + state_loss
                # backprop
                optim.zero_grad()
                total_loss.backward()
                optim.step()
                # baseline: how much the goal embedding changes step-to-step
                dynamics = MSELoss(reduction='sum')(g[:, 1:, :], g[:, :-1, :]) / (seq_len * kwargs['train']['minibatch_size'])
                print("\rstep: {} | stochastic_loss: {:.4f} | loss: {:.4f}| actual_dynamics: {:.4f} | state stochastic loss: {:.4f} | state_loss: {:.4f}".format(
                    i,
                    loss.detach().cpu().item(),
                    mse_loss.detach().cpu().item(),
                    dynamics.detach().cpu().item(),
                    state_loss.detach().cpu().item(),
                    state_mse_loss.detach().cpu().item()), end="")
        # FIX: was `step % 100 == 0`, which could never be true — after the
        # inner loop `step` is always num_steps_per_ep - 1 == 49.  The intent
        # is a newline every 100 outer steps over the '\r' progress line.
        if i % 100 == 0:
            print()

    # 9. Save model
    dt = str(datetime.datetime.now().strftime("%m_%d_%Y_%I_%M_%p"))
    md_fname = os.path.join(kwargs['save_dir'], '{}_{}.pt'.format("rnn_md", dt))
    torch.save(md_net, md_fname)
def train(*args, **kwargs):
    """Train a deterministic LSTM model-dynamics network on a pre-collected
    dataset of (states, actions, audio) episodes stored in a pandas pickle.

    Each training step samples a minibatch of episodes, embeds their audio
    with a pretrained preprocessing net, and fits LstmModelDynamics to
    predict the next acoustic embedding with an MSE loss.  Configuration is
    passed via ``kwargs``: 'train' (device, learning rates, num_steps,
    minibatch_size), 'preprocessing_params', 'preproc_net_fname',
    'model_dynamics_params', 'data_fname'.
    """
    print(kwargs)
    device = kwargs['train']['device']
    # 1. Init audio preprocessing
    preproc = AudioPreprocessor(**kwargs['preprocessing_params'])
    sr = kwargs['preprocessing_params']['sample_rate']
    # 2. Load preprocessing net (maps MFCC sequences to acoustic embeddings)
    preproc_net = torch.load(kwargs['preproc_net_fname']).to(device)
    # 3. Init model dynamics net
    md_net = LstmModelDynamics(**kwargs['model_dynamics_params']).to(device)
    optim = torch.optim.Adam(md_net.parameters(),
                             lr=kwargs['train']['learning_rate'],
                             eps=kwargs['train']['learning_rate_eps'])
    # 4. Load training set (DataFrame with 'states', 'actions', 'audio' columns)
    data_fname = kwargs['data_fname']
    df = pd.read_pickle(data_fname)
    # 5. Train loop
    params = kwargs['train']
    md_net.train()
    for i in range(params['num_steps']):
        sample = df.sample(n=kwargs['train']['minibatch_size'])
        states = np.stack(sample.loc[:, 'states'].values)
        actions = np.stack(sample.loc[:, 'actions'].values)
        audio = np.stack(sample.loc[:, 'audio'].values)
        # MFCC-process each episode's audio; result is (batch, time, features)
        preproc_audio = np.array([preproc(audio[j], sr) for j in range(audio.shape[0])])
        # NOTE(review): this assignment is dead — acoustic_states is overwritten
        # by the preproc_net call below.
        acoustic_states = torch.from_numpy(preproc_audio).float().to(device)
        # (abandoned per-feature normalization attempt, kept for reference)
        # acoustic_states = acoustic_states.view(-1, kwargs['model_dynamics_params']["acoustic_state_dim"])
        # mean_norm = acoustic_states.mean(dim=0)
        # mean_std = acoustic_states.std(dim=0)
        # acoustic_states = (acoustic_states - mean_norm.view(1, -1)) / mean_std.view(1, -1)
        # acoustic_states = acoustic_states.view(kwargs['train']['minibatch_size'], -1, kwargs['model_dynamics_params']["acoustic_state_dim"])
        # NOTE(review): seq_lens is a single-element array for the whole
        # minibatch — presumably all episodes share one length; confirm
        # preproc_net broadcasts it across the batch.
        _, _, acoustic_states = preproc_net(torch.from_numpy(preproc_audio).float().to(device),
                                            seq_lens=np.array([preproc_audio.shape[-2]]))
        seq_len = actions.shape[1]
        acoustic_state_dim = kwargs['model_dynamics_params']["acoustic_state_dim"]
        # forward prop
        lstm_outs, predicted_acoustic_states = md_net(
            acoustic_states,
            torch.from_numpy(states[:, :seq_len, :]).float().to(device),
            torch.from_numpy(actions).float().to(device))
        # compute error: prediction at t vs. observed embedding at t+1
        loss = MSELoss(reduction='sum')(
            predicted_acoustic_states[:, :-1, :].contiguous().view(-1, acoustic_state_dim),
            acoustic_states[:, 1:, :].contiguous().view(-1, acoustic_state_dim)
        ) / (seq_len * kwargs['train']['minibatch_size'])
        # backprop
        optim.zero_grad()
        loss.backward()
        optim.step()
        # Baseline: MSE between consecutive observed embeddings, i.e. the loss
        # a trivial "predict no change" model would get.
        dynamics = MSELoss(reduction='sum')(
            acoustic_states[:, :-1, :].contiguous().view(-1, acoustic_state_dim),
            acoustic_states[:, 1:, :].contiguous().view(-1, acoustic_state_dim)
        ) / (seq_len * kwargs['train']['minibatch_size'])
        print("\rstep: {} | loss: {:.4f}| actual_dynamics: {:.4f}".format(
            i, loss.detach().cpu().item(), dynamics.detach().cpu().item()), end="")
lstm_net.load_state_dict(torch.load(lstm_net_fname)) # instantiate environment and its properties speaker_fname = os.path.join(r'C:\Study\SpeechAcquisitionModel\src\VTL', 'JD2.speaker') lib_path = os.path.join(r'C:\Study\SpeechAcquisitionModel\src\VTL', 'VocalTractLab2.dll') ep_duration = 5000 timestep = 20 episode_length = 40 env = VTLEnv(lib_path, speaker_fname, timestep, max_episode_duration=ep_duration) win_len = int(timestep * env.audio_sampling_rate) preproc = AudioPreprocessor(numcep=12, winlen=timestep / 1000) replay_buffer = ReplayBuffer(1000000) s_dim = env.state_dim a_dim = env.action_dim # remember that lstm hidden state is a tuple h, c so we have to predict tuple (h, c) g_dim = 2 * lstm_model_settings['hidden_reccurent_cells_count'] s_bound = env.state_bound a_bound = env.action_bound g_bound = [(-1., 1.) for _ in range(g_dim)] action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(env.action_dim), sigma=0.01) n_minibatch_size = 512
def main():
    """Build a target goal trajectory from a reference wav using a pretrained
    classification LSTM's hidden state, then launch training.

    The reference audio is MFCC-processed and fed frame-by-frame through the
    LSTM; at each frame the recurrent state (h, c) is flattened and
    concatenated into a goal vector of size 2 * hidden_reccurent_cells_count.
    """
    speaker_fname = os.path.join(r'C:\Study\SpeechAcquisitionModel\src\VTL', 'JD2.speaker')
    lib_path = os.path.join(r'C:\Study\SpeechAcquisitionModel\src\VTL', 'VocalTractLab2.dll')
    ep_duration = 5000  # ms
    timestep = 20       # ms per environment step
    env = VTLEnv(lib_path, speaker_fname, timestep, max_episode_duration=ep_duration)
    # winlen/winstep in seconds; one MFCC frame per environment step
    preproc = AudioPreprocessor(numcep=12, winlen=timestep / 1000, winstep=timestep / 1000)

    # load lstm net for classification
    lstm_net_fname = r'C:\Study\SpeechAcquisitionModel\reports\VTL_sigmoid_transition_classification\checkpoints\simple_lstm_08_29_2018_03_13_PM_acc_0.9961.pt'
    lstm_net_classes = 25
    lstm_model_settings = {
        'dct_coefficient_count': 12,
        'label_count': lstm_net_classes + 2,
        'hidden_reccurent_cells_count': 50,
        'winlen': 0.02,
        'winstep': 0.02
    }
    lstm_net = LstmNet(lstm_model_settings)
    lstm_net.load_state_dict(torch.load(lstm_net_fname))

    settings = {
        'state_dim': env.state_dim,
        'action_dim': env.action_dim,
        'state_bound': env.state_bound,
        # action range deliberately shrunk to 1/5 of env.action_bound
        'action_bound': [(p[0] / 5, p[1] / 5) for p in env.action_bound],  # env.action_bound,
        # goal = flattened LSTM (h, c) pair, hence 2x the cell count
        'goal_dim': lstm_model_settings['hidden_reccurent_cells_count'] * 2,
        'goal_bound': [(-1., 1.)
                       for _ in range(lstm_model_settings['hidden_reccurent_cells_count'] * 2)],
        'episode_length': 40,
        'minibatch_size': 512,
        'max_train_per_simulation': 50,
        'save_video_step': 200,
        'summary_dir': r'C:\Study\SpeechAcquisitionModel\reports\summaries',
        'videos_dir': r'C:\Study\SpeechAcquisitionModel\reports\videos'
    }
    replay_buffer = ReplayBuffer(100000)

    # load target sound
    reference_wav_fname = r'C:\Study\SpeechAcquisitionModel\data\raw\VTL_model_dynamics_sigmoid_transition_08_28_2018_03_57_PM_03\Videos\a_i\episode_08_28_2018_03_57_PM_06.wav'
    reference_s0 = get_cf('a')
    reference_mfcc = preproc(reference_wav_fname)

    # feed target sound to lstm net and get target goal from hidden state
    hidden = None
    target_trajectory = []
    for i in range(reference_mfcc.shape[0]):
        net_input = torch.from_numpy(
            np.reshape(reference_mfcc[i, :], (1, 1, reference_mfcc.shape[1]))).float()
        _, hidden, _ = lstm_net(net_input, np.array([1]), hidden)
        # FIX: goal_dim is 2 * hidden size because the goal is the LSTM (h, c)
        # pair; the original concatenated hidden[0] with itself (copy-paste
        # bug), duplicating h and dropping the cell state c entirely.
        t = np.concatenate([
            hidden[0].detach().numpy().flatten(),
            hidden[1].detach().numpy().flatten()
        ])
        target_trajectory.append(t)
    train(settings, env, replay_buffer, preproc, lstm_net, target_trajectory, reference_s0)
    return