dynamics = DynamicsModel(model_net, buffer, model_loss_fnc, model_opt, env,
                         model_type='diff')

policy_net = SequentialNetwork(network_layers['policy_layers'])
value_net = QnetContinuousActions(env)
policy_opt = optim.Adam(policy_net.parameters(), lr=learning_rates['policy_lr'])
value_opt = optim.Adam(value_net.parameters(), lr=learning_rates['value_lr'],
                       weight_decay=1e-2)
actor = PolicyFunction(policy_net, policy_opt, target_net=True, tau=tau)
critic = ValueFunction(value_net, value_opt, target_net=True, tau=tau)

# Gather data and training
# ~~~~~~~~~~~~~~~~~~~~~~~~
global_step = buffer.populate_randomly(env, 0.1)
dynamics.train_model(20, params['minibatch_size'], noise_std=0.001)
buffer.empty()

for episode in tqdm(range(num_episodes)):

    episode_step = 0
    episode_reward = 0

    state = env.env.reset()
    terminal = False
    while terminal is False:
        with torch.no_grad():
            action = torch.clamp(
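# The snippet above builds a DynamicsModel with model_type='diff'.  A minimal
# stand-alone sketch of the usual idea behind a 'diff' model (the network
# predicts the state *change*, and the next state is state + delta) is shown
# below.  The class and argument names here are illustrative assumptions, not
# the repository's actual implementation.
import torch
import torch.nn as nn


class DiffDynamicsSketch(nn.Module):
    """Predicts next_state = state + f(state, action)."""

    def __init__(self, obs_size: int, action_size: int, hidden: int = 64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size + action_size, hidden),
            nn.ReLU(),
            nn.Linear(hidden, obs_size),
        )

    def forward(self, state: torch.Tensor, action: torch.Tensor) -> torch.Tensor:
        delta = self.net(torch.cat([state, action], dim=-1))
        return state + delta  # 'diff' models learn the residual, not the absolute state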
    ]
}
learning_rates = dict(policy_lr=1e-4, value_lr=1e-3)
critic_loss_fnc = torch.nn.SmoothL1Loss()
wandb.config.update(network_layers)
wandb.config.update(learning_rates)

# Initialisation
# ~~~~~~~~~~~~~~
policy_net = SequentialNetwork(network_layers['policy_layers'])
value_net = SequentialNetwork(network_layers['value_layers'])
policy_opt = optim.Adam(policy_net.parameters(), lr=learning_rates['policy_lr'])
value_opt = optim.Adam(value_net.parameters(), lr=learning_rates['value_lr'])
actor = PolicyFunction(policy_net, policy_opt)
critic = ValueFunction(value_net, value_opt)
buffer = ReplayMemory(params['buffer_size'])

# Get training
# ~~~~~~~~~~~~
global_step = 0
for episode in tqdm(range(num_episodes)):

    episode_reward = 0
    episode_step = 0

    state = env.env.reset()
    terminal = False
    while terminal is False:
        action, action_log_prob = actor.softmax_action(state)
        next_state, reward, terminal, _ = env.env.step(action)
        wandb.log({'reward': reward, 'step': global_step, 'episode': episode})
        episode_step += 1
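# actor.softmax_action(state) above returns a sampled discrete action together
# with its log-probability.  A minimal stand-alone sketch of that behaviour
# using PyTorch's Categorical distribution; the function name and the
# assumption that policy_net outputs unnormalised logits are ours.
import torch
from torch.distributions import Categorical


def softmax_action_sketch(policy_net, state):
    """Sample an action from the policy's softmax output and return its log-prob."""
    logits = policy_net(torch.as_tensor(state, dtype=torch.float32))
    dist = Categorical(logits=logits)
    action = dist.sample()
    return action.item(), dist.log_prob(action)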
tau = 0.001
wandb.config.update(learning_rates)
wandb.config.critic_loss_fnc = critic_loss_fnc
wandb.config.tau = tau

# Initialisation
# ~~~~~~~~~~~~~~
policy_net = GaussianPolicy(env)
value_net1 = QnetContinuousActions(env)
value_net2 = QnetContinuousActions(env)
policy_opt = optim.Adam(policy_net.parameters(), lr=learning_rates['policy_lr'])
value_opt1 = optim.Adam(value_net1.parameters(), lr=learning_rates['value_lr'])
value_opt2 = optim.Adam(value_net2.parameters(), lr=learning_rates['value_lr'])
actor = SACPolicy(policy_net, policy_opt)
critic1 = ValueFunction(value_net1, value_opt1, target_net=True, tau=tau)
critic2 = ValueFunction(value_net2, value_opt2, target_net=True, tau=tau)
buffer = ReplayMemory(params['buffer_size'])

# Get training
# ~~~~~~~~~~~~
global_step = 0
for episode in tqdm(range(num_episodes)):

    episode_reward = 0
    episode_step = 0

    state = env.env.reset()
    terminal = False
    while terminal is False:
        with torch.no_grad():
            action, action_log_prob = actor.action_selection(
                torch.from_numpy(state).float())
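# The twin critics above are trained towards SAC's entropy-regularised target.
# A compact sketch of that target computation, assuming `actor` returns
# (next_actions, next_log_probs) and that each critic exposes a target network
# callable; all names and the fixed alpha are illustrative assumptions.
import torch


def sac_target_sketch(actor, critic1_target, critic2_target,
                      next_states, rewards, terminals, gamma=0.99, alpha=0.2):
    """y = r + gamma * (1 - done) * (min_i Q_target_i(s', a') - alpha * log pi(a'|s'))."""
    with torch.no_grad():
        next_actions, next_log_probs = actor(next_states)
        q1 = critic1_target(next_states, next_actions)
        q2 = critic2_target(next_states, next_actions)
        min_q = torch.min(q1, q2)  # clipped double-Q to reduce overestimation
        return rewards + gamma * (1 - terminals) * (min_q - alpha * next_log_probs)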
critic_loss_fnc = torch.nn.MSELoss()
tau = 0.001
wandb.config.update(learning_rates)
wandb.config.tau = tau

# Initialisation
# ~~~~~~~~~~~~~~
policy_net = DeterministicPolicy(env)
value_net = QnetContinuousActions(env)
policy_opt = optim.Adam(policy_net.parameters(), lr=learning_rates['policy_lr'])
value_opt = optim.Adam(value_net.parameters(), lr=learning_rates['value_lr'],
                       weight_decay=1e-2)
actor = PolicyFunction(policy_net, policy_opt, target_net=True, tau=tau)
critic = ValueFunction(value_net, value_opt, target_net=True, tau=tau)
buffer = ReplayMemory(params['buffer_size'])

# Get training
# ~~~~~~~~~~~~
global_step = 0
for episode in tqdm(range(num_episodes)):

    episode_reward = 0
    episode_step = 0

    state = env.env.reset()
    terminal = False
    while terminal is False:
        with torch.no_grad():
            action = torch.clamp(
                actor.get_policy(state) + action_noise.sample(env.action_high.shape),
                -1, 1)
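# target_net=True with tau=0.001 above implies Polyak-averaged target networks
# for both the actor and the critic.  A minimal sketch of that soft update in
# plain PyTorch; the function name is ours, not the repository's.
import torch


@torch.no_grad()
def soft_update_sketch(online_net, target_net, tau=0.001):
    """target <- tau * online + (1 - tau) * target, applied parameter-wise."""
    for online_p, target_p in zip(online_net.parameters(), target_net.parameters()):
        target_p.mul_(1.0 - tau).add_(tau * online_p)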
                          model_type='diff')
dynamics = [dynamics1, dynamics2]

policy_net = SequentialNetwork(network_layers['policy_layers'])
value_net1 = QnetContinuousActions(env)
value_net2 = QnetContinuousActions(env)
policy_opt = optim.Adam(policy_net.parameters(), lr=learning_rates['policy_lr'])
value_opt1 = optim.Adam(value_net1.parameters(), lr=learning_rates['value_lr'],
                        weight_decay=1e-2)
value_opt2 = optim.Adam(value_net2.parameters(), lr=learning_rates['value_lr'],
                        weight_decay=1e-2)
actor = PolicyFunction(policy_net, policy_opt, target_net=True, tau=tau)
critic1 = ValueFunction(value_net1, value_opt1, target_net=True, tau=tau)
critic2 = ValueFunction(value_net2, value_opt2, target_net=True, tau=tau)
critics = [critic1, critic2]

# Gather data and training
# ~~~~~~~~~~~~~~~~~~~~~~~~
global_step = buffer.populate_randomly(env, 0.1)
for dynamic in dynamics:
    dynamic.train_model(20, params['minibatch_size'], noise_std=0.001)

for episode in tqdm(range(num_episodes)):

    episode_reward = 0
    episode_step = 0

    state = env.env.reset()
    terminal = False
    while terminal is False:
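# buffer.populate_randomly(env, 0.1) above seeds the replay buffer with
# random-policy transitions before the dynamics models are trained.  A rough
# sketch of what such a helper might do, under the assumption that 0.1 means
# "fill 10% of the buffer's capacity"; the attribute and method names used on
# buffer and env here are illustrative, not the repository's API.
def populate_randomly_sketch(buffer, env, fraction=0.1):
    """Fill a fraction of the buffer with transitions from a uniform random policy."""
    steps = int(buffer.capacity * fraction)  # assumes the buffer exposes its capacity
    state = env.reset()
    for _ in range(steps):
        action = env.action_space.sample()
        next_state, reward, terminal, _ = env.step(action)
        buffer.add(state, action, reward, next_state, terminal)  # assumed signature
        state = env.reset() if terminal else next_state
    return steps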
tau = 0.001
policy_delay = 2
wandb.config.update(learning_rates)
wandb.config.tau = tau
wandb.config.policy_delay = policy_delay

# Initialisation
# ~~~~~~~~~~~~~~
policy_net = DeterministicPolicy(env)
value_net1 = QnetContinuousActions(env)
value_net2 = QnetContinuousActions(env)
policy_opt = optim.Adam(policy_net.parameters(), lr=learning_rates['policy_lr'])
value_opt1 = optim.Adam(value_net1.parameters(), lr=learning_rates['value_lr'],
                        weight_decay=1e-2)
value_opt2 = optim.Adam(value_net2.parameters(), lr=learning_rates['value_lr'],
                        weight_decay=1e-2)
actor = PolicyFunction(policy_net, policy_opt, target_net=True, tau=tau)
critic1 = ValueFunction(value_net1, value_opt1, target_net=True, tau=tau)
critic2 = ValueFunction(value_net2, value_opt2, target_net=True, tau=tau)
buffer = ReplayMemory(params['buffer_size'])

# Get training
# ~~~~~~~~~~~~
global_step = 0
for episode in tqdm(range(num_episodes)):
    # env = gym.make('Pendulum-v0')
    # if episode % 50 == 0:
    #     env = gym.wrappers.Monitor(env, f'./td3_video/{episode}', force=True)

    episode_reward = 0
    episode_step = 0

    state = env.env.reset()
    terminal = False
    while terminal is False:
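# policy_delay=2 and the twin critics above are the TD3 ingredients: clipped
# double-Q targets, target-policy smoothing, and actor/target updates only
# every `policy_delay` critic updates.  A compact sketch of the critic target;
# the callables passed in (actor_target, critic*_target) and the noise
# constants are illustrative assumptions.
import torch


def td3_target_sketch(actor_target, critic1_target, critic2_target,
                      next_states, rewards, terminals,
                      gamma=0.99, noise_std=0.2, noise_clip=0.5):
    """y = r + gamma * (1 - done) * min_i Q_target_i(s', clamp(pi_target(s') + eps))."""
    with torch.no_grad():
        raw_actions = actor_target(next_states)
        # target-policy smoothing: clipped Gaussian noise on the target action
        noise = torch.clamp(torch.randn_like(raw_actions) * noise_std,
                            -noise_clip, noise_clip)
        next_actions = torch.clamp(raw_actions + noise, -1, 1)
        min_q = torch.min(critic1_target(next_states, next_actions),
                          critic2_target(next_states, next_actions))
        return rewards + gamma * (1 - terminals) * min_q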
                                 nn.ReLU(),
                                 nn.Linear(32, 64),
                                 nn.ReLU(),
                                 nn.Linear(64, env.action_size)],
                  'dueling_layers': [env.obs_size, 64, 128, env.action_size]}
learning_rates = {'Qnet_lr': 2e-4}
loss_function = torch.nn.SmoothL1Loss()
wandb.config.update(network_layers)
wandb.config.update(learning_rates)

# Initialisation
# ~~~~~~~~~~~~~~
net = SequentialNetwork(network_layers['Qnet_layers'])
wandb.config.value_layers = net.layers
opt = optim.Adam(net.parameters(), lr=learning_rates['Qnet_lr'])
Qnet = ValueFunction(net, opt, epsilon=epsilon, target_net=True)
buffer = ReplayMemory(params['buffer_size'])

# Get training
# ~~~~~~~~~~~~
global_step = 0
for episode in tqdm(range(num_episodes)):

    episode_reward = 0
    episode_step = 0

    state = env.env.reset()
    terminal = False
    while terminal is False:
        action = Qnet.epsilon_greedy_action(state, episode)
        next_state, reward, terminal, _ = env.env.step(action)
        wandb.log({'reward': reward, 'step': global_step, 'episode': episode})
        episode_step += 1
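# Qnet.epsilon_greedy_action(state, episode) above picks a random action with
# probability epsilon and the greedy (arg-max Q) action otherwise.  A minimal
# stand-alone sketch of that rule, assuming a fixed epsilon rather than the
# repository's episode-based schedule; the function name is ours.
import random
import torch


def epsilon_greedy_sketch(q_net, state, action_size, epsilon=0.05):
    """Return a random action with probability epsilon, else the greedy action."""
    if random.random() < epsilon:
        return random.randrange(action_size)
    with torch.no_grad():
        q_values = q_net(torch.as_tensor(state, dtype=torch.float32))
    return int(q_values.argmax().item())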