# Move all networks to the chosen device before evaluation
agent.actor.to(device)
agent.actor_target.to(device)
agent.actor_perturbed.to(device)
agent.critic.to(device)
agent.critic_target.to(device)

end_str = "_{}_{}".format(args.env_name, args.model_suffix)
agent.load_model("models/ddpg_actor" + end_str, "models/ddpg_critic" + end_str)

# Evaluation loop: roll out the deterministic (noise-free) policy and render
while True:
    episode_reward = 0
    state = torch.Tensor([env.reset()]).to(device)
    env.render()
    while True:
        action = agent.select_action(state, None, None)  # no exploration noise
        next_state, reward, done, _ = env.step(action.cpu().numpy()[0])
        env.render()
        episode_reward += reward

        # action = torch.Tensor(action).to(device)
        mask = torch.Tensor([not done]).to(device)  # unused during evaluation
        next_state = torch.Tensor([next_state]).to(device)
        reward = torch.Tensor([reward]).to(device)

        state = next_state
        print("Reward: {}; Episode reward: {}".format(reward.item(), episode_reward))
        if done:
            break
env.close()
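For reference, `load_model` here presumably just restores the serialized state dicts saved by the training script. A minimal sketch, assuming plain `torch.save`/`torch.load` checkpoints (an assumption, not necessarily this repo's exact implementation):

import torch

def load_model(agent, actor_path, critic_path):
    # Restore the actor/critic weights serialized during training
    if actor_path is not None:
        agent.actor.load_state_dict(torch.load(actor_path))
    if critic_path is not None:
        agent.critic.load_state_dict(torch.load(critic_path))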
'''
############# The DDPG part #############
'''
state = torch.Tensor([env.reset()])  # algo line 6

# Anneal the OU exploration noise linearly over the exploration phase
ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
    0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
ounoise.reset()

episode_reward = 0
for t in range(args.num_steps):  # line 7
    # forward pass through the actor network
    action = agent.select_action(state, ounoise)  # line 8
    next_state, reward, done, _ = env.step(action.numpy()[0])  # line 9
    episode_reward += reward

    action = torch.Tensor(action)
    mask = torch.Tensor([not done])
    next_state = torch.Tensor([next_state])
    reward = torch.Tensor([reward])

    # if i_episode % 10 == 0:
    #     env.render()

    memory.push(state, action, mask, next_state, reward)  # line 10
    state = next_state

    # Start updating once the buffer holds several batches
    if len(memory) > args.batch_size * 5:
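The `ounoise` object used above is an Ornstein-Uhlenbeck process, which produces temporally correlated exploration noise. A minimal sketch of such a class; the `scale` attribute and `reset()` method follow the snippet's usage, the remaining names and default coefficients are assumptions:

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""
    def __init__(self, action_dim, scale=0.1, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.scale = scale      # re-annealed each episode by the loop above
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Restart the process at the mean at the start of each episode
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state * self.scale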
action = agent.select_action(state)  # Sample action from policy

if len(memory) > args.start_steps:
    # Number of updates per step in environment
    for i in range(args.updates_per_step):
        # Update parameters of all the networks
        (critic_1_loss, critic_2_loss, policy_loss,
         _, _, policy_info) = agent.update_parameters(memory, args.batch_size, updates)
        updates += 1

next_state, reward, done, _ = env.step(action)  # Step
episode_steps += 1
total_numsteps += 1
eval_steps += 1
ckpt_steps += 1
map_steps += 1
episode_reward += reward

# Treat time-limit terminations as non-terminal so bootstrapping still happens
mask = 1 if episode_steps == max_episode_steps else float(not done)

memory.push(state, action, reward, next_state, mask)  # Append transition to memory
state = next_state

elapsed = round((time.time() - start_time + offset_time), 2)
logging("Episode: {}"
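The two critic losses returned by `update_parameters` indicate a twin-critic, SAC-style update. A hedged sketch of the target computation such an update typically performs; the network names, batch layout, and the `gamma`/`alpha` symbols are assumptions, not this repo's confirmed API:

import torch
import torch.nn.functional as F

def sac_critic_losses(critic, critic_target, policy, batch, gamma, alpha):
    state, action, reward, next_state, mask = batch
    with torch.no_grad():
        # Soft Bellman backup: bootstrap with the minimum of the two target
        # critics, subtract the entropy term, and mask at episode boundaries
        next_action, next_log_pi, _ = policy.sample(next_state)
        q1_next, q2_next = critic_target(next_state, next_action)
        min_q_next = torch.min(q1_next, q2_next) - alpha * next_log_pi
        next_q = reward + mask * gamma * min_q_next
    q1, q2 = critic(state, action)
    return F.mse_loss(q1, next_q), F.mse_loss(q2, next_q)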
for i_episode in range(args.num_episodes):
    state = torch.Tensor([env.reset()])

    if args.ou_noise:
        # Anneal the OU noise scale linearly over the exploration phase
        ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
            0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
        ounoise.reset()
    if args.param_noise and args.algo == "DDPG":
        agent.perturb_actor_parameters(param_noise)

    episode_reward = 0
    while True:
        action = agent.select_action(state, ounoise, param_noise)
        next_state, reward, done, _ = env.step(action.numpy()[0])
        total_numsteps += 1
        episode_reward += reward

        action = torch.Tensor(action)
        mask = torch.Tensor([not done])
        next_state = torch.Tensor([next_state])
        reward = torch.Tensor([reward])

        memory.push(state, action, mask, next_state, reward)
        state = next_state

        # Gradient updates once the buffer holds a full batch (see sketch below)
        if len(memory) > args.batch_size:
            for _ in range(args.updates_per_step):
                transitions = memory.sample(args.batch_size)
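The sampled `transitions` are usually zipped into a batch and handed to the agent. A sketch of the common continuation, assuming a `Transition` namedtuple whose field order matches the `memory.push(state, action, mask, next_state, reward)` call above; the return values are assumptions:

# Hypothetical continuation of the update loop
batch = Transition(*zip(*transitions))
value_loss, policy_loss = agent.update_parameters(batch)
# Inside update_parameters, the DDPG critic would regress toward
# r + gamma * mask * Q'(s', mu'(s')), and the actor ascend Q(s, mu(s)).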
frame_skip = args.frame_skip
frame_idx = 0
rewards = []
ep_num = 0

state = env.reset()
mpc_planner.reset()

episode_reward = 0
done = False
for step in range(max_steps):
    # Plan with the MPC controller, then hold the action for frame_skip frames
    action = mpc_planner.update(state)
    for _ in range(frame_skip):
        state, reward, done, _ = env.step(action.copy())
        if done:
            break
    episode_reward += reward
    frame_idx += 1

    if args.render:
        env.render("rgb_array", width=320 * 2, height=240 * 2)

    if args.done_util:
        if done:
            break

print('ep rew', ep_num, episode_reward)
rewards.append([frame_idx, episode_reward])
ep_num += 1
env.close()
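`mpc_planner.update(state)` returns the next action to execute. A minimal random-shooting sketch of what such a planner could do internally; the model, reward function, and all parameter names here are assumptions, not this repo's API:

import numpy as np

def mpc_update(state, dynamics_model, reward_fn, horizon=20, n_samples=500,
               action_dim=1, action_low=-1.0, action_high=1.0):
    """Random-shooting MPC: sample action sequences, roll them through a
    learned model, and return the first action of the best sequence."""
    # Candidate action sequences: (n_samples, horizon, action_dim)
    plans = np.random.uniform(action_low, action_high,
                              (n_samples, horizon, action_dim))
    returns = np.zeros(n_samples)
    for k in range(n_samples):
        s = state
        for t in range(horizon):
            s = dynamics_model(s, plans[k, t])
            returns[k] += reward_fn(s, plans[k, t])
    return plans[np.argmax(returns), 0]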
# Exploration-noise setup, disabled (commented out) in this variant:
'''
ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
    0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
ounoise.reset()
if args.param_noise and args.algo == "DDPG" and param_noise is not None:
    agent.perturb_actor_parameters(param_noise)
'''

episode_reward = 0
state = state.to(device)

action, action_probs, entropy = agent.select_action(
    state, ounoise, param_noise)
# print(action.cpu().numpy())
if args.discrete:
    use_action = action.squeeze(1).cpu().numpy()
else:
    use_action = action.cpu().numpy()[0]
next_state, reward, done, _ = env.step(use_action)
total_numsteps += 1
episode_reward += reward

# action = torch.LongTensor(action)
reward = torch.Tensor(reward).to(device).unsqueeze(1)
episode_rewards += reward

# Vectorized-env bookkeeping: mask is 0.0 for finished episodes, so
# final_rewards captures each episode's return as it terminates
mask = torch.Tensor([[1.0] if not x else [0.0] for x in done]).to(device)
final_rewards *= mask
final_rewards += (1 - mask) * episode_rewards
episode_rewards *= mask
next_state = torch.Tensor(next_state).to(device)

# Store one transition per parallel process
for i in range(args.num_processes):
    if args.discrete:
        memory.push(state[i], action_probs[i], mask[i], next_state[i],
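The truncated `memory.push` call above stores one transition per parallel worker. The buffer behind it is presumably the standard ring-buffer replay memory; a sketch follows, with the `Transition` field order inferred from the push calls in these snippets (the final reward field is an assumption):

import random
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'mask', 'next_state', 'reward'))

class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # Overwrite the oldest transition once the buffer is full
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)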