def test(arglist):
    env_name = arglist.env
    random_seed = arglist.test_seed
    n_episodes = arglist.n_episodes
    lr = 0.002
    max_timesteps = 3000
    render = arglist.render

    if not arglist.ensemble:
        filename = "{}_{}_{}".format(arglist.policy, env_name, arglist.train_seed)
    else:
        filename = "{}_{}_{}_ensemble".format(arglist.policy, env_name, arglist.train_seed)
    directory = "./train/{}".format(env_name)

    # env = gym.make(env_name)
    env = gen_envs(arglist)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Set random seeds
    env.seed(random_seed)
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": 0.99,
        "tau": 0.005,
        "policy_noise": 0.001,
        "noise_clip": 1.0,
        "policy_freq": 2,
    }
    policy = TD3.TD3(**kwargs)
    policy.load(os.path.join(directory, filename))

    total_reward_list = []
    for ep in range(1, n_episodes + 1):
        ep_reward = 0.0
        state = env.reset()
        for t in range(max_timesteps):
            action = policy.select_action(state)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.render()
            if done:
                break
        # print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
        total_reward_list.append(ep_reward)

    env.close()
    return total_reward_list
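# A hypothetical invocation sketch for test(): the source does not show how
# arglist is built, so this argparse setup only mirrors the fields test()
# reads above (env, test_seed, n_episodes, render, ensemble, policy,
# train_seed); the real parser likely defines more options and defaults.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--policy", default="td3")
parser.add_argument("--env", default="HalfCheetah-v2")
parser.add_argument("--train_seed", type=int, default=0)
parser.add_argument("--test_seed", type=int, default=0)
parser.add_argument("--n_episodes", type=int, default=10)
parser.add_argument("--render", action="store_true")
parser.add_argument("--ensemble", action="store_true")
arglist = parser.parse_args()

rewards = test(arglist)
print("Mean reward over {} episodes: {:.1f}".format(len(rewards), sum(rewards) / len(rewards)))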
def get_policy(arglist, kwargs, max_action):
    # Initialize policy
    if arglist.policy == "td3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = 0.0
        kwargs["noise_clip"] = 0.0
        kwargs["policy_freq"] = 2
        policy = TD3.TD3(**kwargs)
    elif arglist.policy == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif arglist.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)
    elif arglist.policy == "adv":
        kwargs["alpha"] = 0.01
        kwargs["adv_epsilon"] = 0.01
        kwargs["logdir"] = f"./tensorboard/{arglist.policy}_{arglist.env}_{arglist.train_seed}/"
        policy = TD3_adv2.TD3(**kwargs)
    else:
        raise NotImplementedError
    return policy
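# A hypothetical call site for get_policy(): kwargs is assembled the same way
# as in test() above; state_dim, action_dim and max_action are assumed to come
# from the environment, and the branch-specific hyperparameters (policy_noise,
# alpha, adv_epsilon, logdir, ...) are filled in by get_policy() itself.
kwargs = {
    "state_dim": state_dim,
    "action_dim": action_dim,
    "max_action": max_action,
    "discount": 0.99,
    "tau": 0.005,
}
policy = get_policy(arglist, kwargs, max_action)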
tau = 0.005           # Target network update rate
policy_noise = 0.2    # Std of the Gaussian noise added to the actions (policy smoothing)
noise_clip = 0.5      # Maximum value of the Gaussian noise added to the actions (policy)
policy_freq = 2       # Frequency of delayed policy updates

total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
episode_reward = 0
t0 = time.time()
distance_travelled = 0
max_episode_steps = 1000
done = True            # Episode over
load_model = True      # Inference. Set to False to train from scratch.

state_dim = 4
action_dim = 1
max_action = 5

replay_buffer = ReplayBuffer()
policy = TD3(state_dim, action_dim, max_action)
obs = np.array([])
new_obs = np.array([])
evaluations = []

if load_model:
    total_timesteps = max_timesteps
    policy.load("%s" % file_name, directory="./pytorch_models")

CarApp().run()
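# ReplayBuffer is not defined in this section. Below is a minimal sketch of
# the kind of uniform-sampling buffer the no-argument ReplayBuffer() calls
# above appear to assume (the buffer built with ReplayBuffer(state_dim,
# action_dim) further down is a different, array-based class). Method and
# field names here are illustrative, not the original implementation.
import random
import numpy as np

class ReplayBuffer:
    def __init__(self, max_size=int(1e6)):
        self.storage = []
        self.max_size = max_size
        self.ptr = 0

    def add(self, transition):
        # transition: (state, next_state, action, reward, done)
        if len(self.storage) == self.max_size:
            self.storage[self.ptr] = transition
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(transition)

    def sample(self, batch_size):
        # Uniformly sample a mini-batch and stack each field into an array
        batch = random.sample(self.storage, batch_size)
        states, next_states, actions, rewards, dones = map(np.array, zip(*batch))
        return states, next_states, actions, rewards.reshape(-1, 1), dones.reshape(-1, 1)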
kwargs = {
    "state_dim": state_dim,
    "action_dim": action_dim,
    "max_action": max_action,
    "discount": args.discount,
    "tau": args.tau,
}

# Initialize policy
if args.policy == "TD3":
    # Target policy smoothing is scaled wrt the action scale
    kwargs["policy_noise"] = args.policy_noise
    kwargs["noise_clip"] = args.noise_clip
    kwargs["policy_freq"] = args.policy_freq
    policy = TD3.TD3(**kwargs)
elif args.policy == "DDPG":
    policy = DDPG.DDPG(**kwargs)

if args.load_model != "":
    policy_file = file_name if args.load_model == "default" else args.load_model
    policy.load(f"./checkpoint/{policy_file}")

replay_buffer = ReplayBuffer(state_dim, action_dim)

# Evaluate untrained policy
evaluations = []
# evaluations = [eval_policy(policy, env, args.seed, group_name)]

# state, done = env.reset(group_name), False
episode_reward = 0
batch_size = 256      # Size of the training mini-batch
discount = 0.90       # Discount factor gamma used in the total discounted reward
polyak = 0.5          # Target network update rate
policy_noise = 0.02   # Std of the Gaussian noise added to the actions during training
noise_clip = 0.5      # Maximum value of the Gaussian noise added to the actions (policy)
policy_freq = 2       # Number of iterations to wait before the policy network (actor) is updated
actor_lr = 0.0001
critic_lr = 0.0001

# Loading the model
policy = TD3(action_dim, max_action,
             batch_size=batch_size,
             discount=discount,
             polyak=polyak,
             policy_noise=policy_noise,
             noise_clip=noise_clip,
             policy_freq=policy_freq,
             actor_lr=actor_lr,
             critic_lr=critic_lr,
             device=device)
policy.load(file_name, './pytorch_models/')

avg_reward = evaluate_policy(policy, env, eval_episodes=3)

# Wrap up recording
env.close()
env.stats_recorder.save_complete()
env.stats_recorder.done = True

# Recording in car view
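# evaluate_policy() is not shown in this section. Below is a minimal sketch of
# the standard TD3-style evaluation loop it stands for (deterministic actions,
# average return over a few episodes); the real helper in the source may
# differ in details such as logging or seeding.
def evaluate_policy(policy, env, eval_episodes=3):
    avg_reward = 0.0
    for _ in range(eval_episodes):
        obs, done = env.reset(), False
        while not done:
            action = policy.select_action(np.array(obs))
            obs, reward, done, _ = env.step(action)
            avg_reward += reward
    avg_reward /= eval_episodes
    print("Average reward over {} evaluation episodes: {:.2f}".format(eval_episodes, avg_reward))
    return avg_reward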
updateRate = 0.005
policyNoise = 0.2
noiseClip = 0.5
policyFreq = 2

totalTimeSteps = 0
timeStepsFromEval = 0
episodeNo = 0
episodeReward = 0
maxEpisodeSteps = 1000
done = True

stateDim = 4
actionDim = 1
maxAction = 5

replayBuffer = ReplayBuffer()
brain = TD3(stateDim, actionDim, maxAction)
observation = np.array([])
newobservation = np.array([])
evaluations = []

# Loading the model
if testCar:
    print("### Model loaded ###")
    totalTimeSteps = maximumTimeSteps
    brain.load("%s" % file_name, directory="./pytorch_models")

parent = Environment()
startTicks = pygame.time.get_ticks()

while True:
def train(env_name, warmup_iter=int(25e3), train_iter=int(1e6)):
    env = gym.make(env_name)
    x_dim = env.observation_space.shape[0]
    u_dim = env.action_space.shape[0]
    u_max = float(env.action_space.high[0])

    td3 = TD3(x_dim, u_dim, u_max)
    if torch.cuda.is_available():
        td3.actor.cuda()
        td3.critic_1.cuda()
        td3.critic_2.cuda()

    print('pre_train: ', eval(env_name, td3.actor))

    # Warm-up: fill the replay buffer with uniformly random actions
    x = env.reset()
    for i in range(warmup_iter):
        u_random = env.action_space.sample()
        x_next, reward, done, info = env.step(u_random)
        td3.replay_buffer.append([x, u_random, x_next, reward, done])
        x = x_next
        if done:
            x = env.reset()
            done = False

    for i in range(train_iter):
        if i % int(1e4) == 0:
            print('i: ', i, eval(env_name, td3.actor))
        if i % int(1e5) == 0:
            torch.save(td3.actor.state_dict(), f'td3_hop_actor_{time.strftime(r"%m_%d_%H:%M")}.pt')
            torch.save(td3.critic_1.state_dict(), f'td3_hop_critic_1_{time.strftime(r"%m_%d_%H:%M")}.pt')
            torch.save(td3.critic_2.state_dict(), f'td3_hop_critic_2_{time.strftime(r"%m_%d_%H:%M")}.pt')

        # Select an action and add exploration noise
        u = td3.actor(torch.tensor(x).float())
        u_noise = torch.normal(0, 0.1 * td3.u_max, size=u.shape)
        u = (u + u_noise).clip(-td3.u_max, td3.u_max).detach().cpu().numpy()

        x_next, reward, done, info = env.step(u)
        td3.replay_buffer.append([x, u, x_next, reward, done])
        x = x_next

        # Critic update on a sampled mini-batch
        batch = td3.sample_replay_buffer()
        critic_loss = td3.critic_loss(batch)
        td3.critic_optim.zero_grad()
        critic_loss.backward()
        td3.critic_optim.step()

        # Delayed policy and target network updates
        if i % td3.policy_update_period == 0:
            actor_loss = td3.actor_loss(batch)
            td3.actor_optim.zero_grad()
            actor_loss.backward()
            td3.actor_optim.step()

            for target_param, cr_param in zip(td3.critic_1_target.parameters(), td3.critic_1.parameters()):
                target_param.data.copy_(td3.tau * cr_param.data + (1 - td3.tau) * target_param.data)
            for target_param, cr_param in zip(td3.critic_2_target.parameters(), td3.critic_2.parameters()):
                target_param.data.copy_(td3.tau * cr_param.data + (1 - td3.tau) * target_param.data)
            for target_param, actor_param in zip(td3.actor_target.parameters(), td3.actor.parameters()):
                target_param.data.copy_(td3.tau * actor_param.data + (1 - td3.tau) * target_param.data)

        if done:
            x = env.reset()
            done = False

    print('train end: ', eval(env_name, td3.actor))
    return td3
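# A hypothetical entry point for the trainer above: 'Hopper-v2' is only an
# example environment id (the hard-coded 'td3_hop_*' checkpoint names suggest
# Hopper), and the final save path is illustrative.
if __name__ == '__main__':
    td3 = train('Hopper-v2', warmup_iter=int(25e3), train_iter=int(1e6))
    torch.save(td3.actor.state_dict(), 'td3_hop_actor_final.pt')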