def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', type=str, default='AntBulletEnv-v0')
    parser.add_argument('--log_name', type=str, default='')
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--seed', type=int, default=0)
    args = parser.parse_args()

    if args.log_name:
        log_dir = os.path.join('logs', args.env_id, args.log_name)
    else:
        # Fall back to the most recently created log directory for this env.
        env_dir = os.path.join('logs', args.env_id, '*')
        dirs = glob.glob(env_dir)
        log_dir = max(dirs, key=os.path.getctime)
    print(f'using {log_dir}')

    env = gym.make(args.env_id)
    device = torch.device(
        "cuda" if args.cuda and torch.cuda.is_available() else "cpu")

    policy = GaussianPolicy(
        env.observation_space.shape[0],
        env.action_space.shape[0],
        hidden_units=[256, 256]).to(device)
    policy.load(os.path.join(log_dir, 'model', 'policy.pth'))
    grad_false(policy)

    def exploit(state):
        # Act deterministically (mean action) for evaluation.
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            _, _, action = policy.sample(state)
        return action.cpu().numpy().reshape(-1)

    env.render()
    while True:
        state = env.reset()
        episode_reward = 0.
        done = False
        while not done:
            env.render()
            action = exploit(state)
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            state = next_state
        print(f'total reward: {episode_reward}')
        time.sleep(1)
def testing():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', type=str, default='HalfCheetah-v2')
    parser.add_argument('--num_episode', type=int, default=10)
    args = parser.parse_args()

    num_episode = args.num_episode
    env = gym.make(args.env_name)
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")

    policy = GaussianPolicy(
        env.observation_space.shape[0],
        env.action_space.shape[0],
        hidden_units=[256, 256]).to(device)
    policy.load(os.path.join('models', args.env_name, 'policy.pth'))
    grad_false(policy)

    def exploit(state):
        # Act deterministically (mean action) for evaluation.
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            _, _, action = policy.sample(state)
        return action.cpu().numpy().reshape(-1)

    episode_rewards = []
    for _ in range(num_episode):
        state = env.reset()
        episode_reward = 0.
        done = False
        while not done:
            # Only render when evaluating a single episode.
            if num_episode <= 1:
                env.render()
            action = exploit(state)
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            state = next_state
        episode_rewards.append(episode_reward)

    print("Average reward of %s is %.1f" % (args.env_name, np.mean(episode_rewards)))
    print("Reward std of %s is %.1f" % (args.env_name, np.std(episode_rewards)))
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', type=str, default='HalfCheetah-v2')
    parser.add_argument('--log_name', type=str, default='sac-seed0-datetime')
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--seed', type=int, default=0)
    args = parser.parse_args()

    log_dir = os.path.join('logs', args.env_id, args.log_name)

    env = gym.make(args.env_id)
    device = torch.device(
        "cuda" if args.cuda and torch.cuda.is_available() else "cpu")

    policy = GaussianPolicy(
        env.observation_space.shape[0],
        env.action_space.shape[0],
        hidden_units=[256, 256]).to(device)
    policy.load(os.path.join(log_dir, 'model', 'policy.pth'))
    grad_false(policy)

    def exploit(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            _, _, action = policy.sample(state)
        return action.cpu().numpy().reshape(-1)

    state = env.reset()
    episode_reward = 0.
    done = False
    while not done:
        env.render()
        action = exploit(state)
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward
        state = next_state
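# Entry-point sketch (an assumption, not from the source): these evaluation
# scripts are presumably invoked directly from the command line, e.g.
# `python eval.py --env_id HalfCheetah-v2 --log_name sac-seed0-datetime`
# (the file name is hypothetical), with a guard like this at the bottom.
if __name__ == '__main__':
    run()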
def __init__(self, env, log_dir, num_steps=3000000, batch_size=256,
             lr=0.0003, hidden_units=[256, 256], memory_size=1e6,
             gamma=0.99, tau=0.005, entropy_tuning=True, ent_coef=0.2,
             multi_step=1, per=False, alpha=0.6, beta=0.4,
             beta_annealing=0.0001, grad_clip=None, updates_per_step=1,
             start_steps=10000, log_interval=10, target_update_interval=1,
             eval_interval=1000, cuda=True, seed=0):
    self.env = env

    torch.manual_seed(seed)
    np.random.seed(seed)
    self.env.seed(seed)
    torch.backends.cudnn.deterministic = True  # It can hurt performance.
    torch.backends.cudnn.benchmark = False

    self.device = torch.device(
        "cuda" if cuda and torch.cuda.is_available() else "cpu")

    self.policy = GaussianPolicy(
        self.env.observation_space.shape[0],
        self.env.action_space.shape[0],
        hidden_units=hidden_units).to(self.device)
    self.critic = TwinnedQNetwork(
        self.env.observation_space.shape[0],
        self.env.action_space.shape[0],
        hidden_units=hidden_units).to(self.device)
    self.critic_target = TwinnedQNetwork(
        self.env.observation_space.shape[0],
        self.env.action_space.shape[0],
        hidden_units=hidden_units).to(self.device).eval()

    # Copy parameters of the learning network to the target network.
    hard_update(self.critic_target, self.critic)
    # Disable gradient calculations of the target network.
    grad_false(self.critic_target)

    self.policy_optim = Adam(self.policy.parameters(), lr=lr)
    self.q1_optim = Adam(self.critic.Q1.parameters(), lr=lr)
    self.q2_optim = Adam(self.critic.Q2.parameters(), lr=lr)

    if entropy_tuning:
        # Target entropy is -|A|.
        self.target_entropy = -torch.prod(torch.Tensor(
            self.env.action_space.shape).to(self.device)).item()
        # We optimize log(alpha) instead of alpha.
        self.log_alpha = torch.zeros(
            1, requires_grad=True, device=self.device)
        self.alpha = self.log_alpha.exp()
        self.alpha_optim = Adam([self.log_alpha], lr=lr)
    else:
        # Fixed alpha.
        self.alpha = torch.tensor(ent_coef).to(self.device)

    if per:
        # Replay memory with prioritized experience replay.
        # See https://github.com/ku2482/rltorch/blob/master/rltorch/memory
        self.memory = PrioritizedMemory(
            memory_size, self.env.observation_space.shape,
            self.env.action_space.shape, self.device, gamma, multi_step,
            alpha=alpha, beta=beta, beta_annealing=beta_annealing)
    else:
        # Replay memory without prioritized experience replay.
        # See https://github.com/ku2482/rltorch/blob/master/rltorch/memory
        self.memory = MultiStepMemory(
            memory_size, self.env.observation_space.shape,
            self.env.action_space.shape, self.device, gamma, multi_step)

    self.log_dir = log_dir
    self.model_dir = os.path.join(log_dir, 'model')
    self.summary_dir = os.path.join(log_dir, 'summary')
    if not os.path.exists(self.model_dir):
        os.makedirs(self.model_dir)
    if not os.path.exists(self.summary_dir):
        os.makedirs(self.summary_dir)

    self.writer = SummaryWriter(log_dir=self.summary_dir)
    self.train_rewards = RunningMeanStats(log_interval)

    self.steps = 0
    self.learning_steps = 0
    self.episodes = 0
    self.num_steps = num_steps
    self.tau = tau
    self.per = per
    self.batch_size = batch_size
    self.start_steps = start_steps
    self.gamma_n = gamma ** multi_step
    self.entropy_tuning = entropy_tuning
    self.grad_clip = grad_clip
    self.updates_per_step = updates_per_step
    self.log_interval = log_interval
    self.target_update_interval = target_update_interval
    self.eval_interval = eval_interval
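# Usage sketch (hedged, not from the source): this constructor presumably
# belongs to a SAC agent class with a training loop. The class name `SacAgent`,
# the import path `agent`, and the `run()` method below are assumptions made
# for illustration only.
import os

import gym

from agent import SacAgent  # hypothetical import path

env = gym.make('HalfCheetah-v2')
log_dir = os.path.join('logs', 'HalfCheetah-v2', 'sac-seed0')
agent = SacAgent(env=env, log_dir=log_dir, cuda=True, seed=0)
agent.run()  # assumed training entry point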
def __init__(self, env, log_dir, num_steps=3000000,
             initial_latent_steps=100000, batch_size=256,
             latent_batch_size=32, num_sequences=8, lr=0.0003,
             latent_lr=0.0001, feature_dim=256, latent1_dim=32,
             latent2_dim=256, hidden_units=[256, 256], memory_size=1e5,
             gamma=0.99, target_update_interval=1, tau=0.005,
             entropy_tuning=True, ent_coef=0.2, leaky_slope=0.2,
             grad_clip=None, updates_per_step=1, start_steps=10000,
             training_log_interval=10, learning_log_interval=100,
             eval_interval=50000, cuda=True, seed=0):
    self.env = env
    self.observation_shape = self.env.observation_space.shape
    self.action_shape = self.env.action_space.shape
    self.action_repeat = self.env.action_repeat

    torch.manual_seed(seed)
    np.random.seed(seed)
    self.env.seed(seed)
    # torch.backends.cudnn.deterministic = True  # It can hurt performance.
    # torch.backends.cudnn.benchmark = False  # It can hurt performance.

    self.device = torch.device(
        "cuda" if cuda and torch.cuda.is_available() else "cpu")

    self.latent = LatentNetwork(
        self.observation_shape, self.action_shape, feature_dim,
        latent1_dim, latent2_dim, hidden_units, leaky_slope
        ).to(self.device)

    self.policy = GaussianPolicy(
        num_sequences * feature_dim
        + (num_sequences - 1) * self.action_shape[0],
        self.action_shape[0], hidden_units).to(self.device)

    self.critic = TwinnedQNetwork(
        latent1_dim + latent2_dim, self.action_shape[0], hidden_units
        ).to(self.device)
    self.critic_target = TwinnedQNetwork(
        latent1_dim + latent2_dim, self.action_shape[0], hidden_units
        ).to(self.device).eval()

    # Copy parameters of the learning network to the target network.
    soft_update(self.critic_target, self.critic, 1.0)
    # Disable gradient calculations of the target network.
    grad_false(self.critic_target)

    # The policy is updated without the encoder.
    self.policy_optim = Adam(self.policy.parameters(), lr=lr)
    self.q_optim = Adam(self.critic.parameters(), lr=lr)
    self.latent_optim = Adam(self.latent.parameters(), lr=latent_lr)

    if entropy_tuning:
        # Target entropy is -|A|.
        self.target_entropy = -self.action_shape[0]
        # We optimize log(alpha) because alpha is always positive.
        self.log_alpha = torch.zeros(
            1, requires_grad=True, device=self.device)
        self.alpha_optim = Adam([self.log_alpha], lr=lr)
        self.alpha = self.log_alpha.detach().exp()
    else:
        self.alpha = ent_coef

    self.memory = LazyMemory(
        memory_size, num_sequences, self.observation_shape,
        self.action_shape, self.device)

    self.log_dir = log_dir
    self.model_dir = os.path.join(log_dir, 'model')
    self.summary_dir = os.path.join(log_dir, 'summary')
    if not os.path.exists(self.model_dir):
        os.makedirs(self.model_dir)
    if not os.path.exists(self.summary_dir):
        os.makedirs(self.summary_dir)

    self.writer = SummaryWriter(log_dir=self.summary_dir)
    self.train_rewards = RunningMeanStats(training_log_interval)

    self.steps = 0
    self.learning_steps = 0
    self.episodes = 0

    self.initial_latent_steps = initial_latent_steps
    self.num_sequences = num_sequences
    self.num_steps = num_steps
    self.tau = tau
    self.batch_size = batch_size
    self.latent_batch_size = latent_batch_size
    self.start_steps = start_steps
    self.gamma = gamma
    self.entropy_tuning = entropy_tuning
    self.grad_clip = grad_clip
    self.updates_per_step = updates_per_step
    self.training_log_interval = training_log_interval
    self.learning_log_interval = learning_log_interval
    self.target_update_interval = target_update_interval
    self.eval_interval = eval_interval
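# Usage sketch (hedged, not from the source): unlike the agent above, this
# constructor reads `env.action_repeat`, so it expects a pixel-based wrapper
# (e.g. around a DeepMind Control Suite task) rather than a plain gym env.
# The class name `LatentAgent`, the wrapper `DmControlEnvForPytorch`, the
# import paths, and `run()` are all assumptions used for illustration.
import os

from env import DmControlEnvForPytorch  # hypothetical wrapper module
from agent import LatentAgent           # hypothetical agent class

env = DmControlEnvForPytorch('cheetah', 'run', action_repeat=4)
log_dir = os.path.join('logs', 'cheetah-run', 'slac-seed0')
agent = LatentAgent(env=env, log_dir=log_dir, cuda=True, seed=0)
agent.run()  # assumed training entry point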