def __init__(self, state_size, action_size, seed, n_hidden_units=128, n_layers=3):
    '''
    Build the local/target actor and critic, their optimizers, the noise
    process and the replay buffer.

    Params:
        state_size: forwarded to Actor and Critic as their first argument
        action_size: forwarded to Actor, Critic and OUNoise
        seed: seeds Python's `random` module and is forwarded to every component
        n_hidden_units: accepted for interface compatibility; not used here
        n_layers: accepted for interface compatibility; not used here
    '''
    self.state_size = state_size
    self.action_size = action_size
    # random.seed() returns None, so seed the module first and then keep
    # the seed value itself instead of the call's return value.
    random.seed(seed)
    self.seed = seed

    # actor
    self.actor = Actor(state_size, action_size, seed).to(device)
    self.actor_target = Actor(state_size, action_size, seed).to(device)
    self.actor_opt = optim.Adam(self.actor.parameters(), lr=1e-4)

    # critic
    self.critic = Critic(state_size, action_size, seed).to(device)
    self.critic_target = Critic(state_size, action_size, seed).to(device)
    self.critic_opt = optim.Adam(self.critic.parameters(), lr=3e-4, weight_decay=0.0001)

    # will add noise
    self.noise = OUNoise(action_size, seed)

    # experience replay
    self.replay = ReplayBuffer(seed)
def test_a2c(args=get_args()):
    """Train an A2C policy on ``args.task`` and check the reward threshold.

    Builds vectorized train/test environments, a shared ``Net`` with an
    ``Actor`` and ``Critic`` head, runs ``onpolicy_trainer`` and asserts that
    the best reward satisfies ``env.spec.reward_threshold``. When executed as
    a script, it also prints the result and renders one episode.
    """
    torch.set_num_threads(1)  # for poor CPU

    def make_env():
        return gym.make(args.task)

    env = make_env()
    obs_space = env.observation_space
    act_space = env.action_space
    args.state_shape = obs_space.shape or obs_space.n
    args.action_shape = act_space.shape or act_space.n

    # you can also use tianshou.env.SubprocVectorEnv
    train_envs = VectorEnv([make_env for _ in range(args.training_num)])
    test_envs = VectorEnv([make_env for _ in range(args.test_num)])

    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    # model
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = Actor(net, args.action_shape).to(args.device)
    critic = Critic(net).to(args.device)
    trainable = list(actor.parameters()) + list(critic.parameters())
    optimizer = torch.optim.Adam(trainable, lr=args.lr)
    policy = A2CPolicy(
        actor,
        critic,
        optimizer,
        torch.distributions.Categorical,
        args.gamma,
        gae_lambda=args.gae_lambda,
        vf_coef=args.vf_coef,
        ent_coef=args.ent_coef,
        max_grad_norm=args.max_grad_norm,
    )

    # collector
    replay = ReplayBuffer(args.buffer_size)
    train_collector = Collector(policy, train_envs, replay)
    test_collector = Collector(policy, test_envs)

    # log
    log_path = os.path.join(args.logdir, args.task, 'a2c')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        checkpoint = os.path.join(log_path, 'policy.pth')
        torch.save(policy.state_dict(), checkpoint)

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(
        policy, train_collector, test_collector,
        args.epoch, args.step_per_epoch, args.collect_per_step,
        args.repeat_per_collect, args.test_num, args.batch_size,
        stop_fn=stop_fn, save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])

    for finished in (train_collector, test_collector):
        finished.close()

    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test_ddpg(args=get_args()):
    """Train a DDPG policy on ``args.task`` and check the reward threshold.

    For ``'Pendulum-v0'`` the environment's reward threshold is overridden
    to -250. Training environments use ``VectorEnv`` and test environments
    use ``SubprocVectorEnv``. When executed as a script, the result is
    printed and one episode is rendered.
    """
    def make_env():
        return gym.make(args.task)

    env = make_env()
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    obs_space = env.observation_space
    act_space = env.action_space
    args.state_shape = obs_space.shape or obs_space.n
    args.action_shape = act_space.shape or act_space.n
    args.max_action = act_space.high[0]

    train_envs = VectorEnv([make_env for _ in range(args.training_num)])
    test_envs = SubprocVectorEnv([make_env for _ in range(args.test_num)])

    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    # model
    actor = Actor(
        args.layer_num, args.state_shape, args.action_shape,
        args.max_action, args.device)
    actor = actor.to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic = Critic(
        args.layer_num, args.state_shape, args.action_shape, args.device)
    critic = critic.to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    action_bounds = [env.action_space.low[0], env.action_space.high[0]]
    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        args.tau, args.gamma, args.exploration_noise,
        action_bounds,
        reward_normalization=True, ignore_done=True)

    # collector
    replay = ReplayBuffer(args.buffer_size)
    train_collector = Collector(policy, train_envs, replay)
    test_collector = Collector(policy, test_envs)

    # log
    writer = SummaryWriter(args.logdir + '/' + 'ddpg')

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector,
        args.epoch, args.step_per_epoch, args.collect_per_step,
        args.test_num, args.batch_size,
        stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])

    for finished in (train_collector, test_collector):
        finished.close()

    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def __init__(self, n_feature, n_action, lr_A=0.001, lr_C=0.01, GAMMA=0.1):
    """Create the actor and critic networks, one Adam optimizer for each,
    and the two empty history lists.

    lr_A / lr_C are the learning rates handed to the actor / critic
    optimizers; GAMMA is stored unchanged on the instance.
    """
    self.n_feature, self.n_action, self.GAMMA = n_feature, n_action, GAMMA

    # Histories start out empty.
    self.cost_his, self.value_his = [], []

    # The actor is constructed before the critic, as in the original order.
    actor_net = Actor(n_feature, n_action)
    critic_net = Critic(n_feature)
    self.actor, self.critic = actor_net, critic_net

    self.optimizer_actor = optim.Adam(actor_net.parameters(), lr=lr_A)
    self.optimizer_critic = optim.Adam(critic_net.parameters(), lr=lr_C)
def test(args=get_args()):
    """Load the saved DDPG policy and render one episode with it.

    The checkpoint is read from ``<args.logdir>/<args.task>/ddpg/policy.pth``.
    The networks are placed on CUDA when it is available and on the CPU
    otherwise, so the script also runs on machines without a GPU
    (``torch.load`` remaps the checkpoint via ``map_location``).
    """
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym_make()
    model_path = os.path.join(args.logdir, args.task, 'ddpg/policy.pth')
    layer = [1024, 512, 512, 512]
    # Previously hard-coded to 'cuda', which fails on CPU-only hosts.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    state_shape = env.observation_space.shape or env.observation_space.n
    action_shape = env.action_space.shape or env.action_space.n
    # NOTE(review): the bounds are passed as full arrays here; confirm this
    # matches what the policy was trained with.
    action_range = [env.action_space.low, env.action_space.high]

    actor = Actor(
        layer, state_shape, action_shape, action_range, device
    ).to(device)
    critic = Critic(
        layer, state_shape, action_shape, device
    ).to(device)
    # The optimizers are only needed to satisfy the DDPGPolicy constructor.
    actor_optim = torch.optim.Adam(actor.parameters())
    critic_optim = torch.optim.Adam(critic.parameters())
    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        action_range=action_range)
    policy.load_state_dict(torch.load(model_path, map_location=device))

    obs = env.reset()
    env.render()
    print(env.goal)
    done = False
    while not done:
        action, _ = policy.actor(obs.reshape(1, -1), eps=0.01)
        action = action.detach().cpu().numpy()[0]
        obs, reward, done, info = env.step(action)
        env.render()
def init_policy(args, env):
    """Assemble a ``TD3Policy`` with one actor and two critics from ``args``.

    ``env`` is accepted but not used by this function.
    """
    dev = args.device

    def new_critic():
        # Each critic gets its own Adam optimizer using args.critic_lr.
        net = Critic(layer=None,
                     state_shape=args.state_shape,
                     action_shape=args.action_shape,
                     device=dev).to(dev)
        return net, torch.optim.Adam(net.parameters(), lr=args.critic_lr)

    actor = Actor(layer=None,
                  state_shape=args.state_shape,
                  action_shape=args.action_shape,
                  action_range=args.action_range,
                  device=dev).to(dev)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)

    critic1, critic1_optim = new_critic()
    critic2, critic2_optim = new_critic()

    exploration = GaussianNoise(sigma=args.exploration_noise)
    return TD3Policy(actor, actor_optim,
                     critic1, critic1_optim,
                     critic2, critic2_optim,
                     args.tau, args.gamma,
                     exploration,
                     args.policy_noise,
                     args.update_actor_freq,
                     args.noise_clip,
                     args.action_range,
                     reward_normalization=args.rew_norm,
                     ignore_done=args.ignore_done,
                     estimation_step=args.n_step)
def test_sac(args=get_args()):
    """Train a SAC policy on ``args.task`` and check the reward threshold.

    For ``'Pendulum-v0'`` the reward threshold is overridden to -250. The
    policy uses one ``ActorProb`` and two ``Critic`` networks, each with its
    own Adam optimizer. When executed as a script, the result is printed and
    one episode is rendered.
    """
    torch.set_num_threads(1)  # we just need only one thread for NN

    def make_env():
        return gym.make(args.task)

    env = make_env()
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    obs_space = env.observation_space
    act_space = env.action_space
    args.state_shape = obs_space.shape or obs_space.n
    args.action_shape = act_space.shape or act_space.n
    args.max_action = act_space.high[0]

    # you can also use tianshou.env.SubprocVectorEnv
    train_envs = VectorEnv([make_env for _ in range(args.training_num)])
    test_envs = VectorEnv([make_env for _ in range(args.test_num)])

    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    # model
    def new_critic():
        net = Critic(args.layer_num, args.state_shape,
                     args.action_shape, args.device).to(args.device)
        return net, torch.optim.Adam(net.parameters(), lr=args.critic_lr)

    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1, critic1_optim = new_critic()
    critic2, critic2_optim = new_critic()
    action_bounds = [env.action_space.low[0], env.action_space.high[0]]
    policy = SACPolicy(actor, actor_optim,
                       critic1, critic1_optim,
                       critic2, critic2_optim,
                       args.tau, args.gamma, args.alpha,
                       action_bounds,
                       reward_normalization=True, ignore_done=True)

    # collector
    replay = ReplayBuffer(args.buffer_size)
    train_collector = Collector(policy, train_envs, replay)
    test_collector = Collector(policy, test_envs)

    # log
    log_path = os.path.join(args.logdir, args.task, 'sac')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        checkpoint = os.path.join(log_path, 'policy.pth')
        torch.save(policy.state_dict(), checkpoint)

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector,
        args.epoch, args.step_per_epoch, args.collect_per_step,
        args.test_num, args.batch_size,
        stop_fn=stop_fn, save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])

    for finished in (train_collector, test_collector):
        finished.close()

    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def _build_ppo_policy(actor_cls, state_shape, args):
    """Build one PPO policy whose actor head is ``actor_cls`` on a shared Net."""
    net = Net(args.layer_num, state_shape, device=args.device)
    actor = actor_cls(net, (10, )).to(args.device)
    critic = Critic(net).to(args.device)
    # orthogonal initialization
    for m in list(actor.modules()) + list(critic.modules()):
        if isinstance(m, torch.nn.Linear):
            torch.nn.init.orthogonal_(m.weight)
            torch.nn.init.zeros_(m.bias)
    optim = torch.optim.Adam(
        list(actor.parameters()) + list(critic.parameters()), lr=args.lr)
    dist = torch.distributions.Categorical
    return PPOPolicy(actor, critic, optim, dist,
                     discount_factor=args.gamma,
                     max_grad_norm=args.max_grad_norm,
                     eps_clip=args.eps_clip,
                     vf_coef=args.vf_coef,
                     ent_coef=args.ent_coef,
                     gae_lambda=args.gae_lambda,
                     reward_normalization=args.rew_norm,
                     dual_clip=args.dual_clip,
                     value_clip=args.value_clip)


def build_policy(no, args):
    """Return the PPO policy for agent index ``no``.

    Args:
        no: agent selector. 0 builds the server policy (``ServerActor`` on
            ``args.state_shape``); 2 builds a ``RelayActor`` on a ``(4, )``
            input; every other value (including 1) builds an ``NFActor`` on
            a ``(4, )`` input.
        args: namespace providing the network and PPO hyper-parameters
            (``layer_num``, ``device``, ``lr``, ``gamma``, ...).

    Returns:
        The constructed ``PPOPolicy``.
    """
    if no == 0:
        # server policy
        return _build_ppo_policy(ServerActor, args.state_shape, args)
    if no == 2:
        return _build_ppo_policy(RelayActor, (4, ), args)
    # no == 1 (ND policy) and every remaining index use the same setup
    return _build_ppo_policy(NFActor, (4, ), args)
def test_ppo(args=get_args()):
    """Train a PPO policy with a Normal action distribution on ``args.task``.

    For ``'Pendulum-v0'`` the reward threshold is overridden to -250. The
    function asserts that the best reward from ``onpolicy_trainer``
    satisfies ``env.spec.reward_threshold``. When executed as a script, the
    result is printed and one episode is rendered.
    """
    torch.set_num_threads(1)  # we just need only one thread for NN

    def make_env():
        return gym.make(args.task)

    env = make_env()
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    obs_space = env.observation_space
    act_space = env.action_space
    args.state_shape = obs_space.shape or obs_space.n
    args.action_shape = act_space.shape or act_space.n
    args.max_action = act_space.high[0]

    # you can also use tianshou.env.SubprocVectorEnv
    train_envs = VectorEnv([make_env for _ in range(args.training_num)])
    test_envs = VectorEnv([make_env for _ in range(args.test_num)])

    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    # model
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    critic = Critic(args.layer_num, args.state_shape,
                    device=args.device).to(args.device)
    trainable = list(actor.parameters()) + list(critic.parameters())
    optimizer = torch.optim.Adam(trainable, lr=args.lr)
    # action_range is deliberately not passed:
    # if clip the action, ppo would not converge :)
    policy = PPOPolicy(
        actor, critic, optimizer, torch.distributions.Normal, args.gamma,
        max_grad_norm=args.max_grad_norm,
        eps_clip=args.eps_clip,
        vf_coef=args.vf_coef,
        ent_coef=args.ent_coef,
        reward_normalization=args.rew_norm,
        dual_clip=args.dual_clip,
        value_clip=args.value_clip,
        gae_lambda=args.gae_lambda)

    # collector
    replay = ReplayBuffer(args.buffer_size)
    train_collector = Collector(policy, train_envs, replay)
    test_collector = Collector(policy, test_envs)

    # log
    log_path = os.path.join(args.logdir, args.task, 'ppo')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        checkpoint = os.path.join(log_path, 'policy.pth')
        torch.save(policy.state_dict(), checkpoint)

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(
        policy, train_collector, test_collector,
        args.epoch, args.step_per_epoch, args.collect_per_step,
        args.repeat_per_collect, args.test_num, args.batch_size,
        stop_fn=stop_fn, save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])

    for finished in (train_collector, test_collector):
        finished.close()

    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def _test_ppo(args=get_args()):
    """PPO demo on ``args.task`` using subprocess-vectorized environments.

    just a demo, I have not made it work :(

    The policy is built with an explicit ``action_range`` and the training
    collector gathers ``args.step_per_epoch`` steps before the trainer runs.
    When executed as a script, the result is printed and one episode is
    rendered.
    """
    def make_env():
        return gym.make(args.task)

    env = make_env()
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    obs_space = env.observation_space
    act_space = env.action_space
    args.state_shape = obs_space.shape or obs_space.n
    args.action_shape = act_space.shape or act_space.n
    args.max_action = act_space.high[0]

    train_envs = SubprocVectorEnv(
        [make_env for _ in range(args.training_num)])
    test_envs = SubprocVectorEnv(
        [make_env for _ in range(args.test_num)])

    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    # model
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    critic = Critic(args.layer_num, args.state_shape,
                    device=args.device).to(args.device)
    trainable = list(actor.parameters()) + list(critic.parameters())
    optimizer = torch.optim.Adam(trainable, lr=args.lr)
    action_bounds = [env.action_space.low[0], env.action_space.high[0]]
    policy = PPOPolicy(
        actor, critic, optimizer, torch.distributions.Normal, args.gamma,
        max_grad_norm=args.max_grad_norm,
        eps_clip=args.eps_clip,
        vf_coef=args.vf_coef,
        ent_coef=args.ent_coef,
        action_range=action_bounds)

    # collector
    replay = ReplayBuffer(args.buffer_size)
    train_collector = Collector(policy, train_envs, replay)
    test_collector = Collector(policy, test_envs)
    train_collector.collect(n_step=args.step_per_epoch)

    # log
    writer = SummaryWriter(args.logdir + '/' + 'ppo')

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(
        policy, train_collector, test_collector,
        args.epoch, args.step_per_epoch, args.collect_per_step,
        args.repeat_per_collect, args.test_num, args.batch_size,
        stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])

    for finished in (train_collector, test_collector):
        finished.close()

    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test_sac(args=get_args()):
    """Train a SAC policy on ``args.task``, optionally with a learned alpha.

    When ``args.auto_alpha`` is set, alpha is passed to ``SACPolicy`` as a
    ``(target_entropy, log_alpha, alpha_optim)`` tuple; otherwise the fixed
    ``args.alpha`` is used. Exploration noise is an ``OUNoise`` built from
    ``args.noise_std``. When executed as a script, the result is printed and
    one episode is rendered.
    """
    def make_env():
        return gym.make(args.task)

    env = make_env()
    obs_space = env.observation_space
    act_space = env.action_space
    args.state_shape = obs_space.shape or obs_space.n
    args.action_shape = act_space.shape or act_space.n
    args.max_action = act_space.high[0]

    train_envs = VectorEnv([make_env for _ in range(args.training_num)])
    test_envs = VectorEnv([make_env for _ in range(args.test_num)])

    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    # model
    def new_critic():
        net = Critic(args.layer_num, args.state_shape,
                     args.action_shape, args.device).to(args.device)
        return net, torch.optim.Adam(net.parameters(), lr=args.critic_lr)

    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1, critic1_optim = new_critic()
    critic2, critic2_optim = new_critic()

    alpha = args.alpha
    if args.auto_alpha:
        target_entropy = -np.prod(env.action_space.shape)
        log_alpha = torch.zeros(1, requires_grad=True, device=args.device)
        alpha_optim = torch.optim.Adam([log_alpha], lr=args.alpha_lr)
        alpha = (target_entropy, log_alpha, alpha_optim)

    action_bounds = [env.action_space.low[0], env.action_space.high[0]]
    policy = SACPolicy(actor, actor_optim,
                       critic1, critic1_optim,
                       critic2, critic2_optim,
                       args.tau, args.gamma, alpha,
                       action_bounds,
                       reward_normalization=args.rew_norm,
                       ignore_done=True,
                       exploration_noise=OUNoise(0.0, args.noise_std))

    # collector
    replay = ReplayBuffer(args.buffer_size)
    train_collector = Collector(policy, train_envs, replay)
    test_collector = Collector(policy, test_envs)

    # log
    log_path = os.path.join(args.logdir, args.task, 'sac')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        checkpoint = os.path.join(log_path, 'policy.pth')
        torch.save(policy.state_dict(), checkpoint)

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector,
        args.epoch, args.step_per_epoch, args.collect_per_step,
        args.test_num, args.batch_size,
        stop_fn=stop_fn, save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])

    for finished in (train_collector, test_collector):
        finished.close()

    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
])) assert dataset dataloader = torch.utils.data.DataLoader(dataset, batch_size=opt.batch_size, shuffle=True, num_workers=opt.n_cpu) def weights_init(m): classname = m.__class__.__name__ if classname.find('Conv') != -1: m.weight.data.normal_(0.0, 0.02) elif classname.find('BatchNorm') != -1: m.weight.data.normal_(1.0, 0.02) m.bias.data.fill_(0) # create the objects for two networks and for the two optimizers generator = Generator(latent=opt.latent, channels=opt.channels, num_filters=opt.num_filters) critic = Critic(channels=opt.channels, num_filters=opt.num_filters) optimizer_G = torch.optim.RMSprop(generator.parameters(), lr=opt.learning_rate) optimizer_C = torch.optim.RMSprop(critic.parameters(), lr=opt.learning_rate) # put the nets on gpu device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") generator, critic = generator.to(device), critic.to(device) generator.apply(weights_init) critic.apply(weights_init) if opt.dataset == 'cifar10': print(ROOT_DIR + "/cifar") if not os.path.isdir(ROOT_DIR + "/cifar"): os.mkdir(ROOT_DIR + "/cifar") elif opt.dataset == 'LSUN': if not os.path.isdir(ROOT_DIR + "/bedrooms"):
class DDPGAgent:
    '''
    DDPG agent holding local/target actor and critic networks, their
    optimizers, an exploration-noise process and a replay buffer.
    '''

    def __init__(self, state_size, action_size, seed, n_hidden_units=128, n_layers=3):
        '''
        Params:
            state_size: forwarded to Actor and Critic as their first argument
            action_size: forwarded to Actor, Critic and OUNoise
            seed: seeds Python's `random` module and is forwarded to every component
            n_hidden_units: accepted for interface compatibility; not used here
            n_layers: accepted for interface compatibility; not used here
        '''
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed first and keep the value itself.
        random.seed(seed)
        self.seed = seed

        # actor
        self.actor = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=1e-4)

        # critic
        self.critic = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_opt = optim.Adam(self.critic.parameters(), lr=3e-4, weight_decay=0.0001)

        # will add noise
        self.noise = OUNoise(action_size, seed)

        # experience replay
        self.replay = ReplayBuffer(seed)

    def act(self, state, noise=True):
        '''
        Returns actions taken.

        The actor is evaluated in eval mode without gradient tracking; when
        `noise` is true a sample from the noise process is added. The result
        is clipped to [-1, 1].
        '''
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).cpu().data.numpy()
        self.actor.train()

        if noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        ''' Reset the exploration-noise process. '''
        self.noise.reset()

    def step(self, state, action, reward, next_state, done):
        '''
        Save experiences into replay and sample if replay contains enough
        experiences
        '''
        self.replay.add(state, action, reward, next_state, done)

        if self.replay.len() > self.replay.batch_size:
            experiences = self.replay.sample()
            self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        '''
        Update policy and value parameters using given batch of experience
        tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, n_s, done) tuples
            gamma (float): discount factor
        '''
        states, actions, rewards, next_states, dones = experiences

        # update critic:
        # The TD target is a constant for the critic loss, so it is built
        # without autograd tracking through the target networks.
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            next_Q_targets = self.critic_target(next_states, next_actions)
            # use the `gamma` argument rather than the module-level constant
            Q_targets = rewards + (gamma * next_Q_targets * (1 - dones))

        # compute critic loss
        Q_expected = self.critic(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # minimize loss (each graph is used once, so no retain_graph needed)
        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # update actor:
        # compute actor loss
        action_predictions = self.actor(states)
        actor_loss = -self.critic(states, action_predictions).mean()
        # minimize actor loss
        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        # update target networks
        self.soft_update(self.critic, self.critic_target, TAU)
        self.soft_update(self.actor, self.actor_target, TAU)

    def soft_update(self, local, target, tau):
        '''
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params:
            local: PyTorch model (weights will be copied from)
            target: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        '''
        for target_param, local_param in zip(target.parameters(), local.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
def train(args=get_args()):
    """Train a DDPG policy on the environment returned by ``gym_make()``.

    The policy state dict is written to
    ``<args.logdir>/<args.task>/ddpg/policy.pth`` through the trainer's
    ``save_fn`` hook. No stop criterion is passed to the trainer. When
    executed as a script, one episode is rendered afterwards.
    """
    torch.set_num_threads(1)  # we just need only one thread for NN

    def make_env():
        return gym_make()

    env = make_env()
    obs_space = env.observation_space
    act_space = env.action_space
    args.state_shape = obs_space.shape or obs_space.n
    args.action_shape = act_space.shape or act_space.n
    args.action_range = [act_space.low[0], act_space.high[0]]
    args.layer = [1024, 512, 512, 512]

    train_envs = VectorEnv([make_env for _ in range(args.training_num)])
    test_envs = VectorEnv([make_env for _ in range(args.test_num)])

    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    # model
    actor = Actor(
        args.layer, args.state_shape, args.action_shape,
        args.action_range, args.device)
    actor = actor.to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic = Critic(
        args.layer, args.state_shape, args.action_shape, args.device)
    critic = critic.to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        args.tau, args.gamma, args.exploration_noise,
        args.action_range,
        reward_normalization=args.rew_norm, ignore_done=True)

    # collector
    replay = ReplayBuffer(args.buffer_size)
    train_collector = Collector(policy, train_envs, replay)
    test_collector = Collector(policy, test_envs)

    # log
    log_path = os.path.join(args.logdir, args.task, 'ddpg')
    writer = SummaryWriter(log_path)

    # Checkpoint location; resuming from an existing file is not enabled.
    model_path = os.path.join(log_path, 'policy.pth')

    def save_fn(policy):
        torch.save(policy.state_dict(), model_path)

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector,
        args.epoch, args.step_per_epoch, args.collect_per_step,
        args.test_num, args.batch_size,
        save_fn=save_fn, writer=writer)

    for finished in (train_collector, test_collector):
        finished.close()

    if __name__ == '__main__':
        # Let's watch its performance!
        env = gym_make()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
class Actor_Critic:
    """One-step actor-critic learner with separate actor and critic networks."""

    def __init__(self, n_feature, n_action, lr_A=0.001, lr_C=0.01, GAMMA=0.1):
        """Create the networks, their Adam optimizers and empty histories.

        lr_A / lr_C are the actor / critic learning rates; GAMMA is the
        discount applied to the next-state value in ``critic_learn``.
        """
        self.n_feature = n_feature
        self.n_action = n_action
        self.GAMMA = GAMMA
        self.actor = Actor(self.n_feature, self.n_action)
        self.critic = Critic(self.n_feature)
        self.optimizer_actor = optim.Adam(params=self.actor.parameters(), lr=lr_A)
        self.optimizer_critic = optim.Adam(params=self.critic.parameters(), lr=lr_C)
        self.cost_his = []   # critic loss per critic_learn call
        self.value_his = []  # actor objective per actor_learn call

    def actor_learn(self, s, a, td_error):
        """Take one policy-gradient step on ``-log pi(a|s) * td_error``.

        ``s`` is a single state (a batch axis is added here), ``a`` is the
        index of the action taken and ``td_error`` weights the log
        probability. Returns the loss tensor that was minimized.
        """
        s = torch.Tensor(s[np.newaxis, :])  # batch=1
        action_prob = self.actor(s)  # [1, n_action]
        log_prob = torch.log(action_prob[0, a])  # a is an action index
        self.exp_v = torch.mean(-1 * log_prob * td_error)

        self.optimizer_actor.zero_grad()
        self.exp_v.backward()
        self.optimizer_actor.step()
        self.value_his.append(self.exp_v.item())
        return self.exp_v

    def choose_action(self, s):
        """Sample an action index from the actor's output probabilities."""
        s = torch.Tensor(s[np.newaxis, :])
        probs = self.actor(s)
        return np.random.choice(range(probs.shape[1]),
                                p=probs.clone().detach().numpy().ravel())

    def critic_learn(self, s, r, s_):
        """Take one critic step and return the signed TD error.

        The TD error is ``r + GAMMA * V(s_) - V(s)``. The critic minimizes
        its square, which is what gets appended to ``cost_his``. The value
        returned is the signed error, so ``actor_learn`` can both raise and
        lower action probabilities; the squared error is never negative.
        """
        s, s_ = torch.Tensor(s[np.newaxis, :]), torch.Tensor(s_[np.newaxis, :])
        # The bootstrap target is treated as a constant (semi-gradient TD).
        with torch.no_grad():
            target = r + self.GAMMA * self.critic(s_)
        v = self.critic(s)
        loss = F.mse_loss(v, target)
        td_error = (target - v).detach()
        self.cost_his.append(loss.item())

        self.optimizer_critic.zero_grad()
        loss.backward()
        self.optimizer_critic.step()

        return td_error.item()

    def plot_cost(self):
        """Plot the recorded critic loss and actor objective histories."""
        import matplotlib.pyplot as plt
        fig, (ax1, ax2) = plt.subplots(2, 1)
        ax1.plot(np.arange(len(self.cost_his)), self.cost_his)
        ax1.set_ylabel('Critic TD error')
        ax2.plot(np.arange(len(self.value_his)), self.value_his)
        ax2.set_ylabel('Actor value')
        ax2.set_xlabel('training steps')
        plt.show()