Example #1
class DDPGAgent:
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 n_hidden_units=128,
                 n_layers=3):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # actor
        self.actor = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=1e-4)

        # critic
        self.critic = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_opt = optim.Adam(self.critic.parameters(),
                                     lr=3e-4,
                                     weight_decay=0.0001)

        # exploration noise (Ornstein-Uhlenbeck process)
        self.noise = OUNoise(action_size, seed)

        # experience replay
        self.replay = ReplayBuffer(seed)
Example #2
def init_policy(args, env):
    actor = Actor(layer=None,
                  state_shape=args.state_shape,
                  action_shape=args.action_shape,
                  action_range=args.action_range,
                  device=args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(layer=None,
                     state_shape=args.state_shape,
                     action_shape=args.action_shape,
                     device=args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(layer=None,
                     state_shape=args.state_shape,
                     action_shape=args.action_shape,
                     device=args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = TD3Policy(actor,
                       actor_optim,
                       critic1,
                       critic1_optim,
                       critic2,
                       critic2_optim,
                       args.tau,
                       args.gamma,
                       GaussianNoise(sigma=args.exploration_noise),
                       args.policy_noise,
                       args.update_actor_freq,
                       args.noise_clip,
                       args.action_range,
                       reward_normalization=args.rew_norm,
                       ignore_done=args.ignore_done,
                       estimation_step=args.n_step)
    return policy
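
A minimal driver sketch for init_policy above, assuming an argparse-style args object: the task name, learning rates, and every other literal value below are illustrative assumptions rather than settings from the original script, and make_td3_args is a hypothetical helper.

# Hypothetical driver for init_policy; all literal values are assumptions.
import argparse

import gym
import torch


def make_td3_args(env):
    args = argparse.Namespace()
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.action_range = [env.action_space.low[0], env.action_space.high[0]]
    args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    args.actor_lr, args.critic_lr = 1e-4, 1e-3      # assumed learning rates
    args.tau, args.gamma = 0.005, 0.99              # soft-update rate / discount
    args.exploration_noise = 0.1                    # sigma of the Gaussian exploration noise
    args.policy_noise, args.noise_clip = 0.2, 0.5   # TD3 target-policy smoothing
    args.update_actor_freq = 2                      # delayed actor updates
    args.rew_norm, args.ignore_done, args.n_step = False, False, 1
    return args


env = gym.make('Pendulum-v0')
policy = init_policy(make_td3_args(env), env)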
Example #3
def test_a2c(args=get_args()):
    torch.set_num_threads(1)  # for poor CPU
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = Actor(net, args.action_shape).to(args.device)
    critic = Critic(net).to(args.device)
    optim = torch.optim.Adam(list(
        actor.parameters()) + list(critic.parameters()), lr=args.lr)
    dist = torch.distributions.Categorical
    policy = A2CPolicy(
        actor, critic, optim, dist, args.gamma, gae_lambda=args.gae_lambda,
        vf_coef=args.vf_coef, ent_coef=args.ent_coef,
        max_grad_norm=args.max_grad_norm)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'a2c')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
        args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn,
        writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Example #4
def test_ddpg(args=get_args()):
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = Actor(
        args.layer_num, args.state_shape, args.action_shape,
        args.max_action, args.device
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic = Critic(
        args.layer_num, args.state_shape, args.action_shape, args.device
    ).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        args.tau, args.gamma, args.exploration_noise,
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=True, ignore_done=True)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'ddpg')

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Example #5
class Actor_Critic:
    def __init__(self, n_feature, n_action, lr_A=0.001, lr_C=0.01, GAMMA=0.1):
        self.n_feature = n_feature
        self.n_action = n_action
        self.GAMMA = GAMMA

        self.actor = Actor(self.n_feature, self.n_action)
        self.critic = Critic(self.n_feature)

        self.optimizer_actor = optim.Adam(params=self.actor.parameters(),
                                          lr=lr_A)
        self.optimizer_critic = optim.Adam(params=self.critic.parameters(),
                                           lr=lr_C)

        self.cost_his = []
        self.value_his = []
Example #6
def test(args=get_args()):
    env = QuadcopterEnv()
    model = Actor(None, env.observation_space.shape, env.action_space.shape,
                  [-1, 1], args.device).to(args.device)
    args.model_path = os.path.join(args.logdir, 'lqr')
    model.load_state_dict(
        torch.load(os.path.join(args.model_path, 'policy.pth'),
                   map_location=args.device))
    for i in range(10):
        obs = env.reset()
        env.render()
        done = False
        while not done:
            act = model(obs.reshape((1, -1)))[0].detach().cpu().numpy()[0]
            obs, reward, done, info = env.step(act)
            env.render()
Example #7
def test(args=get_args()):
    torch.set_num_threads(1)  # we need only one thread for the NN

    env = gym_make()

    model_path = os.path.join(args.logdir, args.task, 'ddpg/policy.pth')
    layer = [1024, 512, 512, 512]
    device = 'cuda'

    state_shape = env.observation_space.shape or env.observation_space.n
    action_shape = env.action_space.shape or env.action_space.n
    action_range = [env.action_space.low, env.action_space.high]
    actor = Actor(
        layer, state_shape, action_shape,
        action_range, device
    ).to(device)
    critic = Critic(
        layer, state_shape, action_shape, device
    ).to(device)

    actor = actor.to(device)
    actor_optim = torch.optim.Adam(actor.parameters())
    critic = critic.to(device)
    critic_optim = torch.optim.Adam(critic.parameters())
    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        action_range=action_range)
    policy.load_state_dict(torch.load(model_path, map_location=device))
    obs = env.reset()
    # env.state[0] = -30.0
    # env.goal[0] = 30.0
    env.render()
    print(env.goal)
    while True:
        action, _ = policy.actor(obs.reshape(1, -1), eps=0.01)
        action = action.detach().cpu().numpy()[0]

        obs, reward, done, info = env.step(action)
        # print(env.state)
        # print(reward)
        # print(info)
        env.render()
        if done:
            break
Example #8
def test(args=get_args()):
    env = DubinEnv()
    # env.set_obs([])
    model = Actor(None, env.observation_space['dynamics'].shape,
                  env.action_space.shape, [-1, 1], args.device).to(args.device)
    args.model_path = os.path.join(args.logdir, 'lqr')
    model.load_state_dict(
        torch.load(os.path.join(args.model_path, 'policy.pth'),
                   map_location=args.device))
    for i in range(10):
        env.reset()
        # env.state[:2] -= env.goal[:2]
        # env.goal[:2] -= env.goal[:2]
        obs = env._obs()
        env.render()
        done = False
        while not done:
            normed_obs = obs['dynamics'].reshape((1, -1))
            # /np.array([20,20,np.pi,1,np.pi])

            act = model(normed_obs)[0].detach().cpu().numpy()[0]
            obs, reward, done, info = env.step(act)
            env.render()
Example #9
def init_policy(args, env):
    actor = Actor(layer=None,
                  state_shape=args.state_shape,
                  action_shape=args.action_shape,
                  action_range=args.action_range,
                  device=args.device).to(args.device)
    critic = Critic(layer=None,
                    state_shape=args.state_shape,
                    action_shape=args.action_shape,
                    device=args.device).to(args.device)
    # orthogonal initialization
    for m in list(actor.modules()) + list(critic.modules()):
        if isinstance(m, torch.nn.Linear):
            torch.nn.init.orthogonal_(m.weight)
            torch.nn.init.zeros_(m.bias)
    optim = torch.optim.Adam(list(actor.parameters()) +
                             list(critic.parameters()),
                             lr=args.lr)
    dist = DiagGaussian
    policy = PPOPolicy(
        actor,
        critic,
        optim,
        dist,
        args.gamma,
        max_grad_norm=args.max_grad_norm,
        eps_clip=args.eps_clip,
        vf_coef=args.vf_coef,
        ent_coef=args.ent_coef,
        reward_normalization=args.rew_norm,
        # dual_clip=args.dual_clip,
        # dual clip causes a monotonically increasing log_std :)
        value_clip=args.value_clip,
        # action_range=[env.action_space.low[0], env.action_space.high[0]],)
        # clipping the action keeps PPO from converging :)
        gae_lambda=args.gae_lambda)
    return policy
Example #10
def load_model_ddpg(model_path, user_embeddings_path, item_embeddings_path,
                    input_dim, action_dim, hidden_size, device):
    with open(user_embeddings_path, "rb") as f:
        user_embeddings = np.load(f)
    with open(item_embeddings_path, "rb") as f:
        item_embeddings = np.load(f)
    model = Actor(input_dim, action_dim, hidden_size, user_embeddings,
                  item_embeddings)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    return model
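
A hedged usage sketch for load_model_ddpg; the file paths and the embedding/network dimensions below are placeholders, since the original call site is not shown.

# Hypothetical call; paths and dimensions are placeholders, not real values.
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
actor = load_model_ddpg(model_path='ddpg_actor.pth',
                        user_embeddings_path='user_embeddings.npy',
                        item_embeddings_path='item_embeddings.npy',
                        input_dim=128,
                        action_dim=64,
                        hidden_size=256,
                        device=device)
print(sum(p.numel() for p in actor.parameters()), 'parameters loaded (eval mode)')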
Example #11
class DDPGAgent:
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 n_hidden_units=128,
                 n_layers=3):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # actor
        self.actor = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=1e-4)

        # critic
        self.critic = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_opt = optim.Adam(self.critic.parameters(),
                                     lr=3e-4,
                                     weight_decay=0.0001)

        # exploration noise (Ornstein-Uhlenbeck process)
        self.noise = OUNoise(action_size, seed)

        # experience replay
        self.replay = ReplayBuffer(seed)

    def act(self, state, noise=True):
        '''
            Return the action for the given state, optionally with exploration noise.
        '''
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).cpu().data.numpy()
        self.actor.train()
        if noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def step(self, state, action, reward, next_state, done):
        '''
            Save the experience in the replay buffer and learn from a sampled batch once enough experiences are stored.
        '''
        self.replay.add(state, action, reward, next_state, done)

        if self.replay.len() > self.replay.batch_size:
            experiences = self.replay.sample()
            self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        '''
            Update policy and value parameters using given batch of experience tuples.
            Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
            where:
                actor_target(state) -> action
                critic_target(state, action) -> Q-value
            Params: experiences (Tuple[torch.Tensor]): tuple of (s, a, r, n_s, done) tuples
                    gamma (float): discount factor
        '''
        states, actions, rewards, next_states, dones = experiences
        # update critic:
        #   get predicted next-state actions and Q-values from the target networks
        next_actions = self.actor_target(next_states)
        next_Q_targets = self.critic_target(next_states, next_actions)
        #   compute TD targets for the current states
        Q_targets = rewards + (gamma * next_Q_targets * (1 - dones))
        #   compute critic loss
        Q_expected = self.critic(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        #   minimize loss
        self.critic_opt.zero_grad()
        critic_loss.backward(retain_graph=True)
        self.critic_opt.step()

        # update actor:
        #   compute actor loss
        action_predictions = self.actor(states)
        actor_loss = -self.critic(states, action_predictions).mean()
        #   minimize actor loss
        self.actor_opt.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor_opt.step()

        # update target networks
        self.soft_update(self.critic, self.critic_target, TAU)
        self.soft_update(self.actor, self.actor_target, TAU)

    def soft_update(self, local, target, tau):
        '''
            Soft update model parameters.
            θ_target = τ*θ_local + (1 - τ)*θ_target
            Params: local: PyTorch model (weights will be copied from)
                    target: PyTorch model (weights will be copied to)
                    tau (float): interpolation parameter
        '''
        for target_param, local_param in zip(target.parameters(),
                                             local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #12
def train(args=get_args()):
    trajs = pickle.load(open('lqr_quadcopter', 'rb'))
    states, acts = trajs['states'], trajs['acts']
    print(np.max(abs(states[:, 0])), np.max(abs(states[:, 1])),
          np.max(abs(states[:, 2])), np.max(abs(acts[:, 0])),
          np.max(abs(acts[:, 1])), np.max(abs(acts[:, 2])),
          np.max(abs(acts[:, 3])))
    env = QuadcopterEnv()
    assert len(states) == len(acts), 'X, y sizes mismatch'
    print(len(states))
    # shuffle
    indices = np.arange(len(states), dtype=int)
    np.random.shuffle(indices)
    # convert training data into tensors
    states = torch.tensor(states[indices],
                          device=args.device,
                          dtype=torch.float)
    acts = torch.tensor(acts[indices], device=args.device, dtype=torch.float)

    model = Actor(None, env.observation_space.shape, env.action_space.shape,
                  [-1, 1], args.device).to(args.device)

    loss = nn.MSELoss()
    optimizer = torch.optim.Adagrad(model.parameters())

    # Train the Models

    best_epoch = None
    best_loss = None
    args.model_path = os.path.join(args.logdir, 'lqr')
    writer = SummaryWriter(log_dir=args.model_path)

    for epoch in range(1, args.epoch + 1):
        loss_train = 0.0
        for i in range(0, len(states), args.batch_size):
            # forward, backward, and optimize

            # zero the gradients first: PyTorch accumulates gradients
            # across backward() calls, so they must be cleared every batch
            model.zero_grad()

            i_ = min(i + args.batch_size, len(states))
            b_states, b_acts = states[i:i_], acts[i:i_]
            c_acts = model(b_states)[0]
            loss_ = loss(c_acts, b_acts)

            loss_.backward()
            optimizer.step()

            loss_train = loss_train + loss_.item()  # accumulate a float, not a graph-holding tensor
        # loss_train = loss(model(total_pcs[maps],states), nexts)
        writer.add_scalar('Loss/train',
                          loss_train / (len(states) / args.batch_size), epoch)

        # Save the models
        if epoch != 0 and epoch % 50 == 0:
            print("loss: {} in #{}".format(loss_train, epoch))
            if best_epoch is None or loss_train < best_loss:
                best_loss, best_epoch = loss_train, epoch
            print('best_loss: {} in #{}'.format(best_loss, best_epoch))
            if best_epoch == epoch:
                torch.save(model.state_dict(),
                           os.path.join(args.model_path, 'policy.pth'))
    writer.close()
Example #13
def train(args=get_args()):
    torch.set_num_threads(1)  # we need only one thread for the NN
    env = gym_make()
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.action_range = [env.action_space.low[0], env.action_space.high[0]]
    args.layer = [1024, 512, 512, 512]

    train_envs = VectorEnv(
        [lambda: gym_make() for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym_make() for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = Actor(
        args.layer, args.state_shape, args.action_shape,
        args.action_range, args.device
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic = Critic(
        args.layer, args.state_shape, args.action_shape, args.device
    ).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        args.tau, args.gamma, args.exploration_noise,
        args.action_range,
        reward_normalization=args.rew_norm, ignore_done=True)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ddpg')
    writer = SummaryWriter(log_path)

    # if a model exists, continue training it
    model_path = os.path.join(log_path, 'policy.pth')
    # if os.path.exists(model_path):
    #     policy.load_state_dict(torch.load(model_path))
    def save_fn(policy):
        torch.save(policy.state_dict(), model_path)


    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, save_fn=save_fn, writer=writer)
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        # Let's watch its performance!
        env = gym_make()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Example #14
def test_sac_with_il(args=get_args()):
    torch.set_num_threads(1)  # we need only one thread for the NN
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = SACPolicy(actor,
                       actor_optim,
                       critic1,
                       critic1_optim,
                       critic2,
                       critic2_optim,
                       args.tau,
                       args.gamma,
                       args.alpha,
                       [env.action_space.low[0], env.action_space.high[0]],
                       reward_normalization=args.rew_norm,
                       ignore_done=args.ignore_done,
                       estimation_step=args.n_step)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    log_path = os.path.join(args.logdir, args.task, 'sac')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(policy,
                               train_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               args.test_num,
                               args.batch_size,
                               stop_fn=stop_fn,
                               save_fn=save_fn,
                               writer=writer)
    assert stop_fn(result['best_reward'])
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()

    # here we define an imitation collector with a trivial policy
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -300  # lower the goal
    net = Actor(1, args.state_shape, args.action_shape, args.max_action,
                args.device).to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.il_lr)
    il_policy = ImitationPolicy(net, optim, mode='continuous')
    il_test_collector = Collector(il_policy, test_envs)
    train_collector.reset()
    result = offpolicy_trainer(il_policy,
                               train_collector,
                               il_test_collector,
                               args.epoch,
                               args.step_per_epoch // 5,
                               args.collect_per_step,
                               args.test_num,
                               args.batch_size,
                               stop_fn=stop_fn,
                               save_fn=save_fn,
                               writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    il_test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(il_policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Example #15
def test_ppo(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = Actor(net, args.action_shape).to(args.device)
    critic = Critic(net).to(args.device)
    optim = torch.optim.Adam(list(actor.parameters()) +
                             list(critic.parameters()),
                             lr=args.lr)
    dist = torch.distributions.Categorical
    policy = PPOPolicy(actor,
                       critic,
                       optim,
                       dist,
                       args.gamma,
                       max_grad_norm=args.max_grad_norm,
                       eps_clip=args.eps_clip,
                       vf_coef=args.vf_coef,
                       ent_coef=args.ent_coef,
                       action_range=None)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'ppo')

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(policy,
                              train_collector,
                              test_collector,
                              args.epoch,
                              args.step_per_epoch,
                              args.collect_per_step,
                              args.repeat_per_collect,
                              args.test_num,
                              args.batch_size,
                              stop_fn=stop_fn,
                              writer=writer,
                              task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Example #16
def test_ddpg(args=get_args()):
    torch.set_num_threads(1)  # we need only one thread for the NN
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = Actor(args.layer_num, args.state_shape, args.action_shape,
                  args.max_action, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic = Critic(args.layer_num, args.state_shape, args.action_shape,
                    args.device).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(actor,
                        actor_optim,
                        critic,
                        critic_optim,
                        args.tau,
                        args.gamma,
                        GaussianNoise(sigma=args.exploration_noise),
                        [env.action_space.low[0], env.action_space.high[0]],
                        reward_normalization=args.rew_norm,
                        ignore_done=args.ignore_done,
                        estimation_step=args.n_step)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ddpg')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(policy,
                               train_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               args.test_num,
                               args.batch_size,
                               stop_fn=stop_fn,
                               save_fn=save_fn,
                               writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Example #17
class Actor_Critic:
    def __init__(self, n_feature, n_action, lr_A=0.001, lr_C=0.01, GAMMA=0.1):
        self.n_feature = n_feature
        self.n_action = n_action
        self.GAMMA = GAMMA

        self.actor = Actor(self.n_feature, self.n_action)
        self.critic = Critic(self.n_feature)

        self.optimizer_actor = optim.Adam(params=self.actor.parameters(),
                                          lr=lr_A)
        self.optimizer_critic = optim.Adam(params=self.critic.parameters(),
                                           lr=lr_C)

        self.cost_his = []
        self.value_his = []

    def actor_learn(self, s, a, td_error):
        s = torch.Tensor(s[np.newaxis, :])  # add a batch dimension (batch size = 1)

        action_prob = self.actor(s)  # shape [batch, n_action] -> [1, n_action]
        log_prob = torch.log(action_prob[0, a])  # a is the action index

        self.exp_v = torch.mean(-1 * log_prob * td_error)

        self.optimizer_actor.zero_grad()
        self.exp_v.backward()
        self.optimizer_actor.step()

        self.value_his.append(self.exp_v.item())
        return self.exp_v

    def choose_action(self, s):
        s = torch.Tensor(s[np.newaxis, :])
        probs = self.actor(s)
        return np.random.choice(range(probs.shape[1]),
                                p=probs.clone().detach().numpy().ravel())

    def critic_learn(self, s, r, s_):
        s, s_ = torch.Tensor(s[np.newaxis, :]), torch.Tensor(s_[np.newaxis, :])

        v_ = self.critic(s_)  # bootstrapped value of the next state (part of the TD target)
        v = self.critic(s)
        td_error = F.mse_loss(v, (r + self.GAMMA * v_).detach())

        self.cost_his.append(td_error.item())

        self.optimizer_critic.zero_grad()
        td_error.backward()
        self.optimizer_critic.step()

        return td_error.item()

    def plot_cost(self):
        import matplotlib.pyplot as plt

        fig, (ax1, ax2) = plt.subplots(2, 1)

        ax1.plot(np.arange(len(self.cost_his)), self.cost_his)
        ax1.set_ylabel('Critic TD error')

        ax2.plot(np.arange(len(self.value_his)), self.value_his)
        ax2.set_ylabel('Actor value')
        ax2.set_xlabel('training steps')

        plt.show()
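
A minimal episode-loop sketch for the Actor_Critic class above, assuming a discrete-action Gym environment; the environment name, episode budget, and logging interval are assumptions, and the value returned by critic_learn is passed to actor_learn as the TD signal, mirroring the class API.

# Hypothetical training loop; CartPole-v0 and the episode count are assumptions.
import gym

env = gym.make('CartPole-v0')
ac = Actor_Critic(n_feature=env.observation_space.shape[0],
                  n_action=env.action_space.n)

for episode in range(500):
    s, done, total_r = env.reset(), False, 0.0
    while not done:
        a = ac.choose_action(s)            # sample from the actor's action distribution
        s_, r, done, _ = env.step(a)
        td = ac.critic_learn(s, r, s_)     # critic update; returns its TD loss
        ac.actor_learn(s, a, td)           # actor update weighted by the critic's signal
        s, total_r = s_, total_r + r
    if episode % 50 == 0:
        print(f'episode {episode}: return {total_r}')
ac.plot_cost()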