Пример #1
0
def sac(env_fn, actor_critic=MLPActorCritic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99,
        polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000,
        update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, policy_type = 1,
        logger_kwargs=dict(), save_freq=1000, save_dir=None):
    """Train a discrete-action Soft Actor-Critic agent against scripted opponents.

    The agent interacts with the environment returned by ``env_fn`` while an
    opponent policy (``args.opp1`` / ``args.opp2``) supplies the second
    player's action. Whole episodes are stored in the replay buffer; the
    actor and both critics are updated from sampled transitions, and an
    optional CPC model is trained on sampled trajectories.

    Besides its parameters, this function reads several module-level names:
    ``args`` (cpc, timestep, hid, l, z_dim, c_dim, replay_size, batch_size,
    lr, cpc_batch, cpc_update_freq, opp1, opp2, p1, p2, change_step),
    ``writer``, ``writer_cpc``, ``writer_1``, ``writer_3`` and ``logger``.

    Args:
        env_fn: Zero-argument callable returning the training environment.
            The environment must expose ``n_features``, ``n_actions``,
            ``reset()``, ``step(action, opp_action)`` and ``render()``.
        actor_critic: Constructor called as
            ``actor_critic(obs_dim, act_dim, **ac_kwargs)``; the result must
            provide ``pi``, ``q1``, ``q2`` and ``act``.
        ac_kwargs: Extra keyword arguments for ``actor_critic``.
        seed: Seed for the torch and numpy random generators.
        steps_per_epoch: With ``epochs``, defines the total number of
            environment steps (``steps_per_epoch * epochs``).
        epochs: See ``steps_per_epoch``.
        replay_size: Not used here; the buffer size is ``args.replay_size``.
        gamma: Discount factor.
        polyak: Interpolation factor for the target-network update.
        lr: Learning rate for the policy and Q optimizers.
        alpha: Entropy regularization coefficient (fixed).
        batch_size: Not used here; the batch size is ``args.batch_size``.
        start_steps: Number of initial steps with uniformly random actions.
        update_after: First step at which gradient updates may run.
        update_every: Interval (in steps) between update rounds; each round
            performs ``update_every`` gradient steps.
        num_test_episodes: Episodes per evaluation; 0 disables evaluation.
        max_ep_len: Maximum episode length before a forced reset.
        policy_type: Not used in this function.
        logger_kwargs: Not used in this function.
        save_freq: Interval (in steps) between evaluations.
        save_dir: Not used in this function.
    """

    torch.manual_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    opp_policy = Policy(game=env, player_num=False)
    test_env = SoccerPLUS(visual=False)
    test_opp_policy = Policy(game=test_env, player_num=False)
    obs_dim = env.n_features
    act_dim = env.n_actions

    # Create actor-critic module and target networks
    ac = actor_critic(obs_dim, act_dim, **ac_kwargs)
    ac_targ = deepcopy(ac)
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        ac.cuda()
        ac_targ.cuda()

    # Keep the working device consistent with where the networks actually
    # live, so CPU-only hosts do not request a non-existent CUDA device.
    device = torch.device('cuda' if use_cuda else 'cpu')
    if args.cpc:
        cpc = CPC(timestep=args.timestep, obs_dim=4, hidden_sizes=[args.hid] * args.l, z_dim=args.z_dim,
                         c_dim=args.c_dim, device=device)
    else:
        cpc = None

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # Shared counters and experience buffer
    T = Counter()  # training step
    E = Counter()  # training episode

    replay_buffer = ReplayBufferOppo(obs_dim=obs_dim, max_size=args.replay_size, cpc=args.cpc,
                                    cpc_model=cpc, writer=writer_cpc,T=T)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(count_vars(module) for module in [ac.pi, ac.q1, ac.q2])

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=lr)
    q1_optimizer = Adam(ac.q1.parameters(), lr=lr)
    q2_optimizer = Adam(ac.q2.parameters(), lr=lr)
    if args.cpc:
        cpc_optimizer = Adam(cpc.parameters(), lr=args.lr, eps=1e-4)

    def get_actions_info(a_prob):
        """Return (probs, log-probs, sampled action, argmax action) for ``a_prob``."""
        a_dis = Categorical(a_prob)
        max_a = torch.argmax(a_prob)
        sample_a = a_dis.sample().cpu()
        # Add a tiny offset only where the probability is exactly zero so
        # that the logarithm stays finite.
        z = a_prob == 0.0
        z = z.float() * 1e-20
        log_a_prob = torch.log(a_prob + z)
        return a_prob, log_a_prob, sample_a, max_a

    def compute_loss_q(data):
        """Return the summed MSE loss of both Q-networks against the soft Bellman backup."""
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target action distribution comes from the *current* policy
            a_prob, log_a_prob, sample_a, max_a = get_actions_info(ac.pi(o2))

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2)
            q2_pi_targ = ac_targ.q2(o2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            # Expectation over actions of (min-Q - alpha * log pi)
            backup = r + gamma * (1 - d) * torch.sum(a_prob * (q_pi_targ - alpha * log_a_prob),dim=1)

        # MSE loss against Bellman backup, using the Q-value of the taken action
        q1 = ac.q1(o).gather(1, a.unsqueeze(-1).long())
        q2 = ac.q2(o).gather(1, a.unsqueeze(-1).long())
        loss_q1 = F.mse_loss(q1, backup.unsqueeze(-1))
        loss_q2 = F.mse_loss(q2, backup.unsqueeze(-1))
        loss_q = loss_q1 + loss_q2

        return loss_q

    def compute_loss_pi(data):
        """Return the entropy-regularized policy loss and the per-sample sum(p * log p)."""
        o = data['obs']
        a_prob, log_a_prob, sample_a, max_a = get_actions_info(ac.pi(o))
        q1_pi = ac.q1(o)
        q2_pi = ac.q2(o)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = torch.sum(a_prob * (alpha * log_a_prob - q_pi),dim=1,keepdim=True).mean()
        # NOTE: sum(p * log p) is the negative of the Shannon entropy; it is
        # logged under "training/entropy" unchanged.
        entropy = torch.sum(log_a_prob * a_prob, dim=1).detach()

        return loss_pi, entropy

    def update():
        """Run one gradient step for the critics and the policy, then update targets."""
        data = replay_buffer.sample_trans(args.batch_size, device=device)
        # First run one gradient descent step for Q1 and Q2
        q1_optimizer.zero_grad()
        q2_optimizer.zero_grad()
        loss_q = compute_loss_q(data)
        loss_q.backward()
        nn.utils.clip_grad_norm_(ac.parameters(), max_norm=10, norm_type=2)
        q1_optimizer.step()
        q2_optimizer.step()

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, entropy = compute_loss_pi(data)
        loss_pi.backward()
        nn.utils.clip_grad_norm_(ac.parameters(), max_norm=10, norm_type=2)
        pi_optimizer.step()

        # Re-apply the (currently constant) learning rate to every optimizer.
        if t >= update_after:
            _adjust_learning_rate(q1_optimizer, max(lr, 1e-10))
            _adjust_learning_rate(q2_optimizer, max(lr, 1e-10))
            _adjust_learning_rate(pi_optimizer, max(lr, 1e-10))

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                p_targ.data.copy_((1 - polyak) * p.data + polyak * p_targ.data)

        writer.add_scalar("training/pi_loss", loss_pi.detach().item(), t)
        writer.add_scalar("training/q_loss", loss_q.detach().item(), t)
        writer.add_scalar("training/entropy", entropy.detach().mean().item(), t)
        writer.add_scalar("training/lr", lr, t)

    def update_cpc():
        """Run one gradient step of the CPC model on sampled trajectories."""
        data, indexes, min_len = replay_buffer.sample_traj(args.cpc_batch)
        # Only the entries from index 3 onward of the last axis are fed to CPC.
        data = data[:,:,3:]
        cpc_optimizer.zero_grad()
        c_hidden = cpc.init_hidden(len(data), args.c_dim)
        acc, loss, latents = cpc(data, c_hidden)

        loss.backward()
        # Gradient clipping
        nn.utils.clip_grad_norm_(cpc.parameters(), max_norm=20, norm_type=2)
        cpc_optimizer.step()
        writer_cpc.add_scalar("learner/cpc_acc", acc, t)
        writer_cpc.add_scalar("learner/cpc_loss", loss.detach().item(), t)

    def get_action(o, greedy=False):
        """Return an action index for observation ``o`` (argmax if ``greedy``, else sampled)."""
        if len(o.shape) == 1:
            o = np.expand_dims(o, axis=0)
        a_prob = ac.act(torch.as_tensor(o, dtype=torch.float32, device=device), greedy)
        a_prob, log_a_prob, sample_a, max_a = get_actions_info(a_prob)
        action = sample_a if not greedy else max_a
        return action.item()

    def get_opp_policy(p):
        """Return ``args.opp1`` with probability ``p``, otherwise ``args.opp2``."""
        p_sample = np.random.rand()
        if p_sample < p:
            return args.opp1
        else:
            return args.opp2

    def test_agent(epoch, t_opp, writer):
        """Evaluate the greedy policy against opponent ``t_opp`` and log the results."""
        if num_test_episodes == 0:
            return
        with torch.no_grad():
            win = 0
            total_ret = 0
            total_len = 0
            for j in range(num_test_episodes):
                o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
                while not (d or (ep_len == max_ep_len)):
                    # Take deterministic actions at test time
                    o2, r, d, _ = test_env.step(get_action(o, True), test_opp_policy.get_actions(t_opp))
                    r *= 10
                    o = o2
                    ep_ret += r
                    ep_len += 1
                total_ret += ep_ret
                total_len += ep_len
                # An episode return of exactly 50 is counted as a win.
                if(ep_ret == 50):
                    win += 1
            mean_score = total_ret / num_test_episodes
            win_rate = win / num_test_episodes
            mean_len = total_len/ num_test_episodes
            print("opponent:\t{}\ntest epoch:\t{}\nmean score:\t{:.1f}\nwin_rate:\t{}\nmean len:\t{}".format(
                t_opp, epoch, mean_score, win_rate, mean_len))
            writer.add_scalar("test/mean_score", mean_score, epoch)
            writer.add_scalar("test/win_rate", win_rate, epoch)
            writer.add_scalar("test/mean_len", mean_len,epoch)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    scores = []
    trajectory, meta = [], []
    o, ep_ret, ep_len = env.reset(), 0, 0
    discard = False
    opp = get_opp_policy(args.p1)

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        T.increment()

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        with torch.no_grad():
            if t >= start_steps:
                a = get_action(o)
            else:
                a = np.random.randint(act_dim)

        # Step the env
        o2, r, d, info = env.step(a,opp_policy.get_actions(opp))
        if info.get('no_data_receive', False):
            discard = True
        env.render()
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state), or if the step is
        # flagged as discarded.
        d = False if ep_len == max_ep_len or discard else d

        # Accumulate the transition; the whole episode is stored at its end.
        e = E.value()
        transition = (o, a, r, o2, d)
        trajectory.append(transition)
        meta.append([opp, 1, e, ep_len, r, a])

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len) or discard:
            scores.append(ep_ret)
            logger.info("total_step:{}, total_episode:{}, opp:{}, round len:{}, round score:{}, 100 mean score:{}, 10 mean Score:{}".format(t, e, opp, ep_len, ep_ret, np.mean(scores[-100:]),np.mean(scores[-10:])))
            writer.add_scalar("metrics/round_score", ep_ret, t)
            writer.add_scalar("metrics/round_step", ep_len, t)
            writer.add_scalar("metrics/alpha", alpha, t)
            o, ep_ret, ep_len = env.reset(), 0, 0
            replay_buffer.store(trajectory, meta=meta)
            trajectory, meta = [], []
            E.increment()
            # The opponent mixing probability switches after args.change_step.
            if t <= args.change_step:
                opp = get_opp_policy(args.p1)
            else:
                opp = get_opp_policy(args.p2)
            discard = False

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                update()

        # CPC update handling
        if args.cpc and e > args.cpc_batch * 2 and e % args.cpc_update_freq  == 0:
            for _ in range(args.cpc_update_freq):
                update_cpc()

        if t >= update_after and t % save_freq == 0:

            # Test the performance of the deterministic version of the agent.
            test_agent(t, args.opp1, writer_1)
            test_agent(t, args.opp2, writer_3)
Пример #2
0
    # Persist the parsed command-line arguments as JSON next to the experiment outputs.
    with open(os.path.join(experiment_dir, "arguments"), 'w') as f:
        json.dump(args.__dict__, f, indent=2)
    device = torch.device("cuda") if args.cuda else torch.device("cpu")
    # env and model setup
    ac_kwargs = dict(hidden_sizes=[args.hid] * args.l)
    # Pick the environment: CartPole for the "test" experiment, otherwise one
    # of the two make_ftg_ram* factories depending on args.non_station.
    if args.exp_name == "test":
        env = gym.make("CartPole-v0")
    elif args.non_station:
        env = make_ftg_ram_nonstation(args.env, p2_list=args.list, total_episode=args.station_rounds,stable=args.stable)
    else:
        env = make_ftg_ram(args.env, p2=args.p2)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    if args.cpc:
        # With CPC enabled the actor-critic input is widened by args.c_dim,
        # while the CPC model itself is built on the raw observation size.
        global_ac = MLPActorCritic(obs_dim+args.c_dim, act_dim, **ac_kwargs)
        global_cpc = CPC(timestep=args.timestep, obs_dim=obs_dim, hidden_sizes=[args.hid] * args.l, z_dim=args.z_dim,c_dim=args.c_dim)
        global_cpc.share_memory()
    else:
        global_ac = MLPActorCritic(obs_dim, act_dim, **ac_kwargs)
        global_cpc = None

    # async training setup
    T = Counter()
    E = Counter()
    # NOTE(review): each mp.Manager() call below creates its own manager
    # object; a single shared manager may be sufficient — confirm.
    scores = mp.Manager().list()
    wins = mp.Manager().list()
    buffer = mp.Manager().list()

    # Resume from an existing checkpoint if one is present on disk.
    # NOTE(review): torch.load is called without map_location; presumably the
    # checkpoint device matches the current host — verify.
    if os.path.exists(os.path.join(args.save_dir, args.exp_name, args.model_para)):
        global_ac.load_state_dict(torch.load(os.path.join(args.save_dir, args.exp_name, args.model_para)))
        print("load sac model")
Пример #3
0
    # Earlier environment selection, kept disabled; SoccerPLUS is used instead.
    # if args.exp_name == "test":
    #     env = gym.make("CartPole-v0")
    # elif args.non_station:
    #     env = make_ftg_ram_nonstation(args.env, p2_list=args.opp_list, total_episode=args.opp_freq,stable=args.stable)
    # else:
    #     env = make_ftg_ram(args.env, p2=args.p2)
    # The environment is instantiated here only to read its observation and
    # action sizes; it is closed again at the end of this section.
    env = SoccerPLUS()
    obs_dim = env.n_features
    act_dim = env.n_actions
    # create model
    global_ac = MLPActorCritic(obs_dim, act_dim, **ac_kwargs)
    if args.cpc:
        global_cpc = CPC(timestep=args.timestep,
                         obs_dim=obs_dim,
                         hidden_sizes=[args.hid] * args.l,
                         z_dim=args.z_dim,
                         c_dim=args.c_dim,
                         device=device)
    else:
        global_cpc = None
    # create shared model for actor
    # global_ac_targ is a deep copy serving as the target network; shared_ac
    # is a second deep copy moved to the CPU.
    global_ac_targ = deepcopy(global_ac)
    shared_ac = deepcopy(global_ac).cpu()
    # create optimizer
    # One Adam optimizer per sub-network, plus one for the log_alpha attribute
    # of the actor-critic.
    pi_optimizer = Adam(global_ac.pi.parameters(), lr=args.lr, eps=1e-4)
    q1_optimizer = Adam(global_ac.q1.parameters(), lr=args.lr, eps=1e-4)
    q2_optimizer = Adam(global_ac.q2.parameters(), lr=args.lr, eps=1e-4)
    alpha_optim = Adam([global_ac.log_alpha], lr=args.lr, eps=1e-4)
    if args.cpc:
        cpc_optimizer = Adam(global_cpc.parameters(), lr=args.lr, eps=1e-4)
    env.close()
Пример #4
0
    elif args.non_station:
        # Environment built by the non-stationary factory from args.list,
        # args.station_rounds and args.stable.
        env = make_ftg_ram_nonstation(args.env,
                                      p2_list=args.list,
                                      total_episode=args.station_rounds,
                                      stable=args.stable)
    else:
        env = make_ftg_ram(args.env, p2=args.p2)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n

    # load the trained models
    if args.cpc:
        # With CPC enabled, both the actor-critic input and the replay buffer
        # observations are widened by args.c_dim.
        global_ac = MLPActorCritic(obs_dim + args.c_dim, act_dim, **ac_kwargs)
        global_cpc = CPC(timestep=args.timestep,
                         obs_dim=obs_dim,
                         hidden_sizes=[args.hid] * args.l,
                         z_dim=args.z_dim,
                         c_dim=args.c_dim)
        replay_buffer = ReplayBuffer(obs_dim=obs_dim + args.c_dim,
                                     size=args.replay_size)
    else:
        global_ac = MLPActorCritic(obs_dim, act_dim, **ac_kwargs)
        global_cpc = None
        replay_buffer = ReplayBuffer(obs_dim=obs_dim, size=args.replay_size)

    # Load saved weights through load_my_state_dict when the checkpoint file
    # exists; the direct load_state_dict call is kept disabled below.
    if os.path.exists(
            os.path.join(args.save_dir, args.exp_name, args.model_para)):
        # global_ac.load_state_dict(torch.load(os.path.join(args.save_dir, args.exp_name, args.model_para)))
        load_my_state_dict(
            global_ac,
            os.path.join(args.save_dir, args.exp_name, args.model_para))