Example #1
File: ddpg.py Project: kvas7andy/MAGAIL
    def _init_model(self):
        """init model from parameters"""
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.shape[0]

        self.action_low, self.action_high = self.env.action_space.low[0], self.env.action_space.high[0]
        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Actor(self.num_states, self.num_actions,
                                self.action_high).to(device)
        self.policy_net_target = Actor(self.num_states, self.num_actions,
                                       self.action_high).to(device)

        self.value_net = Value(self.num_states + self.num_actions).to(device)
        self.value_net_target = Value(self.num_states +
                                      self.num_actions).to(device)

        self.policy_net_target.load_state_dict(self.policy_net.state_dict())
        self.value_net_target.load_state_dict(self.value_net.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                      lr=self.lr_p)
        self.optimizer_v = optim.Adam(self.value_net.parameters(),
                                      lr=self.lr_v)
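
The hard copies via load_state_dict above only initialise the target networks; DDPG implementations then typically keep the targets trailing the online networks with a Polyak average after every update. A minimal sketch, assuming a tau hyper-parameter that does not appear in the snippet above:

import torch

def soft_update(target_net: torch.nn.Module, source_net: torch.nn.Module, tau: float = 0.005):
    """Polyak-average the source parameters into the target network in place."""
    with torch.no_grad():
        for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)

# usage sketch (hypothetical call site, after each gradient step):
# soft_update(self.policy_net_target, self.policy_net, tau=0.005)
# soft_update(self.value_net_target, self.value_net, tau=0.005)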
Example #2
    def __init__(self,
                 args,
                 state_dim,
                 action_dim,
                 is_dict_action=False,
                 is_atari=False):

        self.device = args.device
        self.config = args
        if is_atari:
            self.actor = CNNPolicy(state_dim, action_dim).to(self.device)
            self.critic = CNNCritic(state_dim).to(self.device)
        else:
            self.actor = DiscretePolicy(state_dim, action_dim).to(self.device) if is_dict_action else \
                Policy(state_dim, action_dim, log_std=self.config.log_std).to(self.device)
            self.critic = Value(state_dim).to(self.device)

        # initialize optimizer for actor and critic
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.learning_rate)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.learning_rate)

        # optimization epoch number and batch size for PPO
        self.optim_epochs = 10
        self.optim_batch_size = 64
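
The optim_epochs and optim_batch_size stored above are normally consumed by a mini-batch PPO update loop. A minimal index-generation sketch under that assumption (the surrogate-loss step itself is a placeholder, not part of this snippet):

import math
import torch

def ppo_minibatch_indices(batch_size, optim_epochs, optim_batch_size):
    """Yield shuffled mini-batch index tensors for the given number of PPO epochs."""
    optim_iter_num = int(math.ceil(batch_size / optim_batch_size))
    for _ in range(optim_epochs):
        perm = torch.randperm(batch_size)
        for i in range(optim_iter_num):
            yield perm[i * optim_batch_size:min((i + 1) * optim_batch_size, batch_size)]

# usage sketch with hypothetical rollout tensors of length N:
# for ind in ppo_minibatch_indices(N, self.optim_epochs, self.optim_batch_size):
#     ppo_surrogate_step(states[ind], actions[ind], advantages[ind])  # placeholder step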
Example #3
    def _init_model(self):
        """init model from parameters"""
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.shape[0]

        self.action_low, self.action_high = self.env.action_space.low[0], self.env.action_space.high[0]

        self.target_entropy = - np.prod(self.env.action_space.shape)
        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Actor(self.num_states, self.num_actions, action_limit=self.action_high).to(device)

        self.q_net_1 = Value(self.num_states + self.num_actions).to(device)
        self.q_net_target_1 = Value(self.num_states + self.num_actions).to(device)
        self.q_net_2 = Value(self.num_states + self.num_actions).to(device)
        self.q_net_target_2 = Value(self.num_states + self.num_actions).to(device)

        # self.alpha init
        self.alpha = torch.exp(torch.zeros(1, device=device)).requires_grad_()

        self.q_net_target_1.load_state_dict(self.q_net_1.state_dict())
        self.q_net_target_2.load_state_dict(self.q_net_2.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p)
        self.optimizer_a = optim.Adam([self.alpha], lr=self.lr_a)
        self.optimizer_q_1 = optim.Adam(self.q_net_1.parameters(), lr=self.lr_q)
        self.optimizer_q_2 = optim.Adam(self.q_net_2.parameters(), lr=self.lr_q)
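
The target_entropy, alpha and optimizer_a defined above are the usual ingredients of SAC's automatic temperature tuning. A minimal sketch of that update, assuming log_prob comes from the policy's reparameterised action sample; the project's actual loss may differ:

import torch

def update_temperature(alpha, optimizer_a, log_prob, target_entropy):
    """One gradient step on the entropy temperature alpha.

    Minimising -alpha * (log_prob + target_entropy) increases alpha when the
    policy entropy drops below target_entropy and decreases it otherwise.
    """
    alpha_loss = -(alpha * (log_prob + target_entropy).detach()).mean()
    optimizer_a.zero_grad()
    alpha_loss.backward()
    optimizer_a.step()
    return alpha_loss.item()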
Example #4
class TestValue(TestCase):
    def setUp(self) -> None:
        self.value = Value(6, drop_rate=0.5)
        self.value2 = Value(11, drop_rate=0.5)
        print(self.value)

    def test_forward(self):
        res = self.value.forward(torch.rand((5, 6)))

        self.assertEqual(res.size(), torch.Size([5, 1]))

    def test_multi_forward(self):
        x1 = torch.rand((5, 6))
        x2 = torch.rand((5, 5))

        res = self.value2.forward(x1, x2)

        print(res)
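
The tests above only pin down the interface: Value(num_states, drop_rate=...) maps a batch of states to one scalar per row, and extra tensors passed to forward are concatenated along the feature dimension (6 + 5 = 11 for value2). A minimal sketch consistent with that interface; the hidden sizes, dropout placement and activation are assumptions, not the project's actual architecture:

import torch
import torch.nn as nn

class Value(nn.Module):
    """Minimal state-value MLP matching the interface exercised by TestValue."""

    def __init__(self, num_states, num_hiddens=(64, 64), drop_rate=0.0, activation=nn.LeakyReLU):
        super().__init__()
        layers, last_dim = [], num_states
        for num_hidden in num_hiddens:
            layers += [nn.Linear(last_dim, num_hidden), activation(), nn.Dropout(drop_rate)]
            last_dim = num_hidden
        layers.append(nn.Linear(last_dim, 1))
        self.value = nn.Sequential(*layers)

    def forward(self, *inputs):
        # Multiple inputs (e.g. state plus sub-goal) are concatenated feature-wise.
        x = torch.cat(inputs, dim=-1) if len(inputs) > 1 else inputs[0]
        return self.value(x)

# Value(6, drop_rate=0.5)(torch.rand(5, 6)).size()                     -> torch.Size([5, 1])
# Value(11, drop_rate=0.5)(torch.rand(5, 6), torch.rand(5, 5)).size()  -> torch.Size([5, 1])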
Example #5
    def _init_model(self):
        self.V = Value(num_states=self.config["value"]["num_states"],
                       num_hiddens=self.config["value"]["num_hiddens"],
                       drop_rate=self.config["value"]["drop_rate"],
                       activation=self.config["value"]["activation"])
        self.P = JointPolicy(
            initial_state=self.expert_dataset.state.to(device),
            config=self.config["jointpolicy"])
        self.D = Discriminator(
            num_states=self.config["discriminator"]["num_states"],
            num_actions=self.config["discriminator"]["num_actions"],
            num_hiddens=self.config["discriminator"]["num_hiddens"],
            drop_rate=self.config["discriminator"]["drop_rate"],
            use_noise=self.config["discriminator"]["use_noise"],
            noise_std=self.config["discriminator"]["noise_std"],
            activation=self.config["discriminator"]["activation"])

        print("Model Structure")
        print(self.P)
        print(self.V)
        print(self.D)
        print()

        self.optimizer_policy = optim.Adam(
            self.P.parameters(),
            lr=self.config["jointpolicy"]["learning_rate"])
        self.optimizer_value = optim.Adam(
            self.V.parameters(), lr=self.config["value"]["learning_rate"])
        self.optimizer_discriminator = optim.Adam(
            self.D.parameters(),
            lr=self.config["discriminator"]["learning_rate"])
        self.scheduler_discriminator = optim.lr_scheduler.StepLR(
            self.optimizer_discriminator, step_size=2000, gamma=0.95)

        self.discriminator_func = nn.BCELoss()

        to_device(self.V, self.P, self.D, self.D, self.discriminator_func)
Example #6
    def __init__(self,
                 state_dim,
                 channels,
                 kernel_sizes,
                 strides,
                 paddings=None,
                 head_hidden_size=(128, 128),
                 num_aux=0,
                 activation='relu',
                 use_maxpool=False,
                 resnet_first_layer=False):
        super().__init__(state_dim, 1, channels, kernel_sizes, strides,
                         paddings, activation, use_maxpool, num_aux,
                         resnet_first_layer)

        self.head = Value(self.conv_out_size_for_fc + num_aux,
                          head_hidden_size, activation)
Example #7
def create_networks():
    """define actor and critic"""
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim,
                                    env.action_space.n,
                                    hidden_size=(64, 32),
                                    activation='relu')
    else:
        policy_net = Policy(state_dim,
                            env.action_space.shape[0],
                            log_std=args.log_std,
                            hidden_size=(64, 32),
                            activation='relu')
    value_net = Value(state_dim, hidden_size=(32, 16), activation='relu')
    if args.WGAN:
        discrim_net = SNDiscriminator(state_dim + action_dim,
                                      hidden_size=(32, 16),
                                      activation='relu')
    elif args.EBGAN or args.GMMIL:
        discrim_net = AESNDiscriminator(state_dim + action_dim,
                                        hidden_size=(32, ),
                                        encode_size=64,
                                        activation='relu',
                                        slope=0.1,
                                        dropout=False,
                                        dprob=0.2)
    elif args.GEOMGAN:
        # new kernel
        #discrim_net = KernelNet(state_dim + action_dim,state_dim + action_dim)
        noise_dim = 64
        discrim_net = AESNDiscriminator(state_dim + action_dim,
                                        hidden_size=(32, ),
                                        encode_size=noise_dim,
                                        activation='relu',
                                        slope=0.1,
                                        dropout=False,
                                        dprob=0.2)
        kernel_net = NoiseNet(noise_dim,
                              hidden_size=(32, ),
                              encode_size=noise_dim,
                              activation='relu',
                              slope=0.1,
                              dropout=False,
                              dprob=0.2)
        optimizer_kernel = torch.optim.Adam(kernel_net.parameters(),
                                            lr=args.learning_rate / 2)
        scheduler_kernel = MultiStepLR(optimizer_kernel,
                                       milestones=args.milestones,
                                       gamma=args.lr_decay)
    else:
        discrim_net = Discriminator(state_dim + action_dim,
                                    hidden_size=(32, 16),
                                    activation='relu')

    optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                        lr=args.learning_rate)
    optimizer_value = torch.optim.Adam(value_net.parameters(),
                                       lr=args.learning_rate)
    optimizer_discrim = torch.optim.Adam(discrim_net.parameters(),
                                         lr=args.learning_rate)

    scheduler_policy = MultiStepLR(optimizer_policy,
                                   milestones=args.milestones,
                                   gamma=args.lr_decay)
    scheduler_value = MultiStepLR(optimizer_value,
                                  milestones=args.milestones,
                                  gamma=args.lr_decay)
    scheduler_discrim = MultiStepLR(optimizer_discrim,
                                    milestones=args.milestones,
                                    gamma=args.lr_decay)

    if args.WGAN:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -discrim_net(state_action)[0].item()
                    # return -discrim_net(state_action).sum().item()

        learned_reward = ExpertReward()
    elif args.EBGAN:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    _, recon_out = discrim_net(state_action)
                    return -elementwise_loss(
                        recon_out, state_action).item() + args.r_margin

        learned_reward = ExpertReward()
    elif args.GMMIL or args.GEOMGAN:

        class ExpertReward():
            def __init__(self):
                self.r_bias = 0

            def expert_reward(self, state, action):
                with torch.no_grad():
                    return self.r_bias

            def update_XX_YY(self):
                self.XX = torch.diag(torch.mm(self.e_o_enc, self.e_o_enc.t()))
                self.YY = torch.diag(torch.mm(self.g_o_enc, self.g_o_enc.t()))

        learned_reward = ExpertReward()
    else:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -math.log(discrim_net(state_action)[0].item())

        learned_reward = ExpertReward()
    """create agent"""
    agent = Agent(env,
                  policy_net,
                  device,
                  custom_reward=learned_reward,
                  running_state=None,
                  render=args.render,
                  num_threads=args.num_threads)

    def update_params(batch, i_iter):
        dataSize = min(args.min_batch_size, len(batch.state))
        states = torch.from_numpy(np.stack(
            batch.state)[:dataSize, ]).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(
            batch.action)[:dataSize, ]).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(
            batch.reward)[:dataSize, ]).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(
            batch.mask)[:dataSize, ]).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)
        """estimate reward"""
        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values,
                                                  args.gamma, args.tau, device)
        """update discriminator"""
        for _ in range(args.discriminator_epochs):
            #dataSize = states.size()[0]
            # expert_state_actions = torch.from_numpy(expert_traj).to(dtype).to(device)
            exp_idx = random.sample(range(expert_traj.shape[0]), dataSize)
            expert_state_actions = torch.from_numpy(
                expert_traj[exp_idx, :]).to(dtype).to(device)

            dis_input_real = expert_state_actions
            if len(actions.shape) == 1:
                actions.unsqueeze_(-1)
                dis_input_fake = torch.cat([states, actions], 1)
                actions.squeeze_(-1)
            else:
                dis_input_fake = torch.cat([states, actions], 1)

            if args.EBGAN or args.GMMIL or args.GEOMGAN:
                # TBD: no discriminator learning
                pass
            else:
                g_o = discrim_net(dis_input_fake)
                e_o = discrim_net(dis_input_real)

            optimizer_discrim.zero_grad()
            if args.GEOMGAN:
                optimizer_kernel.zero_grad()

            if args.WGAN:
                if args.LSGAN:
                    pdist = l1dist(dis_input_real,
                                   dis_input_fake).mul(args.lamb)
                    discrim_loss = LeakyReLU(e_o - g_o + pdist).mean()
                else:
                    discrim_loss = torch.mean(e_o) - torch.mean(g_o)
            elif args.EBGAN:
                e_recon = elementwise_loss(e_o, dis_input_real)
                g_recon = elementwise_loss(g_o, dis_input_fake)
                discrim_loss = e_recon
                if (args.margin - g_recon).item() > 0:
                    discrim_loss += (args.margin - g_recon)
            elif args.GMMIL:
                #mmd2_D,K = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                mmd2_D, K = mix_rbf_mmd2(dis_input_real, dis_input_fake,
                                         args.sigma_list)
                #tbd
                #rewards = K[0]+K[1]-2*K[2]
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards.detach()  # exp - gen, maximize (gen label negative)
                errD = mmd2_D
                discrim_loss = -errD  # maximize errD

                # prep for generator
                advantages, returns = estimate_advantages(
                    rewards, masks, values, args.gamma, args.tau, device)
            elif args.GEOMGAN:
                # larger, better, but slower
                noise_num = 100
                mmd2_D, K = mix_imp_mmd2(e_o_enc, g_o_enc, noise_num,
                                         noise_dim, kernel_net, cuda)
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards.detach()
                errD = mmd2_D  #+ args.lambda_rg * one_side_errD
                discrim_loss = -errD  # maximize errD

                # prep for generator
                advantages, returns = estimate_advantages(
                    rewards, masks, values, args.gamma, args.tau, device)
            else:
                discrim_loss = discrim_criterion(g_o, ones((states.shape[0], 1), device=device)) + \
                               discrim_criterion(e_o, zeros((e_o.shape[0], 1), device=device))
            if args.GEOMGAN:
                optimizer_kernel.step()
        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / args.ppo_batch_size))
        for _ in range(args.generator_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).to(device)

            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), \
                fixed_log_probs[perm].clone()

            for i in range(optim_iter_num):
                ind = slice(
                    i * args.ppo_batch_size,
                    min((i + 1) * args.ppo_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]

                ppo_step(policy_net, value_net, optimizer_policy,
                         optimizer_value, 1, states_b, actions_b, returns_b,
                         advantages_b, fixed_log_probs_b, args.clip_epsilon,
                         args.l2_reg)

        return rewards

    if args.GEOMGAN:
        return (policy_net, value_net, discrim_net, kernel_net,
                optimizer_policy, optimizer_value, optimizer_discrim, optimizer_kernel,
                agent, update_params,
                scheduler_policy, scheduler_value, scheduler_discrim, scheduler_kernel)
    else:
        return (policy_net, value_net, discrim_net,
                optimizer_policy, optimizer_value, optimizer_discrim,
                agent, update_params,
                scheduler_policy, scheduler_value, scheduler_discrim)
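
A hedged sketch of a driver loop consuming the non-GEOMGAN return tuple above; the iteration budget and the agent.collect_samples call are assumptions based on the Agent interface used here, not code from this project:

def run_training(args):
    (policy_net, value_net, discrim_net,
     optimizer_policy, optimizer_value, optimizer_discrim,
     agent, update_params,
     scheduler_policy, scheduler_value, scheduler_discrim) = create_networks()

    for i_iter in range(args.max_iter_num):                       # hypothetical iteration budget
        batch, log = agent.collect_samples(args.min_batch_size)   # assumed Agent API
        update_params(batch, i_iter)
        scheduler_policy.step()
        scheduler_value.step()
        scheduler_discrim.step()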
Example #8
running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)

"""define actor and critic"""
policy_net = []
value_net = []
if args.model_path is None:
    if is_disc_action:
        for i in range(env_dummy.n):
            policy_net.append(DiscretePolicy(obs_shape_n[i], act_shape_n[i]))
            # print(policy_net[i])
    else:
        policy_net = Policy(obs_shape_n[i], env_dummy.action_space.shape[0], log_std=args.log_std)
    # value_net = Value(state_dim)
    for i in range(env_dummy.n):
        value_net.append(Value(obs_shape_n[i]*env_dummy.n))
        # print(value_net[i])
else:
    # TODO
    policy_net, value_net = pickle.load(open(args.model_path, "rb"))
    # policy_net = [env_dummy.observation_space[i].shape[0] for i in range(env_dummy.n)]
if use_gpu:
    # policy_net = policy_net.cuda()
    # value_net = value_net.cuda()
    for i in range(env_dummy.n):
        policy_net[i].cuda()
        value_net[i].cuda()

optimizer_policy = []
optimizer_value = []
for i in range(env_dummy.n):
Example #9
def learn_model(args):

    print("RL result will be saved at %s" % args.rl_filename)
    print("RL model will be saved at %s" % args.rl_model_filename)
    if use_gpu:
        print("Using CUDA.")

    torch.manual_seed(args.rl_seed)
    if use_gpu:
        torch.cuda.manual_seed_all(args.rl_seed)
        torch.backends.cudnn.deterministic = True
    np.random.seed(args.rl_seed)
    random.seed(args.rl_seed)

    env = gym.make(args.env_name)
    env.seed(args.rl_seed)

    env_test = gym.make(args.env_name)
    env_test.seed(args.rl_seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    a_bound = np.asscalar(env.action_space.high[0])
    a_low = np.asscalar(env.action_space.low[0])
    assert a_bound == -a_low

    ## Binary flag for manually clipping actions for the step function after adding Gaussian noise.
    clip = (args.env_name == "LunarLanderContinuous-v2"
            or args.env_name == "BipedalWalker-v2")

    print(env.observation_space)
    print(env.action_space)
    """define actor and critic"""
    policy_net = Policy(state_dim,
                        action_dim,
                        log_std=args.log_std,
                        a_bound=a_bound,
                        hidden_size=args.hidden_size,
                        activation=args.activation).to(device)
    value_net = Value(state_dim,
                      hidden_size=args.hidden_size,
                      activation=args.activation).to(device)

    optimizer_value = torch.optim.Adam(value_net.parameters(),
                                       lr=args.learning_rate_v)
    decayed_lambda_td = args.lambda_td

    def update_params_c(batch, i_iter):
        states = torch.from_numpy(np.stack(batch.state)).float().to(device)
        actions = torch.from_numpy(np.stack(batch.action)).float().to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).float().to(device)
        masks = torch.from_numpy(np.stack(batch.mask).astype(
            np.float32)).to(device)
        """get advantage estimation from the trajectories"""
        values = value_net(states).data
        advantages, lambda_returns, mc_returns = estimate_advantages(
            rewards, masks, values, args.gamma, args.tau)

        if args.lamret:
            returns = lambda_returns
        else:
            returns = mc_returns
        """perform critic update"""
        #gae_step(value_net, optimizer_value, states, lambda_returns, args.l2_reg)  # full batch GD
        gae_step_epoch(value_net, optimizer_value, states, returns,
                       args.l2_reg)  # Stochastic GD

    """ Function to update the parameters of value and policy networks"""

    def update_params_p(batch, i_iter):

        nonlocal decayed_lambda_td

        states = torch.from_numpy(np.stack(batch.state)).float().to(device)
        actions = torch.from_numpy(np.stack(batch.action)).float().to(device)
        next_states = torch.from_numpy(np.stack(
            batch.next_state)).float().to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).float().to(device)
        masks = torch.from_numpy(np.stack(batch.mask).astype(
            np.float32)).to(device)
        """get advantage estimation from the trajectories, this is done after gae_step update"""
        values = value_net(states).data
        advantages, lambda_returns, mc_returns = estimate_advantages(
            rewards, masks, values, gamma=args.gamma, tau=args.tau)

        if args.method_name == "TRPO-RET-MC":
            returns = mc_returns.detach()  # detach() does not matter since we back prop policy network only.
        elif args.method_name == "TRPO-RET-GAE":
            returns = lambda_returns.detach()  # detach() does not matter actually.
        else:
            returns = 0  # returns is not used for TRPO and TRPO-TD.

        # standardize or not?
        if args.mgae:
            advantages = (advantages - advantages.mean()) / advantages.std()  # this will be m-std version
        else:
            advantages = advantages / advantages.std()  # this will be std version

        trpo_step_td(policy_net=policy_net, value_net=value_net, states=states, actions=actions, next_states=next_states, rewards=rewards, masks=masks, gamma=args.gamma, advantages=advantages, \
            max_kl=args.max_kl, damping=args.damping, \
            lambda_td=decayed_lambda_td, method_name=args.method_name, returns=returns, mtd=args.mtd)
        """ decay the td_reg parameter after update """
        decayed_lambda_td = decayed_lambda_td * args.decay_td

    """create agent"""
    agent = Agent(env, policy_net, render=False)
    agent_test = Agent(env_test,
                       policy_net,
                       mean_action=True,
                       render=args.render)
    """ The actual learning loop"""
    for i_iter in range(args.rl_max_iter_num):
        """ Save the learned policy model """
        if ( (i_iter) % args.rl_save_model_interval == 0 and args.rl_save_model_interval > 0 ) \
            or (i_iter == args.rl_max_iter_num + 1) or i_iter == 0:

            policy_net = policy_net.to(device_cpu)
            value_net = value_net.to(device_cpu)

            pickle.dump((policy_net, value_net),
                        open(args.rl_model_filename + ("_I%d.p" % (i_iter)),
                             'wb'))

            policy_net = policy_net.to(device)
            value_net = value_net.to(device)
        """ Test the policy before update """
        if i_iter % args.log_interval == 0 or i_iter + 1 == args.rl_max_iter_num:
            _, log_test = agent_test.collect_samples_test(max_num_episodes=20,
                                                          render=args.render,
                                                          clip=clip)
        """generate multiple trajectories that reach the minimum batch_size"""
        t0 = time.time()
        batch, log = agent.collect_samples_train(
            args.min_batch_size, render=False,
            clip=clip)  # this is on-policy samples
        t1 = time.time()
        """ update parameters """
        t0_d = time.time()
        update_params_c(batch, i_iter)  #critic update
        update_params_p(batch, i_iter)  #actor update
        t1_d = time.time()
        """ Print out result to stdout and save it to a text file for later usage"""
        if i_iter % args.log_interval == 0:

            result_text = t_format("Iter %6d (%2.2fs)+(%2.2fs)" %
                                   (i_iter, t1 - t0, t1_d - t0_d))
            result_text += " | [R] " + t_format(
                "Avg: %.2f (%.2f)" % (log['avg_reward'], log['std_reward']), 2)
            result_text += " | [R_test] " + t_format("min: %.2f" % log_test['min_reward'], 1) + t_format("max: %.2f" % log_test['max_reward'], 1) \
                            + t_format("Avg: %.2f (%.2f)" % (log_test['avg_reward'], log_test['std_reward']), 2)
            print(result_text)

            with open(args.rl_filename, 'a') as f:
                print(result_text, file=f)
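
estimate_advantages above is assumed to be a standard GAE(lambda) routine that also reports lambda-returns and Monte-Carlo returns; a minimal reference sketch under that assumption (the project's version may handle devices and tensor shapes differently):

import torch

def estimate_advantages(rewards, masks, values, gamma=0.99, tau=0.95):
    """GAE(lambda) advantages plus lambda-returns and Monte-Carlo returns.

    rewards, masks, values are 1-D tensors over concatenated trajectories,
    with mask = 0 marking the final step of each episode.
    """
    advantages = torch.zeros_like(rewards)
    mc_returns = torch.zeros_like(rewards)
    prev_advantage, prev_value, prev_return = 0.0, 0.0, 0.0
    for t in reversed(range(rewards.size(0))):
        delta = rewards[t] + gamma * prev_value * masks[t] - values[t]
        advantages[t] = delta + gamma * tau * prev_advantage * masks[t]
        mc_returns[t] = rewards[t] + gamma * prev_return * masks[t]
        prev_advantage, prev_value, prev_return = advantages[t], values[t], mc_returns[t]
    lambda_returns = advantages + values  # TD(lambda)-style targets for the critic
    return advantages, lambda_returns, mc_returns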
Example #10
env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
ActionTensor = LongTensor if is_disc_action else DoubleTensor

running_state = ZFilter((state_dim, ), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim,
                            env_dummy.action_space.shape[0],
                            log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
del env_dummy
"""create agent"""
agent = Agent(env_factory,
              policy_net,
              running_state=running_state,
              render=args.render,
              num_threads=args.num_threads)

Example #11
# running_reward = ZFilter((1,), demean=False, clip=10)
running_state = None
"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        policy_mgr = DiscretePolicy(state_dim, 7)
        policy_wrk = Policy(state_dim + subgoal_dim,
                            env.action_space.shape[0],
                            log_std=args.log_std)
    value_mgr = Value(state_dim)
    value_wrk = Value(state_dim + subgoal_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_mgr.to(device)
policy_wrk.to(device)
value_mgr.to(device)
value_wrk.to(device)

# optim_policy_m = torch.optim.Adam(policy_mgr.parameters(), lr=0.01)
# optim_policy_w = torch.optim.Adam(policy_wrk.parameters(), lr=0.01)
# optim_value_m = torch.optim.Adam(value_mgr.parameters(), lr=0.01)
# optim_value_w = torch.optim.Adam(value_wrk.parameters(), lr=0.01)
"""create agent"""
agent = Agent(env,
Example #12
env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
ActionTensor = LongTensor if is_disc_action else DoubleTensor

running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n, hidden_size=(20,20))
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0], log_std=args.log_std, hidden_size=(3,3))
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)

# optimization epoch number and batch size for PPO
optim_epochs = 5
optim_batch_size = 64

"""create agent"""
Example #13
        print(H)
"""define actor and critic"""
size = (128, 128)
policy_size = (64, 64)
critic_size = size  #(8, 8)
advantage_size = size  #(8, 8)
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim,
                            env_dummy.action_space.shape[0],
                            hidden_size=policy_size,
                            scale_cov=args.scale_cov)
        #policy_net = Policy(state_dim, env_dummy.action_space.shape[0], hidden_size=policy_size, log_std=0)
    value_net = Value(state_dim, hidden_size=critic_size)
    advantage_net = Advantage((state_dim, action_dim),
                              hidden_size=advantage_size)
else:
    policy_net, value_net, advantage_net, running_state = pickle.load(
        open(args.model_path, "rb"))
if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
    advantage_net = advantage_net.cuda()
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                    lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(),
                                   lr=args.learning_rate)
Example #14
                    metavar='N',
                    help="pretrain discriminator iteration (default: 30)")

args = parser.parse_args()
use_gpu = True
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if use_gpu:
    torch.cuda.manual_seed_all(args.seed)

is_disc_action = False
action_dim = 10
ActionTensor = DoubleTensor
"""define actor, critic and discrimiator"""
policy_net = Policy(10, 256, 10, num_layers=2)
value_net = Value(10, 256, num_layers=3)
discrim_net = Discriminator(10, 256, 10, num_layers=3)
discrim_criterion = nn.BCELoss()

#####################################################
### Load Models
load_models = True
if load_models:
    print("Loading Models")
    policy_net, value_net, discrim_net = pickle.load(
        open('learned_models/nextaction_pretrain_sigpolicy.p', 'rb'))
    #_, _, discrim_net = pickle.load(open('learned_models/nextaction_trained_sigpolicy.p', 'rb'))
    print("Loading Models Finished")
#####################################################

if use_gpu:
Example #15
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)

    else:
        if args.sac_policy:
            policy_net = Policy_Tanh_Gaussian(state_dim,
                                              env.action_space.shape[0],
                                              hidden_size=(64, 64),
                                              log_std=args.log_std)

        else:
            policy_net = Policy(state_dim,
                                env.action_space.shape[0],
                                log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state, = pickle.load(
        open(args.model_path, "rb"))

policy_net.to(device)
value_net.to(device)

optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                    lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(),
                                   lr=args.learning_rate)

params = list(policy_net.parameters()) + list(value_net.parameters())
unique_optimizer = torch.optim.Adam(params, lr=args.learning_rate)
Example #16
def train_v_upper_envelope(states,
                           actions,
                           returns,
                           state_dim,
                           device,
                           seed,
                           upper_learning_rate=3e-3,
                           weight_decay=0.02,
                           max_step_num=int(1e6),
                           consecutive_steps=4,
                           k=10000):

    states = torch.from_numpy(np.array(states))
    actions = torch.from_numpy(np.array(actions))
    returns = torch.from_numpy(np.array(returns))  # returns is actually Gts

    use_gpu = True if device == "cuda:0" else False

    # Init upper_envelope net (use relu as the activation function)
    upper_envelope = Value(state_dim, activation='relu')
    upper_envelope_retrain = Value(state_dim, activation='relu')

    optimizer_upper = torch.optim.Adam(upper_envelope.parameters(),
                                       lr=upper_learning_rate,
                                       weight_decay=weight_decay)
    optimizer_upper_retrain = torch.optim.Adam(
        upper_envelope_retrain.parameters(),
        lr=upper_learning_rate,
        weight_decay=weight_decay)

    if use_gpu:
        upper_envelope = upper_envelope.cuda()
        upper_envelope_retrain = upper_envelope_retrain.cuda()

    # =========================== #
    # Split data into training and testing #
    # But make sure the highest Ri is in the training set

    # pick out the highest data point
    highestR, indice = torch.max(returns, 0)
    highestR = highestR.view(-1, 1)
    highestS = states[indice]
    highestA = actions[indice]
    print("HighestR:", highestR)

    statesW = torch.cat((states[:indice], states[indice + 1:]))
    actionsW = torch.cat((actions[:indice], actions[indice + 1:]))
    returnsW = torch.cat((returns[:indice], returns[indice + 1:]))

    # shuffle the data
    perm = np.arange(statesW.shape[0])
    np.random.shuffle(perm)
    perm = LongTensor(perm).cuda() if use_gpu else LongTensor(perm)
    statesW, actionsW, returnsW = statesW[perm], actionsW[perm], returnsW[perm]

    # divide data into train/test
    divide = int(states.shape[0] * 0.8)
    train_states, train_actions, train_returns = statesW[:divide], actionsW[:divide], returnsW[:divide]
    test_states, test_actions, test_returns = statesW[divide:], actionsW[divide:], returnsW[divide:]

    # add the highest data into training
    print(train_states.size(), highestS.size())
    print(train_actions.size(), highestA.size())
    print(train_returns.size(), highestR.size())
    train_states = torch.cat((train_states.squeeze(), highestS.unsqueeze(0)))
    train_actions = torch.cat((train_actions.squeeze(), highestA.unsqueeze(0)))
    train_returns = torch.cat(
        (train_returns.squeeze(), highestR.squeeze().unsqueeze(0)))

    # train upper envelope
    # env_dummy = env_factory(0)
    # state_dim = env_dummy.observation_space.shape[0]
    # upper_envelope = Value(state_dim)
    # optimizer = torch.optim.Adam(upper_envelope.parameters(), lr=0.003, weight_decay=20)

    epoch_n = 100
    batch_size = 64
    optim_iter_num = int(math.ceil(train_states.shape[0] / batch_size))

    num_increase = 0
    previous_loss = math.inf

    calculate_vali = 2
    best_parameters = upper_envelope.state_dict()
    running_traning_steps = 0
    best_training_steps = running_traning_steps

    # Upper Envelope Training starts
    upper_envelope.train()

    while num_increase < consecutive_steps:
        # update theta for n steps, n = calculate_vali
        # train calculate_vali steps
        for i in range(calculate_vali):
            train_loss = 0
            perm = np.arange(train_states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).cuda() if use_gpu else LongTensor(perm)

            train_states, train_actions, train_returns = train_states[
                perm], train_actions[perm], train_returns[perm]

            for i in range(optim_iter_num):
                ind = slice(i * batch_size,
                            min((i + 1) * batch_size, states.shape[0]))
                states_b, returns_b = train_states[ind], train_returns[ind]
                states_b = Variable(states_b.float())
                returns_b = Variable(returns_b.float())
                Vsi = upper_envelope(states_b)
                # loss = loss_fn(Vsi, returns_b)
                loss = L2PenaltyLoss(Vsi, returns_b, k_val=k)
                train_loss += loss.detach()
                upper_envelope.zero_grad()
                loss.backward()
                optimizer_upper.step()

        # early stopping

        running_traning_steps += calculate_vali

        # calculate validation error
        test_iter = int(math.ceil(test_states.shape[0] / batch_size))
        validation_loss = 0
        for n in range(test_iter):
            ind = slice(n * batch_size,
                        min((n + 1) * batch_size, states.shape[0]))
            states_t, returns_t = test_states[ind], test_returns[ind]
            states_t = Variable(states_t.float())
            returns_t = Variable(returns_t.float())
            Vsi = upper_envelope(states_t)
            loss = L2PenaltyLoss(Vsi, returns_t, k_val=k)
            validation_loss += loss

        if validation_loss < previous_loss:
            best_training_steps = running_traning_steps
            previous_loss = validation_loss
            best_parameters = upper_envelope.state_dict()
            num_increase = 0
        else:
            num_increase += 1

    print("best_training_steps:", best_training_steps)
    upper_envelope.load_state_dict(best_parameters)

    # retrain on the whole set
    upper_envelope_retrain.train()

    optim_iter_num = int(math.ceil(states.shape[0] / batch_size))
    for i in range(best_training_steps):
        train_loss = 0
        perm = np.arange(states.shape[0])
        np.random.shuffle(perm)
        perm = LongTensor(perm).cuda() if use_gpu else LongTensor(perm)

        states, actions, returns = states[perm], actions[perm], returns[perm]

        for i in range(optim_iter_num):
            ind = slice(i * batch_size,
                        min((i + 1) * batch_size, states.shape[0]))
            states_b, returns_b = states[ind], returns[ind]
            states_b = Variable(states_b.float())
            returns_b = Variable(returns_b.float())
            Vsi = upper_envelope_retrain(states_b)
            #loss = loss_fn(Vsi, returns_b)
            loss = L2PenaltyLoss(Vsi, returns_b, k_val=k)
            train_loss += loss.detach()
            upper_envelope_retrain.zero_grad()
            loss.backward()
            optimizer_upper_retrain.step()

    upper_envelope.load_state_dict(upper_envelope_retrain.state_dict())
    print("Policy training is complete.")

    return upper_envelope
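
L2PenaltyLoss is not shown in this snippet; in BAIL-style upper-envelope fitting it is an asymmetric squared loss that punishes predictions falling below the observed return k times harder, so the fitted V(s) stays on or above the data. A hedged sketch of that form, which may differ from the project's exact implementation:

import torch

def L2PenaltyLoss(predictions, targets, k_val=10000.0):
    """Asymmetric squared loss for fitting an upper envelope V(s) >= Gt."""
    diff = predictions.view(-1) - targets.view(-1)
    # plain squared error above the target, k-times heavier penalty below it
    per_sample = torch.where(diff >= 0, diff.pow(2), k_val * diff.pow(2))
    return per_sample.mean()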
Example #17
class MAGAIL:
    def __init__(self, config, log_dir, exp_name):
        self.config = config
        self.exp_name = exp_name
        self.writer = SummaryWriter(log_dir=f"{log_dir}/{self.exp_name}")
        """seeding"""
        seed = self.config["general"]["seed"]
        torch.manual_seed(seed)
        np.random.seed(seed)

        self._load_expert_data()
        self._init_model()

    def _init_model(self):
        self.V = Value(num_states=self.config["value"]["num_states"],
                       num_hiddens=self.config["value"]["num_hiddens"],
                       drop_rate=self.config["value"]["drop_rate"],
                       activation=self.config["value"]["activation"])
        self.P = JointPolicy(
            initial_state=self.expert_dataset.state.to(device),
            config=self.config["jointpolicy"])
        self.D = Discriminator(
            num_states=self.config["discriminator"]["num_states"],
            num_actions=self.config["discriminator"]["num_actions"],
            num_hiddens=self.config["discriminator"]["num_hiddens"],
            drop_rate=self.config["discriminator"]["drop_rate"],
            use_noise=self.config["discriminator"]["use_noise"],
            noise_std=self.config["discriminator"]["noise_std"],
            activation=self.config["discriminator"]["activation"])

        print("Model Structure")
        print(self.P)
        print(self.V)
        print(self.D)
        print()

        self.optimizer_policy = optim.Adam(
            self.P.parameters(),
            lr=self.config["jointpolicy"]["learning_rate"])
        self.optimizer_value = optim.Adam(
            self.V.parameters(), lr=self.config["value"]["learning_rate"])
        self.optimizer_discriminator = optim.Adam(
            self.D.parameters(),
            lr=self.config["discriminator"]["learning_rate"])
        self.scheduler_discriminator = optim.lr_scheduler.StepLR(
            self.optimizer_discriminator, step_size=2000, gamma=0.95)

        self.discriminator_func = nn.BCELoss()

        to_device(self.V, self.P, self.D, self.D, self.discriminator_func)

    def _load_expert_data(self):
        num_expert_states = self.config["general"]["num_states"]
        num_expert_actions = self.config["general"]["num_actions"]
        expert_batch_size = self.config["general"]["expert_batch_size"]

        self.expert_dataset = ExpertDataSet(
            data_set_path=self.config["general"]["expert_data_path"],
            num_states=num_expert_states,
            num_actions=num_expert_actions)
        self.expert_data_loader = DataLoader(
            dataset=self.expert_dataset,
            batch_size=expert_batch_size,
            shuffle=True,
            num_workers=multiprocessing.cpu_count() // 2)

    def train(self, epoch):
        self.P.train()
        self.D.train()
        self.V.train()

        # collect generated batch
        gen_batch = self.P.collect_samples(
            self.config["ppo"]["sample_batch_size"])
        # batch: ('state', 'action', 'next_state', 'log_prob', 'mask')
        gen_batch_state = trans_shape_func(torch.stack(gen_batch.state))            # [trajectory length * parallel size, state size]
        gen_batch_action = trans_shape_func(torch.stack(gen_batch.action))          # [trajectory length * parallel size, action size]
        gen_batch_next_state = trans_shape_func(torch.stack(gen_batch.next_state))  # [trajectory length * parallel size, state size]
        gen_batch_old_log_prob = trans_shape_func(torch.stack(gen_batch.log_prob))  # [trajectory length * parallel size, 1]
        gen_batch_mask = trans_shape_func(torch.stack(gen_batch.mask))              # [trajectory length * parallel size, 1]

        # grad_collect_func = lambda d: torch.cat([grad.view(-1) for grad in torch.autograd.grad(d, self.D.parameters(), retain_graph=True)]).unsqueeze(0)
        ####################################################
        # update discriminator
        ####################################################
        for expert_batch_state, expert_batch_action in self.expert_data_loader:
            gen_r = self.D(gen_batch_state, gen_batch_action)
            expert_r = self.D(expert_batch_state.to(device),
                              expert_batch_action.to(device))

            # label smoothing for discriminator
            expert_labels = torch.ones_like(expert_r)
            gen_labels = torch.zeros_like(gen_r)

            if self.config["discriminator"]["use_label_smoothing"]:
                smoothing_rate = self.config["discriminator"][
                    "label_smooth_rate"]
                expert_labels *= (1 - smoothing_rate)
                gen_labels += torch.ones_like(gen_r) * smoothing_rate

            e_loss = self.discriminator_func(expert_r, expert_labels)
            g_loss = self.discriminator_func(gen_r, gen_labels)
            d_loss = e_loss + g_loss

            # """ WGAN with Gradient Penalty"""
            # d_loss = gen_r.mean() - expert_r.mean()
            # differences_batch_state = gen_batch_state[:expert_batch_state.size(0)] - expert_batch_state
            # differences_batch_action = gen_batch_action[:expert_batch_action.size(0)] - expert_batch_action
            # alpha = torch.rand(expert_batch_state.size(0), 1)
            # interpolates_batch_state = gen_batch_state[:expert_batch_state.size(0)] + (alpha * differences_batch_state)
            # interpolates_batch_action = gen_batch_action[:expert_batch_action.size(0)] + (alpha * differences_batch_action)
            # gradients = torch.cat([x for x in map(grad_collect_func, self.D(interpolates_batch_state, interpolates_batch_action))])
            # slopes = torch.norm(gradients, p=2, dim=-1)
            # gradient_penalty = torch.mean((slopes - 1.) ** 2)
            # d_loss += 10 * gradient_penalty

            self.optimizer_discriminator.zero_grad()
            d_loss.backward()
            self.optimizer_discriminator.step()

            self.scheduler_discriminator.step()

        self.writer.add_scalar('train/loss/d_loss', d_loss.item(), epoch)
        self.writer.add_scalar("train/loss/e_loss", e_loss.item(), epoch)
        self.writer.add_scalar("train/loss/g_loss", g_loss.item(), epoch)
        self.writer.add_scalar('train/reward/expert_r',
                               expert_r.mean().item(), epoch)
        self.writer.add_scalar('train/reward/gen_r',
                               gen_r.mean().item(), epoch)

        with torch.no_grad():
            gen_batch_value = self.V(gen_batch_state)
            gen_batch_reward = self.D(gen_batch_state, gen_batch_action)

        gen_batch_advantage, gen_batch_return = estimate_advantages(
            gen_batch_reward, gen_batch_mask, gen_batch_value,
            self.config["gae"]["gamma"], self.config["gae"]["tau"],
            self.config["jointpolicy"]["trajectory_length"])

        ####################################################
        # update policy by ppo [mini_batch]
        ####################################################
        ppo_optim_epochs = self.config["ppo"]["ppo_optim_epochs"]
        ppo_mini_batch_size = self.config["ppo"]["ppo_mini_batch_size"]
        gen_batch_size = gen_batch_state.shape[0]
        optim_iter_num = int(math.ceil(gen_batch_size / ppo_mini_batch_size))

        for _ in range(ppo_optim_epochs):
            perm = torch.randperm(gen_batch_size)

            for i in range(optim_iter_num):
                ind = perm[slice(
                    i * ppo_mini_batch_size,
                    min((i + 1) * ppo_mini_batch_size, gen_batch_size))]
                mini_batch_state, mini_batch_action, mini_batch_next_state, mini_batch_advantage, mini_batch_return, \
                mini_batch_old_log_prob = gen_batch_state[ind], gen_batch_action[ind], gen_batch_next_state[ind], \
                                          gen_batch_advantage[ind], gen_batch_return[ind], gen_batch_old_log_prob[ind]

                v_loss, p_loss = ppo_step(
                    self.P,
                    self.V,
                    self.optimizer_policy,
                    self.optimizer_value,
                    states=mini_batch_state,
                    actions=mini_batch_action,
                    next_states=mini_batch_next_state,
                    returns=mini_batch_return,
                    old_log_probs=mini_batch_old_log_prob,
                    advantages=mini_batch_advantage,
                    ppo_clip_ratio=self.config["ppo"]["clip_ratio"],
                    value_l2_reg=self.config["value"]["l2_reg"])

                self.writer.add_scalar('train/loss/p_loss', p_loss, epoch)
                self.writer.add_scalar('train/loss/v_loss', v_loss, epoch)

        print(f" Training episode:{epoch} ".center(80, "#"))
        print('gen_r:', gen_r.mean().item())
        print('expert_r:', expert_r.mean().item())
        print('d_loss', d_loss.item())

    def eval(self, epoch):
        self.P.eval()
        self.D.eval()
        self.V.eval()

        gen_batch = self.P.collect_samples(
            self.config["ppo"]["sample_batch_size"])
        gen_batch_state = torch.stack(gen_batch.state)
        gen_batch_action = torch.stack(gen_batch.action)

        gen_r = self.D(gen_batch_state, gen_batch_action)
        for expert_batch_state, expert_batch_action in self.expert_data_loader:
            expert_r = self.D(expert_batch_state.to(device),
                              expert_batch_action.to(device))

            print(f" Evaluating episode:{epoch} ".center(80, "-"))
            print('validate_gen_r:', gen_r.mean().item())
            print('validate_expert_r:', expert_r.mean().item())

        self.writer.add_scalar("validate/reward/gen_r",
                               gen_r.mean().item(), epoch)
        self.writer.add_scalar("validate/reward/expert_r",
                               expert_r.mean().item(), epoch)

    def save_model(self, save_path):
        if not os.path.exists(save_path):
            os.mkdir(save_path)
        # dump model from pkl file
        # torch.save((self.D, self.P, self.V), f"{save_path}/{self.exp_name}.pt")
        torch.save(self.D, f"{save_path}/{self.exp_name}_Discriminator.pt")
        torch.save(self.P, f"{save_path}/{self.exp_name}_JointPolicy.pt")
        torch.save(self.V, f"{save_path}/{self.exp_name}_Value.pt")

    def load_model(self, model_path):
        # load entire model
        # self.D, self.P, self.V = torch.load((self.D, self.P, self.V), f"{save_path}/{self.exp_name}.pt")
        self.D = torch.load(f"{model_path}_Discriminator.pt",
                            map_location=device)
        self.P = torch.load(f"{model_path}_JointPolicy.pt",
                            map_location=device)
        self.V = torch.load(f"{model_path}_Value.pt", map_location=device)
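
A hedged driver sketch for the class above; the config loading, epoch counts and paths below are placeholders, not the project's actual entry point:

import yaml  # assumption: the config is a nested dict, here loaded from a YAML file

with open("config/magail_config.yml") as f:   # hypothetical path
    config = yaml.safe_load(f)

magail = MAGAIL(config=config, log_dir="runs", exp_name="magail_demo")

for epoch in range(1, 2001):                  # illustrative epoch budget
    magail.train(epoch)
    if epoch % 50 == 0:
        magail.eval(epoch)
        magail.save_model("saved_models")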
Example #18
device = torch.device('cuda') if torch.cuda.is_available() else torch.device(
    'cpu')
if torch.cuda.is_available():
    torch.cuda.set_device(args.gpu_index)
env = gym.make(args.env_name)
state_dim = env.observation_space.shape[0]
is_disc_action = len(env.action_space.shape) == 0
action_dim = 1 if is_disc_action else env.action_space.shape[0]
running_reward = ZFilter((1, ), demean=False, clip=10)
print("Seed: {}".format(args.seed))
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)
policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std)
value_net = Value(state_dim)  # Initialise value network. Used for calculating Advantages for TRPO
critic_net = OTCritic(state_dim + action_dim)  # Initialise OT critic
if args.resume_training:
    policy_net, value_net, critic_net, running_state, running_reward = pickle.load(
        open(
            'assets/learned_models/ablation/SIL/{}/{}_SIL_s{}.p'.format(
                args.dataset_size, args.env_name, args.seed), "rb"))
to_device(device, policy_net, value_net, critic_net)
optimizer_ot = torch.optim.Adam(critic_net.parameters(), lr=args.critic_lr)
optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                    lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(),
                                   lr=args.learning_rate)
#OT params
def create_networks():
    """define actor and critic"""
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim,
                                    env.action_space.n,
                                    hidden_size=(64, 32),
                                    activation='relu')
    else:
        policy_net = Policy(state_dim,
                            env.action_space.shape[0],
                            log_std=args.log_std,
                            hidden_size=(64, 32),
                            activation='relu')
    value_net = Value(state_dim, hidden_size=(32, 16), activation='relu')
    if args.AL:
        discrim_net = SNDiscriminator(state_dim + action_dim,
                                      hidden_size=(32, 16),
                                      activation='relu')
    elif args.EBGAN or args.GMMIL:
        discrim_net = AESNDiscriminator(state_dim + action_dim,
                                        hidden_size=(32, ),
                                        encode_size=64,
                                        activation='leakyrelu',
                                        slope=0.1,
                                        dropout=True,
                                        dprob=0.2)
    elif args.VAKLIL:
        noise_dim = 64
        mid_dim = 32
        discrim_net = VAEDiscriminator(state_dim + action_dim,
                                       num_outputs=noise_dim,
                                       sigmoid_out=False,
                                       sn=True,
                                       test=False,
                                       w_init=False,
                                       hidden_size_enc=(),
                                       hidden_size_dec=(),
                                       encode_size=mid_dim,
                                       activation='relu',
                                       dropout=False)
        kernel_net = NoiseNet(noise_dim,
                              hidden_size=(32, ),
                              encode_size=noise_dim,
                              activation='relu',
                              dropout=False)
        optimizer_kernel = torch.optim.Adam(kernel_net.parameters(),
                                            lr=args.learning_rate)
        scheduler_kernel = MultiStepLR(optimizer_kernel,
                                       milestones=args.milestones,
                                       gamma=args.lr_kernel_decay)
    else:
        discrim_net = Discriminator(state_dim + action_dim,
                                    hidden_size=(32, 16),
                                    activation='relu')

    optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                        lr=args.learning_rate)
    optimizer_value = torch.optim.Adam(value_net.parameters(),
                                       lr=args.learning_rate)
    optimizer_discrim = torch.optim.Adam(discrim_net.parameters(),
                                         lr=args.learning_rate)

    scheduler_policy = MultiStepLR(optimizer_policy,
                                   milestones=args.milestones,
                                   gamma=args.lr_decay)
    scheduler_value = MultiStepLR(optimizer_value,
                                  milestones=args.milestones,
                                  gamma=args.lr_decay)
    scheduler_discrim = MultiStepLR(optimizer_discrim,
                                    milestones=args.milestones,
                                    gamma=args.lr_kernel_decay)

    if args.AL:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -discrim_net(state_action)[0].item()

        learned_reward = ExpertReward()
    elif args.EBGAN:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    _, recon_out = discrim_net(state_action)
                    return -elementwise_loss(
                        recon_out, state_action).item() + args.r_margin

        learned_reward = ExpertReward()
    elif args.GMMIL or args.VAKLIL:

        class ExpertReward():
            def __init__(self):
                self.r_bias = 0

            def expert_reward(self, state, action):
                with torch.no_grad():
                    return self.r_bias

            def update_XX_YY(self):
                self.XX = torch.diag(torch.mm(self.e_o_enc, self.e_o_enc.t()))
                self.YY = torch.diag(torch.mm(self.g_o_enc, self.g_o_enc.t()))

        learned_reward = ExpertReward()
    else:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -math.log(discrim_net(state_action)[0].item())

        learned_reward = ExpertReward()
    """create agent"""
    agent = Agent(env,
                  policy_net,
                  device,
                  custom_reward=learned_reward,
                  running_state=None,
                  render=args.render,
                  num_threads=args.num_threads)

    def update_params(batch, i_iter):
        dataSize = min(args.min_batch_size, len(batch.state))
        states = torch.from_numpy(np.stack(
            batch.state)[:dataSize, ]).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(
            batch.action)[:dataSize, ]).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(
            batch.reward)[:dataSize, ]).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(
            batch.mask)[:dataSize, ]).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)
        """estimate reward"""
        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values,
                                                  args.gamma, args.tau, device)
        """update discriminator"""
        for _ in range(args.discriminator_epochs):
            exp_idx = random.sample(range(expert_traj.shape[0]), dataSize)
            expert_state_actions = torch.from_numpy(
                expert_traj[exp_idx, :]).to(dtype).to(device)

            dis_input_real = expert_state_actions
            if len(actions.shape) == 1:
                actions.unsqueeze_(-1)
                dis_input_fake = torch.cat([states, actions], 1)
                actions.squeeze_(-1)
            else:
                dis_input_fake = torch.cat([states, actions], 1)

            if args.EBGAN or args.GMMIL or args.VAKLIL:
                g_o_enc, g_mu, g_sigma = discrim_net(dis_input_fake,
                                                     mean_mode=False)
                e_o_enc, e_mu, e_sigma = discrim_net(dis_input_real,
                                                     mean_mode=False)
            else:
                g_o = discrim_net(dis_input_fake)
                e_o = discrim_net(dis_input_real)

            optimizer_discrim.zero_grad()
            if args.VAKLIL:
                optimizer_kernel.zero_grad()

            if args.AL:
                if args.LSGAN:
                    pdist = l1dist(dis_input_real,
                                   dis_input_fake).mul(args.lamb)
                    discrim_loss = LeakyReLU(e_o - g_o + pdist).mean()
                else:
                    discrim_loss = torch.mean(e_o) - torch.mean(g_o)
            elif args.EBGAN:
                e_recon = elementwise_loss(e_o, dis_input_real)
                g_recon = elementwise_loss(g_o, dis_input_fake)
                discrim_loss = e_recon
                if (args.margin - g_recon).item() > 0:
                    discrim_loss += (args.margin - g_recon)
            elif args.GMMIL:
                mmd2_D, K = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy - kyy) = kyy - kxy
                rewards = -rewards.detach()  # exp - gen; maximize (generator label is negative)
                errD = mmd2_D
                discrim_loss = -errD  # maximize errD

                advantages, returns = estimate_advantages(
                    rewards, masks, values, args.gamma, args.tau, device)
            elif args.VAKLIL:
                noise_num = 20000
                mmd2_D_net, _, penalty = mix_imp_with_bw_mmd2(
                    e_o_enc, g_o_enc, noise_num, noise_dim, kernel_net, cuda,
                    args.sigma_list)
                mmd2_D_rbf, _ = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                errD = (mmd2_D_net + mmd2_D_rbf) / 2
                # information bottleneck: mean KL(q(z|x) || N(0, I)) over the stacked
                # expert and generator encodings, minus the information budget i_c
                # (1e-8 keeps the log numerically stable)
                i_c = 0.2
                mu = torch.cat((e_mu, g_mu), dim=0)
                sigma = torch.cat((e_sigma, g_sigma), dim=0)
                bottleneck_loss = torch.mean(0.5 * torch.sum(
                    mu**2 + sigma**2 - torch.log(sigma**2 + 1e-8) - 1, dim=1)) - i_c
                discrim_loss = -errD + (args.beta * bottleneck_loss) + (
                    args.lambda_h * penalty)
            else:
                discrim_loss = discrim_criterion(g_o, ones((states.shape[0], 1), device=device)) + \
                               discrim_criterion(e_o, zeros((e_o.shape[0], 1), device=device))

            discrim_loss.backward()
            optimizer_discrim.step()
            if args.VAKLIL:
                optimizer_kernel.step()

        if args.VAKLIL:
            with torch.no_grad():
                noise_num = 20000
                g_o_enc, _, _ = discrim_net(dis_input_fake)
                e_o_enc, _, _ = discrim_net(dis_input_real)
                _, K_net, _ = mix_imp_with_bw_mmd2(e_o_enc, g_o_enc, noise_num,
                                                   noise_dim, kernel_net, cuda,
                                                   args.sigma_list)
                _, K_rbf = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                K = [sum(x) / 2 for x in zip(K_net, K_rbf)]
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy - kyy) = kyy - kxy
                rewards = -rewards  # no detach needed: already inside torch.no_grad()
                advantages, returns = estimate_advantages(
                    rewards, masks, values, args.gamma, args.tau, device)
        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / args.ppo_batch_size))
        for _ in range(args.generator_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).to(device)

            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), \
                fixed_log_probs[perm].clone()

            for i in range(optim_iter_num):
                ind = slice(
                    i * args.ppo_batch_size,
                    min((i + 1) * args.ppo_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]

                ppo_step(policy_net, value_net, optimizer_policy,
                         optimizer_value, 1, states_b, actions_b, returns_b,
                         advantages_b, fixed_log_probs_b, args.clip_epsilon,
                         args.l2_reg)

        return rewards

    if args.VAKLIL:
        return (policy_net, value_net, discrim_net, kernel_net,
                optimizer_policy, optimizer_value, optimizer_discrim, optimizer_kernel,
                agent, update_params,
                scheduler_policy, scheduler_value, scheduler_discrim, scheduler_kernel)
    else:
        return (policy_net, value_net, discrim_net,
                optimizer_policy, optimizer_value, optimizer_discrim,
                agent, update_params,
                scheduler_policy, scheduler_value, scheduler_discrim)
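
The GMMIL and VAKLIL branches above derive rewards from kernel statistics. For reference, here is a minimal, self-contained sketch of the biased RBF-kernel MMD^2 estimator that a helper like mix_rbf_mmd2 is assumed to compute; the function name, signature and multi-bandwidth mixture below are illustrative, not the repository's actual implementation.

import torch

def rbf_mmd2_sketch(x, y, sigma_list=(1.0, 2.0, 4.0)):
    """Biased MMD^2 estimate between expert encodings x and generator encodings y."""
    z = torch.cat([x, y], dim=0)                      # stack both batches
    sq = torch.cdist(z, z, p=2).pow(2)                # pairwise squared distances
    k = sum(torch.exp(-sq / (2.0 * s ** 2)) for s in sigma_list)  # mixture of RBF kernels
    n = x.shape[0]
    k_xx = k[:n, :n].mean()                           # expert-expert block
    k_xy = k[:n, n:].mean()                           # expert-generator block
    k_yy = k[n:, n:].mean()                           # generator-generator block
    return k_xx - 2.0 * k_xy + k_yy                   # MMD^2 = E[k_xx] - 2 E[k_xy] + E[k_yy]

A per-sample reward analogous to K[1] - K[2] above would be read row-wise from the expert-generator and generator-generator kernel blocks rather than from their means.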
Example #20
def update_params(batch, i_iter, opt):
    """update discriminator"""
    reirl_weights.write(
        reirl(expert_traj[:, :-action_dim], np.stack(batch.state), opt))
    value_net = Value(state_dim)
    optimizer_value = torch.optim.Adam(value_net.parameters(),
                                       lr=args.learning_rate)
    if i_iter > 0:
        j_max = 3  # if i_iter < 20 else 15
        for j in range(j_max):
            batch, log = ppo_agent.collect_samples(3000)
            print('{}\tT_sample {}\texpert_R_avg {}\tR_avg {}'.format(
                j, log['sample_time'], log['avg_c_reward'], log['avg_reward']))
            states = torch.from_numpy(np.stack(
                batch.state)).to(dtype).to(device)
            player_actions = torch.from_numpy(np.stack(
                batch.player_action)).to(dtype).to(device)
            opponent_actions = torch.from_numpy(np.stack(
                batch.opponent_action)).to(dtype).to(device)
            rewards = torch.from_numpy(np.stack(
                batch.reward)).to(dtype).to(device)
            masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
            with torch.no_grad():
                values = value_net(states)
                fixed_log_probs = policy_net.get_log_prob(
                    states, player_actions)
                opponent_fixed_log_probs = opponent_net.get_log_prob(
                    states, opponent_actions)
            """get advantage estimation from the trajectories"""
            advantages, returns = estimate_advantages(rewards, masks, values,
                                                      args.gamma, args.tau,
                                                      device)
            """perform mini-batch PPO update"""
            optim_iter_num = int(math.ceil(states.shape[0] / optim_batch_size))
            for _ in range(optim_epochs):
                perm = np.arange(states.shape[0])
                np.random.shuffle(perm)
                perm = LongTensor(perm).to(device)

                states, player_actions, opponent_actions, returns, advantages, fixed_log_probs, opponent_fixed_log_probs = \
                    states[perm].clone(), player_actions[perm].clone(), \
                    opponent_actions[perm].clone(), returns[perm].clone(), \
                    advantages[perm].clone(), \
                    fixed_log_probs[perm].clone(), opponent_fixed_log_probs[
                        perm].clone()

                for i in range(optim_iter_num):
                    ind = slice(
                        i * optim_batch_size,
                        min((i + 1) * optim_batch_size, states.shape[0]))
                    states_b, player_actions_b, opponent_actions_b, advantages_b, returns_b, fixed_log_probs_b, opponent_fixed_log_probs_b = \
                        states[ind], player_actions[ind], opponent_actions[ind], \
                        advantages[ind], returns[ind], fixed_log_probs[ind], \
                        opponent_fixed_log_probs[ind]

                    # Update the player
                    ppo_step(policy_net,
                             value_net,
                             optimizer_policy,
                             optimizer_value,
                             1,
                             states_b,
                             player_actions_b,
                             returns_b,
                             advantages_b,
                             fixed_log_probs_b,
                             args.clip_epsilon,
                             args.l2_reg,
                             max_grad=max_grad)
                    # Update the opponent
                    ppo_step(opponent_net,
                             value_net,
                             optimizer_opponent,
                             optimizer_value,
                             1,
                             states_b,
                             opponent_actions_b,
                             returns_b,
                             advantages_b,
                             opponent_fixed_log_probs_b,
                             args.clip_epsilon,
                             args.l2_reg,
                             opponent=True,
                             max_grad=max_grad)
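
Most of these snippets call estimate_advantages(rewards, masks, values, gamma, tau, device). Below is a hedged sketch of the generalized advantage estimation (GAE) recursion such a helper typically implements; the exact normalization and tensor shapes of the real helper may differ.

import torch

def estimate_advantages_sketch(rewards, masks, values, gamma, tau, device):
    """GAE over a flat rollout; mask = 0 marks an episode end, tau is the GAE lambda."""
    rewards = rewards.to(device).view(-1)
    masks = masks.to(device).view(-1)
    values = values.to(device).view(-1)

    advantages = torch.zeros_like(rewards)
    prev_value, prev_advantage = 0.0, 0.0
    for t in reversed(range(rewards.size(0))):
        delta = rewards[t] + gamma * prev_value * masks[t] - values[t]   # TD residual
        advantages[t] = delta + gamma * tau * prev_advantage * masks[t]  # GAE recursion
        prev_value, prev_advantage = values[t], advantages[t]

    returns = advantages + values                                        # value-function targets
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return advantages, returns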
Example #21
File: ddpg.py Project: kvas7andy/MAGAIL
class DDPG:
    def __init__(
        self,
        env=None,
        render=False,
        num_process=1,
        memory_size=1000000,
        lr_p=1e-3,
        lr_v=1e-3,
        gamma=0.99,
        polyak=0.995,
        explore_size=10000,
        batch_size=100,
        min_update_step=1000,
        update_step=50,
        action_noise=0.1,
        seed=1,
    ):
        self.env = env
        self.render = render
        self.gamma = gamma
        self.polyak = polyak
        self.memory = FixedMemory(memory_size)
        self.explore_size = explore_size
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_v = lr_v
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_step = update_step
        self.action_noise = action_noise
        self.seed = seed

        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.shape[0]

        self.action_low, self.action_high = self.env.action_space.low[
            0], self.env.action_space.high[0]
        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Actor(self.num_states, self.num_actions,
                                self.action_high).to(device)
        self.policy_net_target = Actor(self.num_states, self.num_actions,
                                       self.action_high).to(device)

        self.value_net = Value(self.num_states + self.num_actions).to(device)
        self.value_net_target = Value(self.num_states +
                                      self.num_actions).to(device)

        self.policy_net_target.load_state_dict(self.policy_net.state_dict())
        self.value_net_target.load_state_dict(self.value_net.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                      lr=self.lr_p)
        self.optimizer_v = optim.Adam(self.value_net.parameters(),
                                      lr=self.lr_v)

    def choose_action(self, state, noise_scale):
        """select action"""
        self.policy_net.eval()
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action = self.policy_net(state)
        self.policy_net.train()
        action = action.cpu().numpy()[0]
        # add noise
        noise = noise_scale * np.random.randn(self.num_actions)
        action += noise
        action = np.clip(action, -self.action_high, self.action_high)
        return action

    def eval(self, i_iter, render=False):
        """evaluate model"""
        self.policy_net.eval()
        self.value_net.eval()

        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            action = self.choose_action(state, 0)
            state, reward, done, _ = self.env.step(action)

            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter, step):
        """interact"""
        self.policy_net.train()
        self.value_net.train()

        state = self.env.reset()
        episode_reward = 0

        while True:
            if self.render:
                self.env.render()

            action = self.choose_action(state, self.action_noise)

            next_state, reward, done, _ = self.env.step(action)
            mask = 0 if done else 1
            # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
            self.memory.push(state, action, reward, next_state, mask)

            episode_reward += reward

            if step >= self.min_update_step and step % self.update_step == 0:
                for _ in range(self.update_step):
                    batch = self.memory.sample(
                        self.batch_size)  # random sample batch
                    self.update(batch)

            if done:
                break

            state = next_state

        self.env.close()

        print(f"Iter: {i_iter}, reward: {episode_reward}")

        # record reward information
        writer.add_scalar("ddpg/reward", episode_reward, i_iter)

    def update(self, batch):
        """learn model"""
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        # update by DDPG
        ddpg_step(self.policy_net, self.policy_net_target, self.value_net,
                  self.value_net_target, self.optimizer_p, self.optimizer_v,
                  batch_state, batch_action, batch_reward, batch_next_state,
                  batch_mask, self.gamma, self.polyak)

    def load(self, model_path):
        print(f"Loading Saved Model from {model_path}")
        self.policy_net, self.value_net = torch.load(model_path,
                                                     map_location=device)

    def save(self, save_path):
        if not os.path.exists(save_path):
            os.mkdir(save_path)
        """save model"""
        torch.save((self.policy_net, self.value_net),
                   f"{save_path}/WebEye_ddpg.pt")
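
DDPG.update above delegates to ddpg_step. A hedged sketch of what such a step usually does, assuming the argument order of the call in update (the real helper may differ): one critic step on the Bellman target, one actor step on -Q(s, pi(s)), then a polyak soft update of both target networks.

import torch
import torch.nn.functional as F

def ddpg_step_sketch(policy_net, policy_net_target, value_net, value_net_target,
                     optimizer_p, optimizer_v, states, actions, rewards,
                     next_states, masks, gamma, polyak):
    # critic target: r + gamma * mask * Q'(s', pi'(s'))
    with torch.no_grad():
        next_q = value_net_target(torch.cat([next_states, policy_net_target(next_states)], dim=1))
        target_q = rewards.unsqueeze(-1) + gamma * masks.unsqueeze(-1) * next_q

    # critic step: minimize the TD error
    q = value_net(torch.cat([states, actions], dim=1))
    value_loss = F.mse_loss(q, target_q)
    optimizer_v.zero_grad()
    value_loss.backward()
    optimizer_v.step()

    # actor step: maximize Q(s, pi(s))
    policy_loss = -value_net(torch.cat([states, policy_net(states)], dim=1)).mean()
    optimizer_p.zero_grad()
    policy_loss.backward()
    optimizer_p.step()

    # polyak soft update of both target networks
    with torch.no_grad():
        for net, net_t in ((policy_net, policy_net_target), (value_net, value_net_target)):
            for p, p_t in zip(net.parameters(), net_t.parameters()):
                p_t.mul_(polyak).add_((1.0 - polyak) * p)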
Example #22
running_state = ZFilter((state_dim, ), clip=5)
""" Seeding """
np.random.seed(exp_args["config"]["seed"])
torch.manual_seed(exp_args["config"]["seed"])
env.seed(exp_args["config"]["seed"])
""" define policy(actor) and critic(value function predictor) """

if is_discrete_action_space:
    policy_net = DiscretePolicy(state_dim, env.action_space.n,
                                exp_args["model"]["hidden"],
                                exp_args["model"]["activation"])
else:
    raise ValueError(
        "Policy for Continous Action Space is not implemented yet")

value_net = Value(state_dim, exp_args["model"]["hidden"],
                  exp_args["model"]["activation"])

policy_net.to(device)
value_net.to(device)

optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                    lr=exp_args["config"]["lr"])
optimizer_value = torch.optim.Adam(value_net.parameters(),
                                   lr=exp_args["config"]["lr"])
""" Create Agent """

agent = Agent(env,
              policy_net,
              device,
              running_state=running_state,
              render=exp_args["config"]["render"],
Example #23
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if use_gpu:
    torch.cuda.manual_seed_all(args.seed)

env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
action_dim = (1 if is_disc_action else env_dummy.action_space.shape[0])
ActionTensor = LongTensor if is_disc_action else DoubleTensor
"""define actor, critic and discriminator"""
if is_disc_action:
    policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
else:
    policy_net = Policy(state_dim, env_dummy.action_space.shape[0])
value_net = Value(state_dim)
discrim_net = Discriminator(state_dim + action_dim)
discrim_criterion = nn.BCELoss()
if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
    discrim_net = discrim_net.cuda()
    discrim_criterion = discrim_criterion.cuda()

optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                    lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(),
                                   lr=args.learning_rate)
optimizer_discrim = torch.optim.Adam(discrim_net.parameters(),
                                     lr=args.learning_rate)
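
With discrim_criterion = nn.BCELoss() and the labeling convention used earlier (policy samples pushed toward 1, expert samples toward 0), a single GAIL-style discriminator update and the -log D(s, a) surrogate reward can be sketched as below; the helper name and return values are illustrative.

import torch

def gail_discriminator_step_sketch(discrim_net, discrim_criterion, optimizer_discrim,
                                   gen_state_actions, exp_state_actions):
    g_o = discrim_net(gen_state_actions)   # D(s, a) on policy samples (label 1)
    e_o = discrim_net(exp_state_actions)   # D(s, a) on expert samples (label 0)
    loss = discrim_criterion(g_o, torch.ones_like(g_o)) + \
           discrim_criterion(e_o, torch.zeros_like(e_o))
    optimizer_discrim.zero_grad()
    loss.backward()
    optimizer_discrim.step()

    # surrogate reward for the generator: largest when D scores a sample as expert-like
    with torch.no_grad():
        surrogate_reward = -torch.log(g_o.clamp_min(1e-8))
    return loss.item(), surrogate_reward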
Example #24
 def setUp(self) -> None:
     self.value = Value(6, drop_rate=0.5)
     self.value2 = Value(11, drop_rate=0.5)
     print(self.value)
Example #25
# running_reward = ZFilter((1,), demean=False, clip=10)
running_state = None
"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        policy_mgr = DiscretePolicy(state_dim, 4)
        policy_wrk = Policy(state_dim + subgoal_dim,
                            env.action_space.shape[0],
                            log_std=args.log_std)
    value_mgr = Value(state_dim)
    value_wrk = Value(state_dim + subgoal_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_mgr.to(device)
policy_wrk.to(device)
value_mgr.to(device)
value_wrk.to(device)

optim_policy_m = torch.optim.Adam(policy_mgr.parameters(), lr=0.01)
optim_policy_w = torch.optim.Adam(policy_wrk.parameters(), lr=0.01)
optim_value_m = torch.optim.Adam(value_mgr.parameters(), lr=0.01)
optim_value_w = torch.optim.Adam(value_wrk.parameters(), lr=0.01)

optim_epochs = 10
Example #26
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)

p_nets = []
v_nets = []
p_opts = []
v_opts = []
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        for i in range(env.n_agents):
            p_nets.append(
                DiscretePolicy(args.dec_agents, env.n_agents, state_dim,
                               env.action_space[0].n))
            v_nets.append(Value(env.n_agents, state_dim))
            # add only one policy and value networks if using team unified network settings.
            if args.dec_agents is False:
                break
    else:
        policy_net = Policy(state_dim,
                            env.action_space[0].n,
                            log_std=args.log_std)
else:
    p_nets, v_nets, running_state = pickle.load(open(args.model_path, "rb"))

dtype = torch.float64
torch.set_default_dtype(dtype)
device = torch.device('cpu')

for i in range(env.n_agents):
Example #27
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
running_state = ZFilter((state_dim, ), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim,
                            env_dummy.action_space.shape[0],
                            log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)
del env_dummy
"""create agent"""
agent = Agent(env_factory,
              policy_net,
              device,
              running_state=running_state,
              render=args.render,
              num_threads=args.num_threads)
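
ZFilter appears throughout these snippets as an online observation normalizer. A hedged sketch of its core behavior, a Welford running mean/variance followed by clipping of the standardized observation; the real class exposes more options (e.g. the demean flag seen in the commented-out line above).

import numpy as np

class ZFilterSketch:
    """Running mean/std observation filter; standardized output clipped to +/- clip."""
    def __init__(self, shape, clip=5.0):
        self.n = 0
        self.mean = np.zeros(shape)
        self.var = np.zeros(shape)
        self.clip = clip

    def __call__(self, x):
        x = np.asarray(x, dtype=np.float64)
        self.n += 1
        delta = x - self.mean
        self.mean = self.mean + delta / self.n                                # Welford mean
        self.var = self.var + (delta * (x - self.mean) - self.var) / self.n   # Welford variance
        z = (x - self.mean) / (np.sqrt(self.var) + 1e-8)
        return np.clip(z, -self.clip, self.clip)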

Example #28
def train(**kwargs):
    print('here')
    config = {
        "lr": kwargs['lr'],
        "gamma": kwargs['gamma']
    }
    dtype = torch.float64
    torch.set_default_dtype(dtype)
    device = torch.device('cuda', index=args.gpu_index) if torch.cuda.is_available() else torch.device('cpu')
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu_index)

    """environment"""
    env = gym.make(args.env_name)
    state_dim = env.observation_space.shape[0]
    is_disc_action = len(env.action_space.shape) == 0
    running_state = ZFilter((state_dim,), clip=5)
    # running_reward = ZFilter((1,), demean=False, clip=10)

    """seeding"""
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    env.seed(args.seed)

    """define actor and critic"""
    if args.model_path is None:
        if is_disc_action:
            policy_net = DiscretePolicy(state_dim, env.action_space.n)
        else:
            policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std)
        value_net = Value(state_dim)
    else:
        policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
    policy_net.to(device)
    value_net.to(device)

    # optimization epoch number and batch size for PPO
    optim_epochs = 10
    optim_batch_size = 64

    """create agent"""
    agent = Agent(env, policy_net, device, running_state=running_state, render=args.render, num_threads=args.num_threads)
    def update_params(batch, i_iter, config):
        states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)

        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values, config['gamma'], args.tau, device)

        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / optim_batch_size))
        for _ in range(optim_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).to(device)

            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), fixed_log_probs[perm].clone()

            for i in range(optim_iter_num):
                ind = slice(i * optim_batch_size, min((i + 1) * optim_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]

                ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, 1, states_b, actions_b, returns_b,
                        advantages_b, fixed_log_probs_b, args.clip_epsilon, args.l2_reg)

    def main_loop(config):
        optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=config['lr'])
        optimizer_value = torch.optim.Adam(value_net.parameters(), lr=config['lr'])
        for i_iter in range(args.max_iter_num):
            """generate multiple trajectories that reach the minimum batch_size"""
            batch, log = agent.collect_samples(args.min_batch_size)
            t0 = time.time()
            update_params(batch, i_iter, config)
            t1 = time.time()

            if i_iter % args.log_interval == 0:
                print('{}\tT_sample {:.4f}\tT_update {:.4f}\tR_min {:.2f}\tR_max {:.2f}\tR_avg {:.2f}'.format(
                    i_iter, log['sample_time'], t1-t0, log['min_reward'], log['max_reward'], log['avg_reward']))

            if args.save_model_interval > 0 and (i_iter+1) % args.save_model_interval == 0:
                to_device(torch.device('cpu'), policy_net, value_net)
                pickle.dump((policy_net, value_net, running_state),
                            open(os.path.join(assets_dir(), 'learned_models/{}_ppo.p'.format(args.env_name)), 'wb'))
                to_device(device, policy_net, value_net)

            """clean up gpu memory"""
            torch.cuda.empty_cache()
        return agent.evaluate()

    print('a')
    print(config)
    print(args)
    return main_loop(config)
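
ppo_step drives every PPO update loop in these examples. A hedged sketch of the clipped surrogate objective it is assumed to optimize for the policy; judging from its call sites, the real helper also updates the value network with an l2_reg penalty and optionally clips gradients (max_grad).

import torch

def ppo_policy_loss_sketch(policy_net, states, actions, advantages,
                           fixed_log_probs, clip_epsilon):
    log_probs = policy_net.get_log_prob(states, actions).view(-1)  # log pi_new(a|s)
    ratio = torch.exp(log_probs - fixed_log_probs.view(-1))        # pi_new / pi_old
    advantages = advantages.view(-1)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
    return -torch.min(surr1, surr2).mean()                         # negate: optimizers minimize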
Example #29
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
running_state = ZFilter((state_dim, ), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim,
                            env_dummy.action_space.shape[0],
                            log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=0.01)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=0.01)
"""create agent"""
agent = Agent(env_factory,
              policy_net,
              device,
              running_state=running_state,
              render=args.render,
Example #30
class SAC_Alpha:
    def __init__(self,
                 env,
                 render=False,
                 num_process=1,
                 memory_size=1000000,
                 lr_p=1e-3,
                 lr_a=3e-4,
                 lr_q=1e-3,
                 gamma=0.99,
                 polyak=0.995,
                 batch_size=100,
                 min_update_step=1000,
                 update_step=50,
                 target_update_delay=1,
                 seed=1,
                 ):
        self.env = env
        self.gamma = gamma
        self.polyak = polyak
        self.memory = FixedMemory(memory_size)
        self.render = render
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_a = lr_a
        self.lr_q = lr_q
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_step = update_step
        self.target_update_delay = target_update_delay
        self.seed = seed

        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.shape[0]

        self.action_low, self.action_high = self.env.action_space.low[0], self.env.action_space.high[0]

        self.target_entropy = - np.prod(self.env.action_space.shape)
        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Actor(self.num_states, self.num_actions, action_limit=self.action_high).to(device)

        self.q_net_1 = Value(self.num_states + self.num_actions).to(device)
        self.q_net_target_1 = Value(self.num_states + self.num_actions).to(device)
        self.q_net_2 = Value(self.num_states + self.num_actions).to(device)
        self.q_net_target_2 = Value(self.num_states + self.num_actions).to(device)

        # self.alpha init
        self.alpha = torch.exp(torch.zeros(1, device=device)).requires_grad_()

        self.q_net_target_1.load_state_dict(self.q_net_1.state_dict())
        self.q_net_target_2.load_state_dict(self.q_net_2.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p)
        self.optimizer_a = optim.Adam([self.alpha], lr=self.lr_a)
        self.optimizer_q_1 = optim.Adam(self.q_net_1.parameters(), lr=self.lr_q)
        self.optimizer_q_2 = optim.Adam(self.q_net_2.parameters(), lr=self.lr_q)

    def choose_action(self, state):
        """select action"""
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, _ = self.policy_net.get_action_log_prob(state)
        action = action.cpu().numpy()[0]
        return action, None

    def eval(self, i_iter, render=False):
        """evaluate model"""
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            action, _ = self.choose_action(state)
            state, reward, done, _ = self.env.step(action)

            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter, step):
        """interact"""

        state = self.env.reset()
        episode_reward = 0

        while True:

            if self.render:
                self.env.render()

            action, _ = self.choose_action(state)

            next_state, reward, done, _ = self.env.step(action)
            mask = 0 if done else 1
            # ('state', 'action', 'reward', 'next_state', 'mask')
            self.memory.push(state, action, reward, next_state, mask)

            episode_reward += reward

            if step >= self.min_update_step and step % self.update_step == 0:
                for k in range(1, self.update_step + 1):
                    batch = self.memory.sample(self.batch_size)  # random sample batch
                    self.update(batch, k)

            if done:
                break

            state = next_state

        self.env.close()

        print(f"Iter: {i_iter}, reward: {episode_reward}")
        # record reward information
        writer.add_scalar("sac_alpha/reward", episode_reward, i_iter)

    def update(self, batch, k_iter):
        """learn model"""
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        # update by SAC Alpha
        sac_alpha_step(self.policy_net, self.q_net_1, self.q_net_2, self.alpha, self.q_net_target_1,
                       self.q_net_target_2,
                       self.optimizer_p, self.optimizer_q_1, self.optimizer_q_2, self.optimizer_a, batch_state,
                       batch_action, batch_reward, batch_next_state, batch_mask, self.gamma, self.polyak,
                       self.target_entropy,
                       k_iter % self.target_update_delay == 0)

    def load(self, model_path):
        print(f"Loading Saved Model from {model_path}")
        self.policy_net, self.q_net_1, self.q_net_2, self.alpha = torch.load(model_path, map_location=device)

    def save(self, save_path):
        """save model"""
        if not os.path.exists(save_path):
            os.mkdir(save_path)

        torch.save((self.policy_net, self.q_net_1, self.q_net_2, self.alpha), f"{save_path}/WebEye_sac_alpha.pt")
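
sac_alpha_step receives the learnable temperature alpha, its optimizer, and target_entropy. A hedged sketch of the temperature update and the delayed polyak target update such a step typically performs; this is not the repository's actual implementation.

import torch

def alpha_and_target_update_sketch(policy_net, alpha, optimizer_a,
                                   q_net, q_net_target,
                                   states, target_entropy, polyak, update_target):
    # temperature: push the policy entropy toward target_entropy = -prod(action_shape)
    _, log_prob = policy_net.get_action_log_prob(states)
    alpha_loss = -(alpha * (log_prob + target_entropy).detach()).mean()
    optimizer_a.zero_grad()
    alpha_loss.backward()
    optimizer_a.step()

    # delayed soft update of a target Q-network, same polyak rule as in DDPG
    if update_target:
        with torch.no_grad():
            for p, p_t in zip(q_net.parameters(), q_net_target.parameters()):
                p_t.mul_(polyak).add_((1.0 - polyak) * p)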