Example #1
    def __init__(self,
                 args,
                 state_dim,
                 action_dim,
                 is_dict_action=False,
                 is_atari=False):

        self.device = args.device
        self.config = args
        if is_atari:
            self.actor = CNNPolicy(state_dim, action_dim).to(self.device)
            self.critic = CNNCritic(state_dim).to(self.device)
        else:
            self.actor = DiscretePolicy(state_dim, action_dim).to(self.device) if is_dict_action else \
                Policy(state_dim, action_dim, log_std=self.config.log_std).to(self.device)
            self.critic = Value(state_dim).to(self.device)

        # initialize optimizer for actor and critic
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.learning_rate)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.learning_rate)

        # optimization epoch number and batch size for PPO
        self.optim_epochs = 10
        self.optim_batch_size = 64
Example #2
    def __init__(self,
                 args,
                 state_dim,
                 action_dim,
                 is_dict_action=False,
                 is_atari=False):

        self.device = args.device
        self.config = args

        self.is_dict_action = is_dict_action
        self.is_atari = is_atari

        self.state_dim = state_dim

        self.actor = DiscretePolicy(state_dim, action_dim).to(self.device) if is_dict_action else \
            Policy(state_dim, action_dim, log_std=self.config.log_std).to(self.device)

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.learning_rate)
        self.actor_loss = nn.CrossEntropyLoss() if self.is_dict_action \
            else nn.MSELoss()
Example #3
    return env


np.random.seed(args.seed)
torch.manual_seed(args.seed)
if use_gpu:
    torch.cuda.manual_seed_all(args.seed)

env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
action_dim = (1 if is_disc_action else env_dummy.action_space.shape[0])
ActionTensor = LongTensor if is_disc_action else DoubleTensor
"""define actor, critic and discrimiator"""
if is_disc_action:
    policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
else:
    policy_net = Policy(state_dim, env_dummy.action_space.shape[0])
value_net = Value(state_dim)
discrim_net = Discriminator(state_dim + action_dim)
discrim_criterion = nn.BCELoss()
if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
    discrim_net = discrim_net.cuda()
    discrim_criterion = discrim_criterion.cuda()

optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                    lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(),
                                   lr=args.learning_rate)
Example #4
state = env.reset()
state_dim = state['observation'].shape[0] + state['desired_goal'].shape[0]
subgoal_dim = state['achieved_goal'].shape[0]

is_disc_action = len(env.action_space.shape) == 0
#running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
running_state = None
"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        policy_mgr = Policy(state_dim,
                            subgoal_dim,
                            log_std=args.log_std,
                            activation_factor=5)
        policy_wrk = Policy(state_dim - subgoal_dim,
                            env.action_space.shape[0],
                            log_std=args.log_std)
    value_mgr = Value(state_dim)
    value_wrk = Value(state_dim - subgoal_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_mgr.to(device)
policy_wrk.to(device)
Example #5
def create_networks():
    """define actor and critic"""
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim,
                                    env.action_space.n,
                                    hidden_size=(64, 32),
                                    activation='relu')
    else:
        policy_net = Policy(state_dim,
                            env.action_space.shape[0],
                            log_std=args.log_std,
                            hidden_size=(64, 32),
                            activation='relu')
    value_net = Value(state_dim, hidden_size=(32, 16), activation='relu')
    if args.WGAN:
        discrim_net = SNDiscriminator(state_dim + action_dim,
                                      hidden_size=(32, 16),
                                      activation='relu')
    elif args.EBGAN or args.GMMIL:
        discrim_net = AESNDiscriminator(state_dim + action_dim,
                                        hidden_size=(32, ),
                                        encode_size=64,
                                        activation='relu',
                                        slope=0.1,
                                        dropout=False,
                                        dprob=0.2)
    elif args.GEOMGAN:
        # new kernel
        #discrim_net = KernelNet(state_dim + action_dim,state_dim + action_dim)
        noise_dim = 64
        discrim_net = AESNDiscriminator(state_dim + action_dim,
                                        hidden_size=(32, ),
                                        encode_size=noise_dim,
                                        activation='relu',
                                        slope=0.1,
                                        dropout=False,
                                        dprob=0.2)
        kernel_net = NoiseNet(noise_dim,
                              hidden_size=(32, ),
                              encode_size=noise_dim,
                              activation='relu',
                              slope=0.1,
                              dropout=False,
                              dprob=0.2)
        optimizer_kernel = torch.optim.Adam(kernel_net.parameters(),
                                            lr=args.learning_rate / 2)
        scheduler_kernel = MultiStepLR(optimizer_kernel,
                                       milestones=args.milestones,
                                       gamma=args.lr_decay)
    else:
        discrim_net = Discriminator(state_dim + action_dim,
                                    hidden_size=(32, 16),
                                    activation='relu')

    optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                        lr=args.learning_rate)
    optimizer_value = torch.optim.Adam(value_net.parameters(),
                                       lr=args.learning_rate)
    optimizer_discrim = torch.optim.Adam(discrim_net.parameters(),
                                         lr=args.learning_rate)

    scheduler_policy = MultiStepLR(optimizer_policy,
                                   milestones=args.milestones,
                                   gamma=args.lr_decay)
    scheduler_value = MultiStepLR(optimizer_value,
                                  milestones=args.milestones,
                                  gamma=args.lr_decay)
    scheduler_discrim = MultiStepLR(optimizer_discrim,
                                    milestones=args.milestones,
                                    gamma=args.lr_decay)

    if args.WGAN:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -discrim_net(state_action)[0].item()
                    # return -discrim_net(state_action).sum().item()

        learned_reward = ExpertReward()
    elif args.EBGAN:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    _, recon_out = discrim_net(state_action)
                    return -elementwise_loss(
                        recon_out, state_action).item() + args.r_margin

        learned_reward = ExpertReward()
    elif args.GMMIL or args.GEOMGAN:

        class ExpertReward():
            def __init__(self):
                self.r_bias = 0

            def expert_reward(self, state, action):
                with torch.no_grad():
                    return self.r_bias

            def update_XX_YY(self):
                self.XX = torch.diag(torch.mm(self.e_o_enc, self.e_o_enc.t()))
                self.YY = torch.diag(torch.mm(self.g_o_enc, self.g_o_enc.t()))

        learned_reward = ExpertReward()
    else:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -math.log(discrim_net(state_action)[0].item())

        learned_reward = ExpertReward()
    """create agent"""
    agent = Agent(env,
                  policy_net,
                  device,
                  custom_reward=learned_reward,
                  running_state=None,
                  render=args.render,
                  num_threads=args.num_threads)

    def update_params(batch, i_iter):
        dataSize = min(args.min_batch_size, len(batch.state))
        states = torch.from_numpy(np.stack(
            batch.state)[:dataSize, ]).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(
            batch.action)[:dataSize, ]).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(
            batch.reward)[:dataSize, ]).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(
            batch.mask)[:dataSize, ]).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)
        """estimate reward"""
        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values,
                                                  args.gamma, args.tau, device)
        """update discriminator"""
        for _ in range(args.discriminator_epochs):
            #dataSize = states.size()[0]
            # expert_state_actions = torch.from_numpy(expert_traj).to(dtype).to(device)
            exp_idx = random.sample(range(expert_traj.shape[0]), dataSize)
            expert_state_actions = torch.from_numpy(
                expert_traj[exp_idx, :]).to(dtype).to(device)

            dis_input_real = expert_state_actions
            if len(actions.shape) == 1:
                actions.unsqueeze_(-1)
                dis_input_fake = torch.cat([states, actions], 1)
                actions.squeeze_(-1)
            else:
                dis_input_fake = torch.cat([states, actions], 1)

            if args.EBGAN or args.GMMIL or args.GEOMGAN:
                # TBD: no discriminator learning for these branches yet
                pass
            else:
                g_o = discrim_net(dis_input_fake)
                e_o = discrim_net(dis_input_real)

            optimizer_discrim.zero_grad()
            if args.GEOMGAN:
                optimizer_kernel.zero_grad()

            if args.WGAN:
                if args.LSGAN:
                    pdist = l1dist(dis_input_real,
                                   dis_input_fake).mul(args.lamb)
                    discrim_loss = LeakyReLU(e_o - g_o + pdist).mean()
                else:
                    discrim_loss = torch.mean(e_o) - torch.mean(g_o)
            elif args.EBGAN:
                e_recon = elementwise_loss(e_o, dis_input_real)
                g_recon = elementwise_loss(g_o, dis_input_fake)
                discrim_loss = e_recon
                if (args.margin - g_recon).item() > 0:
                    discrim_loss += (args.margin - g_recon)
            elif args.GMMIL:
                #mmd2_D,K = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                mmd2_D, K = mix_rbf_mmd2(dis_input_real, dis_input_fake,
                                         args.sigma_list)
                #tbd
                #rewards = K[0]+K[1]-2*K[2]
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards.detach()  # exp - gen, maximize (gen label negative)
                errD = mmd2_D
                discrim_loss = -errD  # maximize errD

                # prep for generator
                advantages, returns = estimate_advantages(
                    rewards, masks, values, args.gamma, args.tau, device)
            elif args.GEOMGAN:
                # larger, better, but slower
                noise_num = 100
                mmd2_D, K = mix_imp_mmd2(e_o_enc, g_o_enc, noise_num,
                                         noise_dim, kernel_net, cuda)
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards.detach()
                errD = mmd2_D  #+ args.lambda_rg * one_side_errD
                discrim_loss = -errD  # maximize errD

                # prep for generator
                advantages, returns = estimate_advantages(
                    rewards, masks, values, args.gamma, args.tau, device)
            else:
                discrim_loss = discrim_criterion(g_o, ones((states.shape[0], 1), device=device)) + \
                               discrim_criterion(e_o, zeros((e_o.shape[0], 1), device=device))
            if not (args.EBGAN or args.GMMIL or args.GEOMGAN):
                # those branches are still TBD above, so only step the
                # discriminator when a differentiable loss was built
                discrim_loss.backward()
                optimizer_discrim.step()
            if args.GEOMGAN:
                optimizer_kernel.step()
        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / args.ppo_batch_size))
        for _ in range(args.generator_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).to(device)

            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), \
                fixed_log_probs[perm].clone()

            for i in range(optim_iter_num):
                ind = slice(
                    i * args.ppo_batch_size,
                    min((i + 1) * args.ppo_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]

                ppo_step(policy_net, value_net, optimizer_policy,
                         optimizer_value, 1, states_b, actions_b, returns_b,
                         advantages_b, fixed_log_probs_b, args.clip_epsilon,
                         args.l2_reg)

        return rewards

    if args.GEOMGAN:
        return (policy_net, value_net, discrim_net, kernel_net,
                optimizer_policy, optimizer_value, optimizer_discrim,
                optimizer_kernel, agent, update_params,
                scheduler_policy, scheduler_value, scheduler_discrim,
                scheduler_kernel)
    else:
        return (policy_net, value_net, discrim_net,
                optimizer_policy, optimizer_value, optimizer_discrim,
                agent, update_params,
                scheduler_policy, scheduler_value, scheduler_discrim)
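
A minimal driver sketch for the non-GEOMGAN return signature of create_networks() above; it is not part of the original example, and the training loop, args.max_iter_num and agent.collect_samples() follow the pattern of the other examples in this list rather than this snippet.

# Hypothetical driver (assumption, not original code): unpack the returned
# networks/optimizers/schedulers and run the usual collect/update loop.
(policy_net, value_net, discrim_net,
 optimizer_policy, optimizer_value, optimizer_discrim,
 agent, update_params,
 scheduler_policy, scheduler_value, scheduler_discrim) = create_networks()

for i_iter in range(args.max_iter_num):
    batch, log = agent.collect_samples(args.min_batch_size)
    rewards = update_params(batch, i_iter)
    scheduler_policy.step()
    scheduler_value.step()
    scheduler_discrim.step()
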
Example #6
def train(**kwargs):
    print('here')
    config = {
        "lr": kwargs['lr'],
        "gamma": kwargs['gamma']
    }
    dtype = torch.float64
    torch.set_default_dtype(dtype)
    device = torch.device('cuda', index=args.gpu_index) if torch.cuda.is_available() else torch.device('cpu')
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu_index)

    """environment"""
    env = gym.make(args.env_name)
    state_dim = env.observation_space.shape[0]
    is_disc_action = len(env.action_space.shape) == 0
    running_state = ZFilter((state_dim,), clip=5)
    # running_reward = ZFilter((1,), demean=False, clip=10)

    """seeding"""
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    env.seed(args.seed)

    # """define actor and critic"""
    if args.model_path is None:
        if is_disc_action:
            policy_net = DiscretePolicy(state_dim, env.action_space.n)
        else:
            policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std)
        value_net = Value(state_dim)
    else:
        policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
    policy_net.to(device)
    value_net.to(device)

    # optimization epoch number and batch size for PPO
    optim_epochs = 10
    optim_batch_size = 64

    """create agent"""
    agent = Agent(env, policy_net, device, running_state=running_state, render=args.render, num_threads=args.num_threads)
    def update_params(batch, i_iter, config):
        states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)

        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values, config['gamma'], args.tau, device)

        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / optim_batch_size))
        for _ in range(optim_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).to(device)

            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), fixed_log_probs[perm].clone()

            for i in range(optim_iter_num):
                ind = slice(i * optim_batch_size, min((i + 1) * optim_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]

                ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, 1, states_b, actions_b, returns_b,
                        advantages_b, fixed_log_probs_b, args.clip_epsilon, args.l2_reg)

    # create the optimizers in this enclosing scope so update_params() can see them
    optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=config['lr'])
    optimizer_value = torch.optim.Adam(value_net.parameters(), lr=config['lr'])

    def main_loop(config):
        for i_iter in range(args.max_iter_num):
            """generate multiple trajectories that reach the minimum batch_size"""
            batch, log = agent.collect_samples(args.min_batch_size)
            t0 = time.time()
            update_params(batch, i_iter, config)
            t1 = time.time()

            if i_iter % args.log_interval == 0:
                print('{}\tT_sample {:.4f}\tT_update {:.4f}\tR_min {:.2f}\tR_max {:.2f}\tR_avg {:.2f}'.format(
                    i_iter, log['sample_time'], t1-t0, log['min_reward'], log['max_reward'], log['avg_reward']))

            if args.save_model_interval > 0 and (i_iter+1) % args.save_model_interval == 0:
                to_device(torch.device('cpu'), policy_net, value_net)
                pickle.dump((policy_net, value_net, running_state),
                            open(os.path.join(assets_dir(), 'learned_models/{}_ppo.p'.format(args.env_name)), 'wb'))
                to_device(device, policy_net, value_net)

        #     """clean up gpu memory"""
            torch.cuda.empty_cache()
        return agent.evaluate()

    print('a')
    print(config)
    print(args)
    return main_loop(config)
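
A minimal sketch of how train() above might be invoked; it is not from the original source and the hyperparameter values are illustrative only.

# Hypothetical call (assumption): train() only reads 'lr' and 'gamma' from its kwargs.
result = train(lr=3e-4, gamma=0.99)
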
Example #7
class PPO(object):
    def __init__(self,
                 args,
                 state_dim,
                 action_dim,
                 is_dict_action=False,
                 is_atari=False):

        self.device = args.device
        self.config = args
        if is_atari:
            self.actor = CNNPolicy(state_dim, action_dim).to(self.device)
            self.critic = CNNCritic(state_dim).to(self.device)
        else:
            self.actor = DiscretePolicy(state_dim, action_dim).to(self.device) if is_dict_action else \
                Policy(state_dim, action_dim, log_std=self.config.log_std).to(self.device)
            self.critic = Value(state_dim).to(self.device)

        # initialize optimizer for actor and critic
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.learning_rate)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.learning_rate)

        # optimization epoch number and batch size for PPO
        self.optim_epochs = 10
        self.optim_batch_size = 64

    def train(self, batch):
        """
        Train the policy using the given batch.
        :param batch:
        :return:
        """

        states = torch.DoubleTensor(np.stack(batch.state)).to(self.device)
        actions = torch.DoubleTensor(np.stack(batch.action)).to(self.device)
        rewards = torch.DoubleTensor(np.stack(batch.reward)).to(self.device)
        masks = torch.DoubleTensor(np.stack(batch.mask)).to(self.device)

        with torch.no_grad():
            values = self.critic(states)
            fixed_log_probs = self.actor.get_log_prob(states, actions)

        # get advantage estimation from the trajectories
        advantages, returns = estimate_advantages(rewards, masks, values,
                                                  self.config.gamma,
                                                  self.config.tau, self.device)

        # compute minibatch size
        optim_iter_num = int(math.ceil(states.shape[0] /
                                       self.optim_batch_size))

        # PPO updates
        for _ in range(self.optim_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = torch.LongTensor(perm).to(self.device)

            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), \
                fixed_log_probs[perm].clone()

            for i in range(optim_iter_num):
                ind = slice(
                    i * self.optim_batch_size,
                    min((i + 1) * self.optim_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]
                self.ppo_step(states_b, actions_b, returns_b, advantages_b,
                              fixed_log_probs_b)

    def ppo_step(self, states, actions, returns, advantages, fixed_log_probs):
        """
        A PPO policy gradient update step.
        :param states:
        :param actions:
        :param returns:
        :param advantages:
        :param fixed_log_probs:
        :return:
        """
        # update critic, for now assume one epoch
        values_pred = self.critic(states)
        value_loss = (values_pred - returns).pow(2).mean()
        # weight decay
        for param in self.critic.parameters():
            value_loss += param.pow(2).sum() * self.config.l2_reg
        self.critic_optimizer.zero_grad()
        value_loss.backward()
        self.critic_optimizer.step()

        # update actor
        log_probs = self.actor.get_log_prob(states, actions)
        ratio = torch.exp(log_probs - fixed_log_probs)
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1.0 - self.config.clip_epsilon,
                            1.0 + self.config.clip_epsilon) * advantages
        policy_surr = -torch.min(surr1, surr2).mean()
        self.actor.zero_grad()
        policy_surr.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 40)
        self.actor_optimizer.step()
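
A hedged smoke-test sketch for the PPO class above; the Batch layout, the SimpleNamespace fields and the random data are assumptions chosen to match what __init__ and train() read, not code from the original source.

# Hypothetical smoke test (assumption): feed PPO.train() a random batch with
# the .state/.action/.reward/.mask fields it expects.
import numpy as np
import torch
from collections import namedtuple
from types import SimpleNamespace

Batch = namedtuple('Batch', ['state', 'action', 'reward', 'mask'])

torch.set_default_dtype(torch.float64)  # train() builds DoubleTensors
args = SimpleNamespace(device='cpu', learning_rate=3e-4, log_std=0.0,
                       gamma=0.99, tau=0.95, clip_epsilon=0.2, l2_reg=1e-3)
ppo = PPO(args, state_dim=8, action_dim=2)

n = 256
batch = Batch(state=[np.random.randn(8) for _ in range(n)],
              action=[np.random.randn(2) for _ in range(n)],
              reward=[float(np.random.randn()) for _ in range(n)],
              mask=[1.0] * n)
ppo.train(batch)
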
Example #8
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if use_gpu:
    torch.cuda.manual_seed_all(args.seed)

env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
ActionTensor = LongTensor if is_disc_action else DoubleTensor

running_state = ZFilter((state_dim, ), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim,
                            env_dummy.action_space.shape[0],
                            log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
del env_dummy
"""create agent"""
agent = Agent(env_factory,
              policy_net,
Example #9
state = env.reset()
state_dim = state['observation'].shape[0] + state['desired_goal'].shape[0]
subgoal_dim = 3

is_disc_action = len(env.action_space.shape) == 0
#running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
running_state = None
"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        policy_mgr = DiscretePolicy(state_dim, 7)
        policy_wrk = Policy(state_dim + subgoal_dim,
                            env.action_space.shape[0],
                            log_std=args.log_std)
    value_mgr = Value(state_dim)
    value_wrk = Value(state_dim + subgoal_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_mgr.to(device)
policy_wrk.to(device)
value_mgr.to(device)
value_wrk.to(device)
Example #10
else:
    running_state = None

# if args.reward_running_state == 1:
#     running_reward = ZFilter((1,), demean=False, clip=10)
# else:
#     running_reward = None
"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
"""define actor and critic"""

if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)

    else:
        if args.sac_policy:
            policy_net = Policy_Tanh_Gaussian(state_dim,
                                              env.action_space.shape[0],
                                              hidden_size=(64, 64),
                                              log_std=args.log_std)

        else:
            policy_net = Policy(state_dim,
                                env.action_space.shape[0],
                                log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state, = pickle.load(
def create_networks():
    """define actor and critic"""
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim,
                                    env.action_space.n,
                                    hidden_size=(64, 32),
                                    activation='relu')
    else:
        policy_net = Policy(state_dim,
                            env.action_space.shape[0],
                            log_std=args.log_std,
                            hidden_size=(64, 32),
                            activation='relu')
    value_net = Value(state_dim, hidden_size=(32, 16), activation='relu')
    if args.AL:
        discrim_net = SNDiscriminator(state_dim + action_dim,
                                      hidden_size=(32, 16),
                                      activation='relu')
    elif args.EBGAN or args.GMMIL:
        discrim_net = AESNDiscriminator(state_dim + action_dim,
                                        hidden_size=(32, ),
                                        encode_size=64,
                                        activation='leakyrelu',
                                        slope=0.1,
                                        dropout=True,
                                        dprob=0.2)
    elif args.VAKLIL:
        noise_dim = 64
        mid_dim = 32
        discrim_net = VAEDiscriminator(state_dim + action_dim,
                                       num_outputs=noise_dim,
                                       sigmoid_out=False,
                                       sn=True,
                                       test=False,
                                       w_init=False,
                                       hidden_size_enc=(),
                                       hidden_size_dec=(),
                                       encode_size=mid_dim,
                                       activation='relu',
                                       dropout=False)
        kernel_net = NoiseNet(noise_dim,
                              hidden_size=(32, ),
                              encode_size=noise_dim,
                              activation='relu',
                              dropout=False)
        optimizer_kernel = torch.optim.Adam(kernel_net.parameters(),
                                            lr=args.learning_rate)
        scheduler_kernel = MultiStepLR(optimizer_kernel,
                                       milestones=args.milestones,
                                       gamma=args.lr_kernel_decay)
    else:
        discrim_net = Discriminator(state_dim + action_dim,
                                    hidden_size=(32, 16),
                                    activation='relu')

    optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                        lr=args.learning_rate)
    optimizer_value = torch.optim.Adam(value_net.parameters(),
                                       lr=args.learning_rate)
    optimizer_discrim = torch.optim.Adam(discrim_net.parameters(),
                                         lr=args.learning_rate)

    scheduler_policy = MultiStepLR(optimizer_policy,
                                   milestones=args.milestones,
                                   gamma=args.lr_decay)
    scheduler_value = MultiStepLR(optimizer_value,
                                  milestones=args.milestones,
                                  gamma=args.lr_decay)
    scheduler_discrim = MultiStepLR(optimizer_discrim,
                                    milestones=args.milestones,
                                    gamma=args.lr_kernel_decay)

    if args.AL:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -discrim_net(state_action)[0].item()

        learned_reward = ExpertReward()
    elif args.EBGAN:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    _, recon_out = discrim_net(state_action)
                    return -elementwise_loss(
                        recon_out, state_action).item() + args.r_margin

        learned_reward = ExpertReward()
    elif args.GMMIL or args.VAKLIL:

        class ExpertReward():
            def __init__(self):
                self.r_bias = 0

            def expert_reward(self, state, action):
                with torch.no_grad():
                    return self.r_bias

            def update_XX_YY(self):
                self.XX = torch.diag(torch.mm(self.e_o_enc, self.e_o_enc.t()))
                self.YY = torch.diag(torch.mm(self.g_o_enc, self.g_o_enc.t()))

        learned_reward = ExpertReward()
    else:

        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -math.log(discrim_net(state_action)[0].item())

        learned_reward = ExpertReward()
    """create agent"""
    agent = Agent(env,
                  policy_net,
                  device,
                  custom_reward=learned_reward,
                  running_state=None,
                  render=args.render,
                  num_threads=args.num_threads)

    def update_params(batch, i_iter):
        dataSize = min(args.min_batch_size, len(batch.state))
        states = torch.from_numpy(np.stack(
            batch.state)[:dataSize, ]).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(
            batch.action)[:dataSize, ]).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(
            batch.reward)[:dataSize, ]).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(
            batch.mask)[:dataSize, ]).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)
        """estimate reward"""
        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values,
                                                  args.gamma, args.tau, device)
        """update discriminator"""
        for _ in range(args.discriminator_epochs):
            exp_idx = random.sample(range(expert_traj.shape[0]), dataSize)
            expert_state_actions = torch.from_numpy(
                expert_traj[exp_idx, :]).to(dtype).to(device)

            dis_input_real = expert_state_actions
            if len(actions.shape) == 1:
                actions.unsqueeze_(-1)
                dis_input_fake = torch.cat([states, actions], 1)
                actions.squeeze_(-1)
            else:
                dis_input_fake = torch.cat([states, actions], 1)

            if args.EBGAN or args.GMMIL or args.VAKLIL:
                g_o_enc, g_mu, g_sigma = discrim_net(dis_input_fake,
                                                     mean_mode=False)
                e_o_enc, e_mu, e_sigma = discrim_net(dis_input_real,
                                                     mean_mode=False)
            else:
                g_o = discrim_net(dis_input_fake)
                e_o = discrim_net(dis_input_real)

            optimizer_discrim.zero_grad()
            if args.VAKLIL:
                optimizer_kernel.zero_grad()

            if args.AL:
                if args.LSGAN:
                    pdist = l1dist(dis_input_real,
                                   dis_input_fake).mul(args.lamb)
                    discrim_loss = LeakyReLU(e_o - g_o + pdist).mean()
                else:
                    discrim_loss = torch.mean(e_o) - torch.mean(g_o)
            elif args.EBGAN:
                e_recon = elementwise_loss(e_o, dis_input_real)
                g_recon = elementwise_loss(g_o, dis_input_fake)
                discrim_loss = e_recon
                if (args.margin - g_recon).item() > 0:
                    discrim_loss += (args.margin - g_recon)
            elif args.GMMIL:
                mmd2_D, K = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards.detach()  # exp - gen, maximize (gen label negative)
                errD = mmd2_D
                discrim_loss = -errD  # maximize errD

                advantages, returns = estimate_advantages(
                    rewards, masks, values, args.gamma, args.tau, device)
            elif args.VAKLIL:
                noise_num = 20000
                mmd2_D_net, _, penalty = mix_imp_with_bw_mmd2(
                    e_o_enc, g_o_enc, noise_num, noise_dim, kernel_net, cuda,
                    args.sigma_list)
                mmd2_D_rbf, _ = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                errD = (mmd2_D_net + mmd2_D_rbf) / 2
                # 1e-8: small number for numerical stability in the log
                i_c = 0.2
                mu = torch.cat((e_mu, g_mu), dim=0)
                sigma = torch.cat((e_sigma, g_sigma), dim=0)
                bottleneck_loss = torch.mean(0.5 * torch.sum(
                    mu**2 + sigma**2 - torch.log(sigma**2 + 1e-8) - 1,
                    dim=1)) - i_c
                discrim_loss = -errD + (args.beta * bottleneck_loss) + (
                    args.lambda_h * penalty)
            else:
                discrim_loss = discrim_criterion(g_o, ones((states.shape[0], 1), device=device)) + \
                               discrim_criterion(e_o, zeros((e_o.shape[0], 1), device=device))

            discrim_loss.backward()
            optimizer_discrim.step()
            if args.VAKLIL:
                optimizer_kernel.step()

        if args.VAKLIL:
            with torch.no_grad():
                noise_num = 20000
                g_o_enc, _, _ = discrim_net(dis_input_fake)
                e_o_enc, _, _ = discrim_net(dis_input_real)
                _, K_net, _ = mix_imp_with_bw_mmd2(e_o_enc, g_o_enc, noise_num,
                                                   noise_dim, kernel_net, cuda,
                                                   args.sigma_list)
                _, K_rbf = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                K = [sum(x) / 2 for x in zip(K_net, K_rbf)]
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards  #.detach()
                advantages, returns = estimate_advantages(
                    rewards, masks, values, args.gamma, args.tau, device)
        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / args.ppo_batch_size))
        for _ in range(args.generator_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).to(device)

            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), \
                fixed_log_probs[perm].clone()

            for i in range(optim_iter_num):
                ind = slice(
                    i * args.ppo_batch_size,
                    min((i + 1) * args.ppo_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]

                ppo_step(policy_net, value_net, optimizer_policy,
                         optimizer_value, 1, states_b, actions_b, returns_b,
                         advantages_b, fixed_log_probs_b, args.clip_epsilon,
                         args.l2_reg)

        return rewards

    if args.VAKLIL:
        return (policy_net, value_net, discrim_net, kernel_net,
                optimizer_policy, optimizer_value, optimizer_discrim,
                optimizer_kernel, agent, update_params,
                scheduler_policy, scheduler_value, scheduler_discrim,
                scheduler_kernel)
    else:
        return (policy_net, value_net, discrim_net,
                optimizer_policy, optimizer_value, optimizer_discrim,
                agent, update_params,
                scheduler_policy, scheduler_value, scheduler_discrim)
Example #12
                                   force=True,
                                   mode='training')
    return env


env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
ActionTensor = LongTensor if is_disc_action else DoubleTensor

running_state = ZFilter((state_dim, ), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim,
                            env_dummy.action_space.shape[0],
                            hidden_size=(500, 500),
                            activation='relu',
                            log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
    print('loaded pre_trained model!')

if args.based_model is True:
    policy_net.load_state_dict(
        torch.load(assets_dir() +
Example #13
# is_disc_action = len(env_dummy.action_space[0]) == 0
is_disc_action = True

# ActionTensor = LongTensor if is_disc_action else DoubleTensor
ActionTensor = LongTensor if is_disc_action else FloatTensor

running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)

"""define actor and critic"""
policy_net = []
value_net = []
if args.model_path is None:
    if is_disc_action:
        for i in range(env_dummy.n):
            policy_net.append(DiscretePolicy(obs_shape_n[i], act_shape_n[i]))
            # print(policy_net[i])
    else:
        policy_net = Policy(obs_shape_n[i], env_dummy.action_space.shape[0], log_std=args.log_std)
    # value_net = Value(state_dim)
    for i in range(env_dummy.n):
        value_net.append(Value(obs_shape_n[i]*env_dummy.n))
        # print(value_net[i])
else:
    # TODO
    policy_net, value_net = pickle.load(open(args.model_path, "rb"))
    # policy_net = [env_dummy.observation_space[i].shape[0] for i in range(env_dummy.n)]
if use_gpu:
    # policy_net = policy_net.cuda()
    # value_net = value_net.cuda()
    for i in range(env_dummy.n):
Example #14
is_disc_action = len(env.action_space[0].shape) == 0
"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)

p_nets = []
v_nets = []
p_opts = []
v_opts = []
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        for i in range(env.n_agents):
            p_nets.append(
                DiscretePolicy(args.dec_agents, env.n_agents, state_dim,
                               env.action_space[0].n))
            v_nets.append(Value(env.n_agents, state_dim))
            # add only one policy and one value network when using the unified team-network setting.
            if args.dec_agents is False:
                break
    else:
        policy_net = Policy(state_dim,
                            env.action_space[0].n,
                            log_std=args.log_std)
else:
    p_nets, v_nets, running_state = pickle.load(open(args.model_path, "rb"))

dtype = torch.float64
torch.set_default_dtype(dtype)
device = torch.device('cpu')
torch.manual_seed(args.seed)
if use_gpu:
    torch.cuda.manual_seed_all(args.seed)

env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
ActionTensor = LongTensor if is_disc_action else DoubleTensor

running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n, hidden_size=(20,20))
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0], log_std=args.log_std, hidden_size=(3,3))
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)

# optimization epoch number and batch size for PPO
optim_epochs = 5
Example #16
class BC(object):
    """
    A vanilla Behavior Cloning model.
    """
    def __init__(self,
                 args,
                 state_dim,
                 action_dim,
                 is_dict_action=False,
                 is_atari=False):

        self.device = args.device
        self.config = args

        self.is_dict_action = is_dict_action
        self.is_atari = is_atari

        self.state_dim = state_dim

        self.actor = DiscretePolicy(state_dim, action_dim).to(self.device) if is_dict_action else \
            Policy(state_dim, action_dim, log_std=self.config.log_std).to(self.device)

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.learning_rate)
        self.actor_loss = nn.CrossEntropyLoss() if self.is_dict_action \
            else nn.MSELoss()

    def set_expert(self, expert_traj, num_trajs):
        """
        Set the expert trajectories.
        :param expert_traj:
        :param num_trajs:
        :return:
        """
        # self.expert_traj_pool = expert_traj
        # self.expert_traj = np.vstack(expert_traj[:num_trajs])

    def set_expert2(self, expert_traj):
        """
        Set the expert trajectories.
        :param expert_traj:
        :return:
        """
        self.expert_traj = expert_traj

    def train(self):
        """
        :param num_traj:
        :return:
        """
        if self.expert_traj is not None:
            expert_traj = self.expert_traj
        expert_state_actions = torch.DoubleTensor(expert_traj)

        expert_states = expert_state_actions[:, :self.state_dim].to(
            self.device)
        expert_actions = expert_state_actions[:,
                                              self.state_dim:].to(self.device)

        predicted_actions = self.actor(expert_states)[0]

        self.actor_optimizer.zero_grad()
        loss = self.actor_loss(predicted_actions, expert_actions)
        loss.backward()
        self.actor_optimizer.step()

        return loss.to('cpu').detach().numpy()

    def train2(self):
        """
        :return:
        """
        if self.expert_traj is not None:
            expert_traj = self.expert_traj
        # expert_state_actions = torch.DoubleTensor(expert_traj)
        #
        # expert_states = expert_state_actions[:,:self.state_dim].to(self.device)
        # expert_actions = expert_state_actions[:,self.state_dim:].to(self.device)
        expert_states = torch.DoubleTensor(self.expert_traj.state)
        expert_actions = torch.DoubleTensor(self.expert_traj.action)
        predicted_actions = self.actor(expert_states)[0]

        self.actor_optimizer.zero_grad()
        loss = self.actor_loss(predicted_actions, expert_actions)
        loss.backward()
        self.actor_optimizer.step()

        return loss.to('cpu').detach().numpy()
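
A hedged usage sketch for the BC class above, assuming continuous actions; the SimpleNamespace fields and the random stand-in expert matrix are illustrative, not from the original source.

# Hypothetical usage (assumption): continuous actions and a random stand-in
# expert matrix of shape (N, state_dim + action_dim), as train() expects.
import numpy as np
import torch
from types import SimpleNamespace

torch.set_default_dtype(torch.float64)  # train() builds DoubleTensors
args = SimpleNamespace(device='cpu', learning_rate=1e-3, log_std=0.0)
bc = BC(args, state_dim=8, action_dim=2)

expert_traj = np.random.randn(1000, 8 + 2)
bc.set_expert2(expert_traj)
for epoch in range(100):
    loss = bc.train()
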
Example #17
state, _ = env.reset()
state_dim = state.shape[0]
subgoal_dim = 3

is_disc_action = len(env.action_space.shape) == 0
#running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
running_state = None
"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        policy_mgr = DiscretePolicy(state_dim, 4)
        policy_wrk = Policy(state_dim + subgoal_dim,
                            env.action_space.shape[0],
                            log_std=args.log_std)
    value_mgr = Value(state_dim)
    value_wrk = Value(state_dim + subgoal_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_mgr.to(device)
policy_wrk.to(device)
value_mgr.to(device)
value_wrk.to(device)
# if args.reward_running_state == 1:
#     running_reward = ZFilter((1,), demean=False, clip=10)
# else:
#     running_reward = None


"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)

"""define actor and critic"""

if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state,  = pickle.load(open(args.model_path, "rb"))


policy_net.to(device)
value_net.to(device)


optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)

# optimization epoch number and batch size for PPO
Example #19
device = torch.device(
    'cuda',
    index=args.gpu_index) if torch.cuda.is_available() else torch.device('cpu')
if torch.cuda.is_available():
    torch.cuda.set_device(args.gpu_index)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
running_state = ZFilter((state_dim, ), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim,
                            env_dummy.action_space.shape[0],
                            log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)
del env_dummy
"""create agent"""
agent = Agent(env_factory,
              policy_net,
              device,
Example #20
    torch.cuda.set_device(exp_args["config"]["gpu-index"])
""" environment """
env = gym.make(exp_args["config"]["env-name"])
state_dim = env.observation_space.shape[0]
is_discrete_action_space = len(
    env.action_space.shape) == 0  # shape is empty () for discrete environments
running_state = ZFilter((state_dim, ), clip=5)
""" Seeding """
np.random.seed(exp_args["config"]["seed"])
torch.manual_seed(exp_args["config"]["seed"])
env.seed(exp_args["config"]["seed"])
""" define policy(actor) and critic(value function predictor) """

if is_discrete_action_space:
    policy_net = DiscretePolicy(state_dim, env.action_space.n,
                                exp_args["model"]["hidden"],
                                exp_args["model"]["activation"])
else:
    raise ValueError(
        "Policy for Continous Action Space is not implemented yet")

value_net = Value(state_dim, exp_args["model"]["hidden"],
                  exp_args["model"]["activation"])

policy_net.to(device)
value_net.to(device)

optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                    lr=exp_args["config"]["lr"])
optimizer_value = torch.optim.Adam(value_net.parameters(),
                                   lr=exp_args["config"]["lr"])
Example #21
dtype = torch.float64
torch.set_default_dtype(dtype)
device = torch.device(
    'cuda',
    index=args.gpu_index) if torch.cuda.is_available() else torch.device('cpu')
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
running_state = ZFilter((state_dim, ), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim,
                            env_dummy.action_space.shape[0],
                            log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=0.01)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=0.01)
"""create agent"""
Example #22
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)
"""define actor and critic"""
if args.env_name == 'CarRacing-v0':
    num_aux = 1
else:
    num_aux = 0

if num_aux > 0:
    aux_running_state = ZFilter(num_aux, clip=5)
else:
    aux_running_state = None

if is_disc_action:
    policy_net = DiscretePolicy(state_dim, env.action_space.n)
elif is_img_state:
    policy_net = CNNPolicy(state_dim,
                           env.action_space.shape[0],
                           cnn_options['channels'],
                           cnn_options['kernel_sizes'],
                           cnn_options['strides'],
                           head_hidden_size=cnn_options['head_hidden_sizes'],
                           num_aux=num_aux,
                           log_std=args.log_std,
                           resnet_first_layer=args.cnn_resnet_first_layer)
else:
    policy_net = Policy(state_dim,
                        env.action_space.shape[0],
                        log_std=args.log_std)