Пример #1
0
    initialise_weights(critic)

    # Define optimisers
    opt_gen = optim.Adam(gen.parameters(), lr=LEARNING_RATE, betas=(0.0, 0.9))
    opt_critic = optim.Adam(critic.parameters(),
                            lr=LEARNING_RATE,
                            betas=(0.0, 0.9))

    # Setup torchvision
    fixed_noise = torch.randn(32, Z_DIM, 1, 1).to(device)
    writer_real = SummaryWriter('logs/real')
    writer_fake = SummaryWriter('logs/fake')
    step = 0

    gen.train()
    critic.train()

    # Network training
    for epoch in range(NUM_EPOCHS):
        # Clear CPU and GPU cache each epoch
        gc.collect()
        torch.cuda.empty_cache()

        print('Epoch {}/{}'.format(epoch + 1, NUM_EPOCHS))

        # Save model every 50 epochs
        if (epoch + 1) % 50 == 0:
            torch.save(gen, 'gen_epoch{}'.format(epoch + 1))

        # For each batch_idx...
        for batch_idx, (real, _) in enumerate(loader):
Пример #2
0
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0
        self.train_actions = args.train_actions
        if self.pic:
            self.nb_status = args.pic_status
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'use_bn':args.bn,
            'use_bn_affine':args.bn_affine,
            'init_method':args.init_method
        }
        if args.pic:
            self.cnn = CNN(3, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor) # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()

    def normalize(self, pic):
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self, train_actor = True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            print('label 1')
            print('size = ', state_batch.shape)
            state_batch = self.cnn(state_batch)
            print('label 2')
            next_state_batch = np.array([self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn(next_state_batch)
            next_q_values = self.critic_target([
                next_state_batch,
                self.actor_target(next_state_batch)
            ])
        else:
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic: self.cnn_optim.step()

        self.actor.zero_grad()

        if self.pic:
            state_batch.volatile = False
            policy_loss = -self.critic([
                state_batch,
                self.actor(state_batch)
            ])
        else:
            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actor(to_tensor(state_batch))
            ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad))

            if self.writer != None:
                mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()]))
                #print(mean_policy_grad)
                self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time)

        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        if self.pic:
            # print(s_t.shape)
            s_t = self.normalize(s_t)
            s_t = self.cnn(to_tensor(np.array([s_t])))
        self.eval()
        # print(s_t.shape)
        if self.pic:
            action = to_numpy(
                self.actor(s_t)
            ).squeeze(0)
        else:
            action = to_numpy(
                self.actor(to_tensor(np.array([s_t])))
            ).squeeze(0)
            
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)
        
        action = action * (1 - noise_level) + (self.random_process.sample() * noise_level)
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        if self.use_cuda:
            tmp = nn.Parameter(torch.FloatTensor(np.array([action])).cuda())
        else:
            tmp = nn.Parameter(torch.FloatTensor(np.array([action])))
        optim = Adam([tmp], lr=1e-3)
        if self.train_actions == True:
            for i in range(10):
                optim.zero_grad()
                loss = -self.critic([
                    to_tensor(np.array([s_t])), tmp
                ])
                loss.backward()
                optim.step()
                tmp = torch.clamp(tmp, -1., 1.)

        Q1 = self.critic([
            to_tensor(np.array([s_t]), volatile=True), to_tensor(np.array([action]), volatile=True)
        ])
        Q2 = self.critic([
            to_tensor(np.array([s_t]), volatile=True), tmp
        ])
        if self.writer != None:
            self.writer.add_scalar('train/d_Q', (Q2 - Q1).data.cpu().numpy(), self.select_time)
            self.select_time += 1

        for j in range(self.nb_actions):
            action[j] = tmp[0][j]
        action = np.clip(action, -1., 1.)
        self.a_t = action
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):        
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor{}.pkl'.format(output, num)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
Пример #3
0
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    print('state size:', state_size)
    print('action size:', action_size)

    actor = Actor(state_size, action_size, args)
    critic = Critic(state_size, args)
    critic_optimizer = optim.Adam(critic.parameters(), lr=args.critic_lr)

    writer = SummaryWriter(args.logdir)

    recent_rewards = deque(maxlen=100)
    episodes = 0

    for iter in range(args.max_iter_num):
        trajectories = deque()
        steps = 0

        while steps < args.total_sample_size:
            done = False
            score = 0
            episodes += 1

            state = env.reset()
            state = np.reshape(state, [1, state_size])

            while not done:
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state))
                action = get_action(mu, std)

                next_state, reward, done, _ = env.step(action)

                mask = 0 if done else 1

                trajectories.append((state, action, reward, mask))

                next_state = np.reshape(next_state, [1, state_size])
                state = next_state
                score += reward

                if done:
                    recent_rewards.append(score)

        actor.train(), critic.train()
        train_model(actor, critic, critic_optimizer, trajectories, state_size,
                    action_size)

        writer.add_scalar('log/score', float(score), episodes)

        if iter % args.log_interval == 0:
            print('{} iter | {} episode | score_avg: {:.2f}'.format(
                iter, episodes, np.mean(recent_rewards)))

        if np.mean(recent_rewards) > args.goal_score:
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)

            ckpt_path = args.save_path + 'model.pth.tar'
            torch.save(actor.state_dict(), ckpt_path)
            print('Recent rewards exceed -300. So end')
            break
Пример #4
0
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.writer = writer
        self.select_time = 0

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_method': args.init_method
        }

        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        #
        if self.use_cuda: self.cuda()

    def update_policy(self, train_actor=True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = nn.MSELoss()(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch),
             self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(),
                                          float(self.clip_actor_grad))

            if self.writer != None:
                mean_policy_grad = np.array(
                    np.mean([
                        np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                        for p in self.actor.parameters()
                    ]))
                #print(mean_policy_grad)
                self.writer.add_scalar('train/mean_policy_grad',
                                       mean_policy_grad, self.select_time)

        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def select_action(self,
                      s_t,
                      decay_epsilon=True,
                      return_fix=False,
                      noise_level=0):
        self.eval()
        # print(s_t.shape)
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)

        self.train()
        noise_level = noise_level * max(self.epsilon, 0)

        action = action * (1 - noise_level) + (self.random_process.sample() *
                                               noise_level)
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(),
                   '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(),
                   '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
Пример #5
0
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    vdb = VDB(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate) 
    vdb_optim = optim.Adam(vdb.parameters(), lr=args.learning_rate)
    
    # load demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)
    
    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        vdb.load_state_dict(ckpt['vdb'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0    

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size: 
            state = env.reset()
            score = 0

            state = running_state(state)
            
            for _ in range(10000): 
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(vdb, state, action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{} episode score is {:.2f}'.format(episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), vdb.train() 
        train_vdb(vdb, memory, vdb_optim, demonstrations, 0, args)
        train_ppo(actor, critic, memory, actor_optim, critic_optim, args)
Пример #6
0
class DDPG:
    def __init__(self, env, args):
        ob_space = env.observation_space
        goal_dim = env.goal_dim
        ob_dim = ob_space.shape[0]
        self.ob_dim = ob_dim
        self.ac_dim = ac_dim = 7
        self.goal_dim = goal_dim
        self.num_iters = args.num_iters
        self.random_prob = args.random_prob
        self.tau = args.tau
        self.reward_scale = args.reward_scale
        self.gamma = args.gamma

        self.log_interval = args.log_interval
        self.save_interval = args.save_interval
        self.rollout_steps = args.rollout_steps
        self.env = env
        self.batch_size = args.batch_size
        self.train_steps = args.train_steps
        self.closest_dist = np.inf
        self.warmup_iter = args.warmup_iter
        self.max_grad_norm = args.max_grad_norm
        self.use_her = args.her
        self.k_future = args.k_future
        self.model_dir = os.path.join(args.save_dir, 'model')
        self.pretrain_dir = args.pretrain_dir
        os.makedirs(self.model_dir, exist_ok=True)
        self.global_step = 0
        self.actor = Actor(ob_dim=ob_dim,
                           act_dim=ac_dim,
                           hid1_dim=args.hid1_dim,
                           hid2_dim=args.hid2_dim,
                           hid3_dim=args.hid3_dim,
                           init_method=args.init_method)
        self.critic = Critic(ob_dim=ob_dim,
                             act_dim=ac_dim,
                             hid1_dim=args.hid1_dim,
                             hid2_dim=args.hid2_dim,
                             hid3_dim=args.hid3_dim,
                             init_method=args.init_method)
        if args.resume or args.test or args.pretrain_dir is not None:
            self.load_model(args.resume_step, pretrain_dir=args.pretrain_dir)
        if not args.test:
            self.actor_target = Actor(ob_dim=ob_dim,
                                      act_dim=ac_dim,
                                      hid1_dim=args.hid1_dim,
                                      hid2_dim=args.hid2_dim,
                                      hid3_dim=args.hid3_dim,
                                      init_method=args.init_method)
            self.critic_target = Critic(ob_dim=ob_dim,
                                        act_dim=ac_dim,
                                        hid1_dim=args.hid1_dim,
                                        hid2_dim=args.hid2_dim,
                                        hid3_dim=args.hid3_dim,
                                        init_method=args.init_method)
            self.actor_optim = self.construct_optim(self.actor,
                                                    lr=args.actor_lr)
            cri_w_decay = args.critic_weight_decay
            self.critic_optim = self.construct_optim(self.critic,
                                                     lr=args.critic_lr,
                                                     weight_decay=cri_w_decay)
            self.hard_update(self.actor_target, self.actor)
            self.hard_update(self.critic_target, self.critic)

            self.actor_target.eval()
            self.critic_target.eval()
            if args.noise_type == 'ou_noise':
                mu = np.zeros(ac_dim)
                sigma = float(args.ou_noise_std) * np.ones(ac_dim)
                self.action_noise = OrnsteinUhlenbeckActionNoise(mu=mu,
                                                                 sigma=sigma)
            elif args.noise_type == 'uniform':
                low_limit = args.uniform_noise_low
                high_limit = args.uniform_noise_high
                dec_step = args.max_noise_dec_step
                self.action_noise = UniformNoise(low_limit=low_limit,
                                                 high_limit=high_limit,
                                                 dec_step=dec_step)

            elif args.noise_type == 'gaussian':
                mu = np.zeros(ac_dim)
                sigma = args.normal_noise_std * np.ones(ac_dim)
                self.action_noise = NormalActionNoise(mu=mu, sigma=sigma)

            self.memory = Memory(limit=int(args.memory_limit),
                                 action_shape=(int(ac_dim), ),
                                 observation_shape=(int(ob_dim), ))
            self.critic_loss = nn.MSELoss()
            self.ob_norm = args.ob_norm
            if self.ob_norm:
                self.obs_oms = OnlineMeanStd(shape=(1, ob_dim))
            else:
                self.obs_oms = None

        self.cuda()

    def test(self, render=False, record=True, slow_t=0):
        dist, succ_rate = self.rollout(render=render,
                                       record=record,
                                       slow_t=slow_t)
        print('Final step distance: ', dist)

    def train(self):
        self.net_mode(train=True)
        tfirststart = time.time()
        epoch_episode_rewards = deque(maxlen=1)
        epoch_episode_steps = deque(maxlen=1)
        total_rollout_steps = 0
        for epoch in range(self.global_step, self.num_iters):
            episode_reward = 0
            episode_step = 0
            self.action_noise.reset()
            obs = self.env.reset()
            obs = obs[0]
            epoch_actor_losses = []
            epoch_critic_losses = []
            if self.use_her:
                ep_experi = {
                    'obs': [],
                    'act': [],
                    'reward': [],
                    'new_obs': [],
                    'ach_goals': [],
                    'done': []
                }
            for t_rollout in range(self.rollout_steps):
                total_rollout_steps += 1
                ran = np.random.random(1)[0]
                if self.pretrain_dir is None and epoch < self.warmup_iter or \
                        ran < self.random_prob:
                    act = self.random_action().flatten()
                else:
                    act = self.policy(obs).flatten()
                new_obs, r, done, info = self.env.step(act)
                ach_goals = new_obs[1].copy()
                new_obs = new_obs[0].copy()
                episode_reward += r
                episode_step += 1
                self.memory.append(obs, act, r * self.reward_scale, new_obs,
                                   ach_goals, done)
                if self.use_her:
                    ep_experi['obs'].append(obs)
                    ep_experi['act'].append(act)
                    ep_experi['reward'].append(r * self.reward_scale)
                    ep_experi['new_obs'].append(new_obs)
                    ep_experi['ach_goals'].append(ach_goals)
                    ep_experi['done'].append(done)
                if self.ob_norm:
                    self.obs_oms.update(new_obs)
                obs = new_obs
            epoch_episode_rewards.append(episode_reward)
            epoch_episode_steps.append(episode_step)
            if self.use_her:
                for t in range(episode_step - self.k_future):
                    ob = ep_experi['obs'][t]
                    act = ep_experi['act'][t]
                    new_ob = ep_experi['new_obs'][t]
                    ach_goal = ep_experi['ach_goals'][t]
                    k_futures = np.random.choice(np.arange(
                        t + 1, episode_step),
                                                 self.k_future - 1,
                                                 replace=False)
                    k_futures = np.concatenate((np.array([t]), k_futures))
                    for future in k_futures:
                        new_goal = ep_experi['ach_goals'][future]
                        her_ob = np.concatenate(
                            (ob[:-self.goal_dim], new_goal), axis=0)
                        her_new_ob = np.concatenate(
                            (new_ob[:-self.goal_dim], new_goal), axis=0)
                        res = self.env.cal_reward(ach_goal.copy(), new_goal,
                                                  act)
                        her_reward, _, done = res
                        self.memory.append(her_ob, act,
                                           her_reward * self.reward_scale,
                                           her_new_ob, ach_goal.copy(), done)
            self.global_step += 1
            if epoch >= self.warmup_iter:
                for t_train in range(self.train_steps):
                    act_loss, cri_loss = self.train_net()
                    epoch_critic_losses.append(cri_loss)
                    epoch_actor_losses.append(act_loss)

            if epoch % self.log_interval == 0:
                tnow = time.time()
                stats = {}
                if self.ob_norm:
                    stats['ob_oms_mean'] = safemean(self.obs_oms.mean.numpy())
                    stats['ob_oms_std'] = safemean(self.obs_oms.std.numpy())
                stats['total_rollout_steps'] = total_rollout_steps
                stats['rollout/return'] = safemean(
                    [rew for rew in epoch_episode_rewards])
                stats['rollout/ep_steps'] = safemean(
                    [l for l in epoch_episode_steps])
                if epoch >= self.warmup_iter:
                    stats['actor_loss'] = np.mean(epoch_actor_losses)
                    stats['critic_loss'] = np.mean(epoch_critic_losses)
                stats['epoch'] = epoch
                stats['actor_lr'] = self.actor_optim.param_groups[0]['lr']
                stats['critic_lr'] = self.critic_optim.param_groups[0]['lr']
                stats['time_elapsed'] = tnow - tfirststart
                for name, value in stats.items():
                    logger.logkv(name, value)
                logger.dumpkvs()
            if (epoch == 0 or epoch >= self.warmup_iter) and \
                    self.save_interval and\
                    epoch % self.save_interval == 0 and \
                    logger.get_dir():
                mean_final_dist, succ_rate = self.rollout()
                logger.logkv('epoch', epoch)
                logger.logkv('test/total_rollout_steps', total_rollout_steps)
                logger.logkv('test/mean_final_dist', mean_final_dist)
                logger.logkv('test/succ_rate', succ_rate)

                tra_mean_dist, tra_succ_rate = self.rollout(train_test=True)
                logger.logkv('train/mean_final_dist', tra_mean_dist)
                logger.logkv('train/succ_rate', tra_succ_rate)

                # self.log_model_weights()
                logger.dumpkvs()
                if mean_final_dist < self.closest_dist:
                    self.closest_dist = mean_final_dist
                    is_best = True
                else:
                    is_best = False
                self.save_model(is_best=is_best, step=self.global_step)

    def train_net(self):
        batch_data = self.memory.sample(batch_size=self.batch_size)
        for key, value in batch_data.items():
            batch_data[key] = torch.from_numpy(value)
        obs0_t = batch_data['obs0']
        obs1_t = batch_data['obs1']
        obs0_t = self.normalize(obs0_t, self.obs_oms)
        obs1_t = self.normalize(obs1_t, self.obs_oms)
        obs0 = Variable(obs0_t).float().cuda()
        with torch.no_grad():
            vol_obs1 = Variable(obs1_t).float().cuda()

        rewards = Variable(batch_data['rewards']).float().cuda()
        actions = Variable(batch_data['actions']).float().cuda()
        terminals = Variable(batch_data['terminals1']).float().cuda()

        cri_q_val = self.critic(obs0, actions)
        with torch.no_grad():
            target_net_act = self.actor_target(vol_obs1)
            target_net_q_val = self.critic_target(vol_obs1, target_net_act)
            # target_net_q_val.volatile = False
            target_q_label = rewards
            target_q_label += self.gamma * target_net_q_val * (1 - terminals)
            target_q_label = target_q_label.detach()

        self.actor.zero_grad()
        self.critic.zero_grad()
        cri_loss = self.critic_loss(cri_q_val, target_q_label)
        cri_loss.backward()
        if self.max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm(self.critic.parameters(),
                                          self.max_grad_norm)
        self.critic_optim.step()

        self.critic.zero_grad()
        self.actor.zero_grad()
        net_act = self.actor(obs0)
        net_q_val = self.critic(obs0, net_act)
        act_loss = -net_q_val.mean()
        act_loss.backward()

        if self.max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(),
                                          self.max_grad_norm)
        self.actor_optim.step()

        self.soft_update(self.actor_target, self.actor, self.tau)
        self.soft_update(self.critic_target, self.critic, self.tau)
        return act_loss.cpu().data.numpy(), cri_loss.cpu().data.numpy()

    def normalize(self, x, stats):
        if stats is None:
            return x
        return (x - stats.mean) / stats.std

    def denormalize(self, x, stats):
        if stats is None:
            return x
        return x * stats.std + stats.mean

    def net_mode(self, train=True):
        if train:
            self.actor.train()
            self.critic.train()
        else:
            self.actor.eval()
            self.critic.eval()

    def load_model(self, step=None, pretrain_dir=None):
        model_dir = self.model_dir
        if pretrain_dir is not None:
            ckpt_file = os.path.join(self.pretrain_dir, 'model_best.pth')
        else:
            if step is None:
                ckpt_file = os.path.join(model_dir, 'model_best.pth')
            else:
                ckpt_file = os.path.join(model_dir,
                                         'ckpt_{:08d}.pth'.format(step))
        if not os.path.isfile(ckpt_file):
            raise ValueError("No checkpoint found at '{}'".format(ckpt_file))
        mutils.print_yellow('Loading checkpoint {}'.format(ckpt_file))
        checkpoint = torch.load(ckpt_file)
        if pretrain_dir is not None:
            actor_dict = self.actor.state_dict()
            critic_dict = self.critic.state_dict()
            actor_pretrained_dict = {
                k: v
                for k, v in checkpoint['actor_state_dict'].items()
                if k in actor_dict
            }
            critic_pretrained_dict = {
                k: v
                for k, v in checkpoint['critic_state_dict'].items()
                if k in critic_dict
            }
            actor_dict.update(actor_pretrained_dict)
            critic_dict.update(critic_pretrained_dict)
            self.actor.load_state_dict(actor_dict)
            self.critic.load_state_dict(critic_dict)
            self.global_step = 0
        else:
            self.actor.load_state_dict(checkpoint['actor_state_dict'])
            self.critic.load_state_dict(checkpoint['critic_state_dict'])
            self.global_step = checkpoint['global_step']
        if step is None:
            mutils.print_yellow('Checkpoint step: {}'
                                ''.format(checkpoint['ckpt_step']))

        self.warmup_iter += self.global_step
        mutils.print_yellow('Checkpoint loaded...')

    def save_model(self, is_best, step=None):
        if step is None:
            step = self.global_step
        ckpt_file = os.path.join(self.model_dir,
                                 'ckpt_{:08d}.pth'.format(step))
        data_to_save = {
            'ckpt_step': step,
            'global_step': self.global_step,
            'actor_state_dict': self.actor.state_dict(),
            'actor_optimizer': self.actor_optim.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'critic_optimizer': self.critic_optim.state_dict()
        }

        mutils.print_yellow('Saving checkpoint: %s' % ckpt_file)
        torch.save(data_to_save, ckpt_file)
        if is_best:
            torch.save(data_to_save,
                       os.path.join(self.model_dir, 'model_best.pth'))

    def rollout(self, train_test=False, render=False, record=False, slow_t=0):
        test_conditions = self.env.train_test_conditions \
            if train_test else self.env.test_conditions
        done_num = 0
        final_dist = []
        episode_length = []
        for idx in range(test_conditions):
            if train_test:
                obs = self.env.train_test_reset(cond=idx)
            else:
                obs = self.env.test_reset(cond=idx)
            for t_rollout in range(self.rollout_steps):
                obs = obs[0].copy()
                act = self.policy(obs, stochastic=False).flatten()
                obs, r, done, info = self.env.step(act)
                if render:
                    self.env.render()
                    if slow_t > 0:
                        time.sleep(slow_t)
                if done:
                    done_num += 1
                    break
            if record:
                print('dist: ', info['dist'])
            final_dist.append(info['dist'])
            episode_length.append(t_rollout)
        final_dist = np.array(final_dist)
        mean_final_dist = np.mean(final_dist)
        succ_rate = done_num / float(test_conditions)
        if record:
            with open('./test_data.json', 'w') as f:
                json.dump(final_dist.tolist(), f)

            print('\nDist statistics:')
            print("Minimum: {0:9.4f} Maximum: {1:9.4f}"
                  "".format(np.min(final_dist), np.max(final_dist)))
            print("Mean: {0:9.4f}".format(mean_final_dist))
            print("Standard Deviation: {0:9.4f}".format(np.std(final_dist)))
            print("Median: {0:9.4f}".format(np.median(final_dist)))
            print("First quartile: {0:9.4f}"
                  "".format(np.percentile(final_dist, 25)))
            print("Third quartile: {0:9.4f}"
                  "".format(np.percentile(final_dist, 75)))
            print('Success rate:', succ_rate)
        if render:
            while True:
                self.env.render()
        return mean_final_dist, succ_rate

    def log_model_weights(self):
        for name, param in self.actor.named_parameters():
            logger.logkv('actor/' + name, param.clone().cpu().data.numpy())
        for name, param in self.actor_target.named_parameters():
            logger.logkv('actor_target/' + name,
                         param.clone().cpu().data.numpy())
        for name, param in self.critic.named_parameters():
            logger.logkv('critic/' + name, param.clone().cpu().data.numpy())
        for name, param in self.critic_target.named_parameters():
            logger.logkv('critic_target/' + name,
                         param.clone().cpu().data.numpy())

    def random_action(self):
        act = np.random.uniform(-1., 1., self.ac_dim)
        return act

    def policy(self, obs, stochastic=True):
        self.actor.eval()
        ob = Variable(torch.from_numpy(obs)).float().cuda().view(1, -1)
        act = self.actor(ob)
        act = act.cpu().data.numpy()
        if stochastic:
            act = self.action_noise(act)
        self.actor.train()
        return act

    def cuda(self):
        self.critic.cuda()
        self.actor.cuda()
        if hasattr(self, 'critic_target'):
            self.critic_target.cuda()
            self.actor_target.cuda()
            self.critic_loss.cuda()

    def construct_optim(self, net, lr, weight_decay=None):
        if weight_decay is None:
            weight_decay = 0
        params = mutils.add_weight_decay([net], weight_decay=weight_decay)
        optimizer = optim.Adam(params, lr=lr, weight_decay=weight_decay)
        return optimizer

    def soft_update(self, target, source, tau):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def hard_update(self, target, source):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
Пример #7
0
            episodes += 1
            state = env.reset()
            state = running_state(state)
            score = 0
            for _ in range(10000):
                if episodes % 50 == 0:
                    env.render()

                steps += 1
                mu, std, _ = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                next_state = running_state(next_state)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, reward, mask])

                score += reward
                state = next_state

                if done:
                    break
            scores.append(score)
        score_avg = np.mean(scores)
        print('{} episode score is {:.2f}'.format(episodes, score_avg))
        actor.train(), critic.train()
        train_model(actor, critic, memory, actor_optim, critic_optim)
Пример #8
0
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.writer = writer
        self.select_time = 0
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'init_method':args.init_method
        }

        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor) # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()

    def update_policy(self, train_actor = True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = nn.MSELoss()(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()

        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad))

            if self.writer != None:
                mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()]))
                #print(mean_policy_grad)
                self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time)

        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        self.eval()
        # print(s_t.shape)
        action = to_numpy(
            self.actor(to_tensor(np.array([s_t])))
        ).squeeze(0)
            
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)
        
        action = action * (1 - noise_level) + (self.random_process.sample() * noise_level)
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):        
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor{}.pkl'.format(output, num)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
Пример #9
0
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args):
        self.num_actor = 3

        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        if self.pic:
            self.nb_status = args.pic_status
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'use_bn':args.bn
        }
        if args.pic:
            self.cnn = CNN(3, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actors = [Actor(self.nb_status, self.nb_actions) for _ in range(self.num_actor)]
        self.actor_targets = [Actor(self.nb_status, self.nb_actions) for _ in
                              range(self.num_actor)]
        self.actor_optims = [Adam(self.actors[i].parameters(), lr=args.prate) for i in range(self.num_actor)]

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        for i in range(self.num_actor):
            hard_update(self.actor_targets[i], self.actors[i])  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()

    def normalize(self, pic):
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self, train_actor = True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            print('label 1')
            print('size = ', state_batch.shape)
            state_batch = self.cnn(state_batch)
            print('label 2')
            next_state_batch = np.array([self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn(next_state_batch)
            next_q_values = self.critic_target([
                next_state_batch,
                self.actor_target(next_state_batch)
            ])
        else:
            index = np.random.randint(low=0, high=self.num_actor)
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_targets[index](to_tensor(next_state_batch, volatile=True)),
            ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic: self.cnn_optim.step()

        sum_policy_loss = 0
        for i in range(self.num_actor):
            self.actors[i].zero_grad()

            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actors[i](to_tensor(state_batch))
            ])

            policy_loss = policy_loss.mean()
            policy_loss.backward()
            if train_actor:
                self.actor_optims[i].step()
            sum_policy_loss += policy_loss

            # Target update
            soft_update(self.actor_targets[i], self.actors[i], self.tau)

        soft_update(self.critic_target, self.critic, self.tau)

        return -sum_policy_loss / self.num_actor, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        for i in range(self.num_actor):
            self.actors[i].cuda()
            self.actor_targets[i].cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        actions = []
        status = []
        tot_score = []
        for i in range(self.num_actor):
            action = to_numpy(self.actors[i](to_tensor(np.array([s_t]), volatile=True))).squeeze(0)
            noise_level = noise_level * max(self.epsilon, 0)
            action = action + self.random_process.sample() * noise_level
            status.append(s_t)
            actions.append(action)
            tot_score.append(0.)

        scores = self.critic([to_tensor(np.array(status), volatile=True), to_tensor(np.array(actions), volatile=True)])
        for j in range(self.num_actor):
            tot_score[j] += scores.data[j][0]
        best = np.array(tot_score).argmax()

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = actions[best]
        return actions[best]

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=0):        
        if output is None: return
        for i in range(self.num_actor):
            actor = self.actors[i]
            actor_target = self.actor_targets[i]
            actor.load_state_dict(
                torch.load('{}/actor{}_{}.pkl'.format(output, num, i))
            )
            actor_target.load_state_dict(
                torch.load('{}/actor{}_{}.pkl'.format(output, num, i))
            )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        if self.use_cuda:
            for i in range(self.num_actor):
                self.actors[i].cpu()
            self.critic.cpu()
        for i in range(self.num_actor):
            torch.save(
                self.actors[i].state_dict(),
                '{}/actor{}_{}.pkl'.format(output, num, i)
            )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            for i in range(self.num_actor):
                self.actors[i].cuda()
            self.critic.cuda()
Пример #10
0
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        actor_net_cfg = {
            'hidden1': 32,
            'hidden2': 32,
            'hidden3': 32,
            'init_w': args.init_w
        }

        critic_net_cfg = {
            'hidden1': 64,
            'hidden2': 64,
            'hidden3': 64,
            'init_w': args.init_w
        }

        self.actor = Actor(self.nb_states, self.nb_actions, **actor_net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions,
                                  **actor_net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **critic_net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions,
                                    **critic_net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True
        self.best_reward = -10

    def update_policy(self, shared_model, args):
        # Sample batch
        state_batch, action_batch, reward_batch, \
        next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size, shared=args.use_more_states, num_states=args.num_states)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount*to_tensor(terminal_batch.astype(np.float))*next_q_values

        # Critic update
        self.critic_optim.zero_grad()

        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        if args.shared:
            ensure_shared_grads(self.critic, shared_model.critic)

        self.critic_optim.step()

        # Actor update
        self.actor_optim.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch),
             self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        if args.shared:
            ensure_shared_grads(self.actor, shared_model.actor)
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def share_memory(self):
        self.critic.share_memory()
        self.actor.share_memory()

    def add_optim(self, actor_optim, critic_optim):
        self.actor_optim = actor_optim
        self.critic_optim = critic_optim

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def update_models(self, agent):
        self.actor = deepcopy(agent.actor)
        self.actor_target = deepcopy(agent.actor_target)
        self.critic = deepcopy(agent.critic)
        self.critic_target = deepcopy(agent.critic_target)
        self.actor_optim = deepcopy(agent.actor_optim)
        self.critic_optim = deepcopy(agent.critic_optim)

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def train(self):
        self.critic.train()
        self.actor.train()

    def state_dict(self):
        return [
            self.actor.state_dict(),
            self.actor_target.state_dict(),
            self.critic.state_dict(),
            self.critic_target.state_dict()
        ]

    def load_state_dict(self, list_of_dicts):
        self.actor.load_state_dict(list_of_dicts[0])
        self.actor_target.load_state_dict(list_of_dicts[1])
        self.critic.load_state_dict(list_of_dicts[2])
        self.critic_target.load_state_dict(list_of_dicts[3])

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon,
                                         0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None: return

        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))

        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = Memory(capacity=BUFFER_SIZE, e=E, a=A, beta=BETA, beta_increment_per_sampling=BETA_INCREMENT_PER_SAMPLING,
                            batch_size=BATCH_SIZE)
        
        return None
    
    def append_sample(self, state, action, reward, next_state, done):
        """Calculate Error and append the tuple to
        Memory
        """
        
        state = torch.from_numpy(state).float().to(device)
        action = torch.from_numpy(action).float().to(device)
        next_state = torch.from_numpy(next_state).float().to(device)
        
        self.actor_target.eval()
        with torch.no_grad():
            actions_next = self.actor_target(next_state)
        self.actor_target.train()
        
        self.critic_target.eval()
        with torch.no_grad():
            Q_targets_next = self.critic_target(next_state, actions_next)
        self.critic_target.train()
        
        # Compute Q targets for current states (y_i)
        Q_targets = reward + (GAMMA * Q_targets_next * (1 - done))
        
        # Compute critic loss
        self.critic_local.eval()
        with torch.no_grad():
            Q_expected = self.critic_local(state, action)
        self.critic_local.train()
            
        critic_loss = torch.abs(Q_expected - Q_targets).detach().numpy()
        
        # clip loss
        
        critic_loss = min(1.0, critic_loss)

        # add to Memory
        self.memory.add(critic_loss, (state, action, reward, next_state, done))
        
        return None  
        
    
    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        #self.memory.add(state, action, reward, next_state, done)
        self.append_sample(state, action, reward, next_state, done)
        
        self.update()
        
        return None
    
    def update(self):
        """Train Agent
        """
        
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        
        states, actions, rewards, next_states, dones, idxs, is_weights = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        #critic_loss = F.mse_loss(Q_expected, Q_targets)
        critic_loss = weighted_mse_loss(Q_expected, Q_targets, is_weights)
        
        # update priority in memory
        errors = torch.abs(Q_expected - Q_targets).detach().numpy()
        
        for i in range(BATCH_SIZE):
            idx = idxs[i]
            
            # clip errors
            
            self.memory.update(idx, min(1.0, errors[i]))
        # end of updating priority in memory
        
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)              

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
Пример #12
0
class DDPG(object):
    def __init__(self,
                 env,
                 mem_size=7 * int(1e3),
                 lr_critic=1e-3,
                 lr_actor=1e-4,
                 epsilon=1.,
                 max_epi=1500,
                 epsilon_decay=1. / (1e5),
                 gamma=.99,
                 target_update_frequency=200,
                 batch_size=64,
                 random_process=True,
                 max_step=None):
        self.CUDA = torch.cuda.is_available()

        self.orig_env = env  #for recording
        if max_step is not None:
            self.orig_env._max_episode_steps = max_step
        self.env = self.orig_env
        self.N_S = self.env.observation_space.shape[0]
        self.N_A = self.env.action_space.shape[0]
        self.MAX_EPI = max_epi
        self.LOW = self.env.action_space.low
        self.HIGH = self.env.action_space.high

        self.actor = Actor(self.N_S, self.N_A)
        self.critic = Critic(self.N_S, self.N_A)
        self.target_actor = Actor(self.N_S, self.N_A)
        self.target_critic = Critic(self.N_S, self.N_A)
        self.target_actor.eval()
        self.target_critic.eval()
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())
        if self.CUDA:
            self.actor.cuda()
            self.critic.cuda()
            self.target_actor.cuda()
            self.target_critic.cuda()

        self.exp = Experience(mem_size)
        self.optim_critic = optim.Adam(self.critic.parameters(), lr=lr_critic)
        self.optim_actor = optim.Adam(self.actor.parameters(), lr=-lr_actor)
        self.random_process = OrnsteinUhlenbeckProcess(\
                size=self.N_A, theta=.15, mu=0, sigma=.2)
        self.EPSILON = epsilon
        self.EPSILON_DECAY = epsilon_decay
        self.GAMMA = gamma
        self.TARGET_UPDATE_FREQUENCY = target_update_frequency
        self.BATCH_SIZE = batch_size

        title = {common.S_EPI: [], common.S_TOTAL_R: []}
        self.data = pd.DataFrame(title)
        self.RAND_PROC = random_process

    def train(self, dir=None, interval=1000):
        if dir is not None:
            self.env = wrappers.Monitor(self.orig_env,
                                        '{}/train_record'.format(dir),
                                        force=True)
            os.mkdir(os.path.join(dir, 'models'))
        update_counter = 0
        epsilon = self.EPSILON
        for epi in trange(self.MAX_EPI, desc='train epi', leave=True):
            self.random_process.reset_states()
            o = self.env.reset()

            counter = 0
            acc_r = 0
            while True:
                counter += 1

                #if dir is not None:
                #    self.env.render()

                a = self.choose_action(o)

                if self.RAND_PROC:
                    a += max(epsilon, 0) * self.random_process.sample()
                    a = np.clip(a, -1., 1.)
                    epsilon -= self.EPSILON_DECAY

                o_, r, done, info = self.env.step(self.map_to_action(a))
                self.exp.push(o, a, r, o_, done)

                if epi > 0:
                    self.update_actor_critic()
                    update_counter += 1
                    if update_counter % self.TARGET_UPDATE_FREQUENCY == 0:
                        self.update_target()

                acc_r += r
                o = o_
                if done:
                    break
            if dir is not None:
                if (epi + 1) % interval == 0:
                    self.save(os.path.join(dir, 'models'),
                              str(epi + 1),
                              save_data=False)
            s = pd.Series([epi, acc_r], index=[common.S_EPI, common.S_TOTAL_R])
            self.data = self.data.append(s, ignore_index=True)

    def choose_action(self, state):
        self.actor.eval()
        s = Variable(torch.Tensor(state)).unsqueeze(0)
        if self.CUDA:
            s = s.cuda()
        a = self.actor(s).data.cpu().numpy()[0].astype('float64')
        self.actor.train()
        return a

    def map_to_action(self, a):
        return (self.LOW + self.HIGH) / 2 + a * (self.HIGH - self.LOW) / 2

    def update_target(self):
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

    def update_actor_critic(self):
        # sample minibatch
        minibatch = common.Transition(*zip(*self.exp.sample(self.BATCH_SIZE)))
        bat_o = Variable(torch.Tensor(minibatch.state))
        bat_a = Variable(torch.Tensor(minibatch.action))
        bat_r = Variable(torch.Tensor(minibatch.reward)).unsqueeze(1)
        bat_o_ = Variable(torch.Tensor(minibatch.next_state))
        bat_not_done_mask = list(
            map(lambda done: 0 if done else 1, minibatch.done))
        bat_not_done_mask = Variable(
            torch.ByteTensor(bat_not_done_mask)).unsqueeze(1)
        if self.CUDA:
            bat_o = bat_o.cuda()
            bat_a = bat_a.cuda()
            bat_r = bat_r.cuda()
            bat_o_ = bat_o_.cuda()
            bat_not_done_mask = bat_not_done_mask.cuda()

        # update critic
        bat_a_o_ = self.target_actor(bat_o_)

        Gt = bat_r
        Gt[bat_not_done_mask] += self.GAMMA * self.target_critic(
            bat_o_, bat_a_o_)[bat_not_done_mask]
        Gt.detach_()
        eval_o = self.critic(bat_o, bat_a)
        criterion = nn.MSELoss()
        if self.CUDA:
            criterion.cuda()
        loss = criterion(eval_o, Gt)
        self.optim_critic.zero_grad()
        loss.backward()
        self.optim_critic.step()

        # update actor
        self.critic.eval()
        bat_a_o = self.actor(bat_o)
        obj = torch.mean(self.critic(bat_o, bat_a_o))
        self.optim_actor.zero_grad()
        obj.backward()
        self.optim_actor.step()
        self.critic.train()

    def test(self, dir=None, n=1):
        if dir is not None:
            self.env = wrappers.Monitor(self.orig_env,
                                        '{}/test_record'.format(dir),
                                        force=True,
                                        video_callable=lambda episode_id: True)

        title = {common.S_EPI: [], common.S_TOTAL_R: []}
        df = pd.DataFrame(title)

        for epi in trange(n, desc='test epi', leave=True):
            o = self.env.reset()
            acc_r = 0
            while True:
                #if dir is not None:
                #    self.env.render()
                a = self.choose_action(o)
                o_, r, done, info = self.env.step(self.map_to_action(a))
                acc_r += r
                o = o_
                if done:
                    break
            s = pd.Series([epi, acc_r], index=[common.S_EPI, common.S_TOTAL_R])
            df = df.append(s, ignore_index=True)
        if dir is not None:
            df.to_csv('{}/test_data.csv'.format(dir))
        else:
            print df

    def save(self, dir, suffix='', save_data=True):
        torch.save(self.actor.state_dict(),
                   '{}/actor{}.pt'.format(dir, suffix))
        torch.save(self.critic.state_dict(),
                   '{}/critic{}.pt'.format(dir, suffix))
        if save_data:
            self.data.to_csv('{}/train_data{}.csv'.format(dir, suffix))

    def load_actor(self, dir):
        self.actor.load_state_dict(torch.load(dir))

    def load_critic(self, dir):
        self.critic.load_state_dict(torch.load(dir))

    def get_data(self):
        return self.data
Пример #13
0
class Policy:
    def __init__(self,
                 agent_index,
                 state_size,
                 action_size,
                 hidden_dims,
                 device,
                 random_seed=7,
                 buffer_size=1000000,
                 batch_size=100,
                 actor_learning_rate=1e-3,
                 gamma=0.99,
                 tau=1e-3,
                 critic_learning_rate=1e-4):
        super(Policy, self).__init__()
        self.agent_index = agent_index
        self.tau = tau
        self.gamma = gamma
        self.seed = random_seed
        self.device = device
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.action_size = action_size
        self.single_agent_state_size = state_size // 2
        self.single_agent_action_size = action_size // 2
        # actor networks - work as single agents
        self.actor = Actor(state_size=self.single_agent_state_size,
                           action_size=self.single_agent_action_size,
                           seed=self.seed,
                           hidden_dims=hidden_dims).to(device)
        self.target_actor = Actor(state_size=self.single_agent_state_size,
                                  action_size=self.single_agent_action_size,
                                  seed=self.seed,
                                  hidden_dims=hidden_dims).to(device)
        # set actor and target_actor with same weights & biases
        for local_param, target_param in zip(self.actor.parameters(),
                                             self.target_actor.parameters()):
            target_param.data.copy_(local_param.data)
        # critic networks - combine both agents
        self.critic = Critic(state_size=state_size,
                             action_size=action_size,
                             seed=self.seed,
                             hidden_dims=hidden_dims).to(device)
        self.target_critic = Critic(state_size=state_size,
                                    action_size=action_size,
                                    seed=self.seed,
                                    hidden_dims=hidden_dims).to(device)
        # set critic_local and critic_target with same weights & biases
        for local_param, target_param in zip(self.critic.parameters(),
                                             self.target_critic.parameters()):
            target_param.data.copy_(local_param.data)
        # optimizers
        self.actor_optimizer = Adam(self.actor.parameters(),
                                    lr=actor_learning_rate)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=critic_learning_rate,
                                     weight_decay=0)

        # Replay memory
        self.memory = ReplayBuffer(action_size=action_size,
                                   buffer_size=self.buffer_size,
                                   batch_size=self.batch_size,
                                   seed=self.seed,
                                   device=self.device)
        self.t_update = 0

    def get_weights(self):
        """get the weights for the actor and critic models"""
        return self.actor.state_dict(), self.target_actor.state_dict(), \
               self.critic.state_dict(), self.target_critic.state_dict()

    def load_weights(self, values):
        """load the weights for the actor and critic models"""
        w1, w2, w3, w4 = values
        self.actor.load_state_dict(w1)
        self.target_actor.load_state_dict(w2)
        self.critic.load_state_dict(w3)
        self.target_critic.load_state_dict(w4)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        if self.num_agents > 1:
            for agent in range(self.num_agents):
                self.memory.add(states[agent, :], actions[agent, :],
                                rewards[agent], next_states[agent, :],
                                dones[agent])
        else:
            self.memory.add(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, use_target=False, add_noise=True, noise_value=None):
        """Returns actions for given state as per current policy.

        Arguments:
            state (Tensor): input state
            use_target (bool): if True then use the target actor network, otherwise
                use the local one
            add_noise (bool): if True then add noise to the actions obtained
            noise_value (float): noise value to add (if adding noise)
        Returns:
            action (Tensor): action of shape (action_size)  # (2)
        """
        state = ensure_is_tensor(state, self.device)
        if use_target:
            actor_net = self.target_actor
        else:
            actor_net = self.actor
        actor_net.eval()
        with torch.no_grad():
            action = actor_net(state)
            if add_noise:
                action = action.cpu().data.numpy()
                action = np.clip(action + noise_value, -1, 1)
                action = ensure_is_tensor(action, self.device)
        actor_net.train()
        return action

    def learn(self, experiences, other_agent):
        """Update policy and value parameters using given batch of experience tuples.
         Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
         where:
             actor_target(state) -> action
             critic_target(state, action) -> Q-value

         Arguments:
             experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
             other_agent (Policy): the other agent
         """
        states, actions, rewards, next_states, dones = experiences
        self.t_update += 1
        if self.agent_index == 0:
            # states, actions, next_actions of the agent
            states_self = ensure_is_tensor(
                states[:, :self.single_agent_state_size], self.device)
            action_self = ensure_is_tensor(
                actions[:, :self.single_agent_action_size], self.device)
            next_states_self = ensure_is_tensor(
                next_states[:, :self.single_agent_state_size], self.device)
            # states, actions, next_actions of the other agent
            states_other = ensure_is_tensor(
                states[:, self.single_agent_state_size:], self.device)
            action_other = ensure_is_tensor(
                actions[:, self.single_agent_action_size:], self.device)
            next_states_other = ensure_is_tensor(
                next_states[:, self.single_agent_state_size:], self.device)
            # rewards and dones
            rewards = ensure_is_tensor(rewards[:, 0].reshape((-1, 1)),
                                       self.device)
            dones = ensure_is_tensor(dones[:, 0].reshape((-1, 1)), self.device)
        elif self.agent_index == 1:
            # states, actions, next_actions of the agent
            states_self = ensure_is_tensor(
                states[:, self.single_agent_state_size:], self.device)
            action_self = ensure_is_tensor(
                actions[:, self.single_agent_action_size:], self.device)
            next_states_self = ensure_is_tensor(
                next_states[:, self.single_agent_state_size:], self.device)
            # states, actions, next_actions of the other agent
            states_other = ensure_is_tensor(
                states[:, :self.single_agent_state_size], self.device)
            action_other = ensure_is_tensor(
                actions[:, :self.single_agent_action_size], self.device)
            next_states_other = ensure_is_tensor(
                next_states[:, :self.single_agent_state_size], self.device)
            # rewards and dones
            rewards = ensure_is_tensor(rewards[:, 1].reshape((-1, 1)),
                                       self.device)
            dones = ensure_is_tensor(dones[:, 1].reshape((-1, 1)), self.device)
        # s, a, s' for both agents
        states = ensure_is_tensor(states, self.device)
        actions = ensure_is_tensor(actions, self.device)
        next_states = ensure_is_tensor(next_states, self.device)
        # ---------------------------- update critic ---------------------------- #
        next_actions_self = self.act(next_states_self,
                                     use_target=True,
                                     add_noise=False)
        next_actions_other = other_agent.act(next_states_other,
                                             use_target=True,
                                             add_noise=False)
        # combine the next actions from both agents
        if self.agent_index == 0:
            actions_next = torch.cat([next_actions_self, next_actions_other],
                                     dim=1).float().detach().to(self.device)
        elif self.agent_index == 1:
            actions_next = torch.cat([next_actions_other, next_actions_self],
                                     dim=1).float().detach().to(self.device)
        # Get predicted next-state actions and Q values from target models
        self.target_critic.eval()
        with torch.no_grad():
            Q_targets_next = self.target_critic(
                next_states, actions_next).detach().to(self.device)
        self.target_critic.train()
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        # get the current action-value for the states and actions
        Q_expected = self.critic(states, actions)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        # Compute critic loss
        critic_loss = F.smooth_l1_loss(Q_expected, Q_targets.detach())
        # back propagate through the network
        critic_loss.backward()
        self.critic_optimizer.step()
        # ---------------------------- update actor ---------------------------- #
        if self.agent_index == 0:
            actions_pred = torch.cat([
                self.actor(states_self),
                other_agent.act(
                    states_other, use_target=False, add_noise=False)
            ],
                                     dim=1)
        elif self.agent_index == 1:
            actions_pred = torch.cat([
                other_agent.act(
                    states_other, use_target=False, add_noise=False),
                self.actor(states_self)
            ],
                                     dim=1)
        # Compute actor loss and minimize it
        self.actor_optimizer.zero_grad()
        actor_loss = -self.critic(states, actions_pred).mean()
        actor_loss.backward()
        self.actor_optimizer.step()
        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic, self.target_critic, self.tau)
        self.soft_update(self.actor, self.target_actor, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Arguments:
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Пример #14
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)
        print(1)
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done, time_step,
             agent_list):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        if time_step % 2:
            for idx in range(state.shape[0]):
                agent_list[idx].memory.add(state[idx], action[idx],
                                           reward[idx], next_state[idx],
                                           done[idx])

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            if time_step % 2 == 0:
                for agent in agent_list:
                    experiences = agent.memory.sample()
                    agent.learn(experiences, GAMMA)

                    #import ipdb; ipdb.set_trace()
                    #experiences = self.memory.sample()
                    #self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        actions = []
        for idx in range(state.shape[0]):
            state = state.float().to(device)
            self.actor_local.eval()
            with torch.no_grad():
                action = self.actor_local(
                    state[idx, ...].unsqueeze(0)).cpu().data.numpy()
            if add_noise:
                action += self.noise.sample()
            actions.append(np.clip(action, -1, 1))
        return np.asarray(actions)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        self.critic_local.train()
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            # Compute Q targets for current states (y_i)
            #rewards = rewards - rewards.mean()
            #rewards = rewards / rewards.std()
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()
        self.critic_local.eval()
        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        self.actor_local.train()
        actions_pred = self.actor_local(states)
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)

        actor_loss = -self.critic_local(states, actions_pred).mean()
        # include q value normalization for actor to learn faster
        #actor_actions = -self.critic_local(states, actions_pred)
        #actor_no_mean = actor_actions - actor_actions.mean()
        #actor_loss = (actor_no_mean/actor_no_mean.std()).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.actor_optimizer.step()
        self.actor_local.eval()
        # ----------------------- update target networks ----------------------- #

        with torch.no_grad():
            self.soft_update(self.critic_local, self.critic_target, TAU)
            self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Пример #15
0
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate)

    writer = SummaryWriter(comment="-ppo_iter-" + str(args.max_iter_num))
    
    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0    

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size: 
            state = env.reset()
            score = 0

            state = running_state(state)
            
            for _ in range(10000): 
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train()
        train_model(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'z_filter_n':running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
Пример #16
0
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    print('state size:', state_size)
    print('action size:', action_size)

    actor = Actor(state_size, action_size, args)
    critic = Critic(state_size, action_size, args)
    target_critic = Critic(state_size, action_size, args)

    actor_optimizer = optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic_optimizer = optim.Adam(critic.parameters(), lr=args.critic_lr)

    hard_target_update(critic, target_critic)

    # initialize automatic entropy tuning
    target_entropy = -torch.prod(torch.Tensor(action_size)).item()
    log_alpha = torch.zeros(1, requires_grad=True)
    alpha = torch.exp(log_alpha)
    alpha_optimizer = optim.Adam([log_alpha], lr=args.alpha_lr)

    writer = SummaryWriter(args.logdir)

    replay_buffer = deque(maxlen=10000)
    recent_rewards = deque(maxlen=100)
    steps = 0

    for episode in range(args.max_iter_num):
        done = False
        score = 0

        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            if args.render:
                env.render()

            steps += 1

            mu, std = actor(torch.Tensor(state))
            action = get_action(mu, std)

            next_state, reward, done, _ = env.step(action)

            next_state = np.reshape(next_state, [1, state_size])
            mask = 0 if done else 1

            replay_buffer.append((state, action, reward, next_state, mask))

            state = next_state
            score += reward

            if steps > args.batch_size:
                mini_batch = random.sample(replay_buffer, args.batch_size)

                actor.train(), critic.train(), target_critic.train()
                alpha = train_model(actor, critic, target_critic, mini_batch,
                                    actor_optimizer, critic_optimizer,
                                    alpha_optimizer, target_entropy, log_alpha,
                                    alpha)

                soft_target_update(critic, target_critic, args.tau)

            if done:
                recent_rewards.append(score)

        if episode % args.log_interval == 0:
            print('{} episode | score_avg: {:.2f}'.format(
                episode, np.mean(recent_rewards)))
            writer.add_scalar('log/score', float(score), episode)

        if np.mean(recent_rewards) > args.goal_score:
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)

            ckpt_path = args.save_path + 'model.pth.tar'
            torch.save(actor.state_dict(), ckpt_path)
            print('Recent rewards exceed -300. So end')
            break
Пример #17
0
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate) 
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)
    
    # load demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)
    
    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size: 
            state = env.reset()
            score = 0

            state = running_state(state)
            
            for _ in range(10000): 
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                train_discrim_flag = False
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'z_filter_n':running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
Пример #18
0
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0        
        if self.pic:
            self.nb_status = args.pic_status
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'use_bn':args.bn,
            'init_method':args.init_method
        }
        if args.pic:
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor) # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()

    def normalize(self, pic):
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            state_batch = self.cnn(state_batch)
            next_state_batch = np.array([self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn_target(next_state_batch)
            next_q_values = self.critic_target([
                next_state_batch,
                self.actor_target(next_state_batch)
            ])
        else:
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic: self.cnn_optim.step()

        self.actor.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            policy_loss = -self.critic([
                state_batch,
                self.actor(state_batch)
            ])
        else:
            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actor(to_tensor(state_batch))
            ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad))

            if self.writer != None:
                mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()]))
                #print(mean_policy_grad)
                self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time)

        self.actor_optim.step()
        if self.pic: self.cnn_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        if self.pic:
            soft_update(self.cnn_target, self.cnn, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()
        if(self.pic):
            self.cnn.eval()
            self.cnn_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()
        if(self.pic):
            self.cnn.train()
            self.cnn_target.train()

    def cuda(self):
        self.cnn.cuda()
        self.cnn_target.cuda()
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self, fix=False):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        if self.discrete and fix == False:
            action = action.argmax()
#        if self.pic:
#            action = np.concatenate((softmax(action[:16]), softmax(action[16:])))
        return action
        
    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        self.eval()
        if self.pic:
            s_t = self.normalize(s_t)
            s_t = self.cnn(to_tensor(np.array([s_t])))
        if self.pic:
            action = to_numpy(
                self.actor_target(s_t)
            ).squeeze(0)
        else:
            action = to_numpy(
                self.actor(to_tensor(np.array([s_t])))
            ).squeeze(0)
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)

        if np.random.uniform(0, 1) < noise_level:
            action = self.random_action(fix=True) # episilon greedy            

        if decay_epsilon:
            self.epsilon -= self.depsilon
        self.a_t = action
        
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):        
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        if self.use_cuda:
            self.cnn.cpu()
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor{}.pkl'.format(output, num)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            self.cnn.cuda()
            self.actor.cuda()
            self.critic.cuda()
Пример #19
0
class Agent:
    def __init__(self, env, gamma, batch_size, buffer_size, lr_rate, tau):

        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_bound = env.action_space.high[0]

        self.gamma = gamma
        self.batch_size = batch_size
        self.buffer_size = buffer_size

        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound,
                           lr_rate[0], tau)
        self.critic = Critic(self.state_dim, self.action_dim, lr_rate[1], tau)

        self.buffer = ReplayBuffer(self.buffer_size)
        self.save_epi_reward = []

    def ou_noise(self, x, rho=0.15, mu=0., dt=1e-1, sigma=0.2, dim=1):
        rho = torch.FloatTensor([rho])
        mu = torch.FloatTensor([mu])
        dt = torch.FloatTensor([dt])
        return x + rho * (mu - x) * dt + torch.sqrt(dt) * torch.normal(
            0., sigma, size=(dim, ))

    def td_target(self, rewards, q_values, dones):
        y_k = torch.zeros(q_values.shape)

        for i in range(q_values.shape[0]):
            if dones[i]:
                y_k[i] = rewards[i]
            else:
                y_k[i] = rewards[i] + self.gamma * q_values[i]
        return y_k

    def train(self, max_episode_num, save_path, save_names):
        self.actor.update_target_network()
        self.critic.update_target_network()

        for episode in range(max_episode_num):
            time, episode_reward, done = 0, 0, False
            state = self.env.reset()
            state = torch.from_numpy(state).type(torch.FloatTensor)

            pre_noise = torch.zeros(self.action_dim)

            while not done:
                #env.render()
                action = self.actor.predict(state)[0]
                noise = self.ou_noise(pre_noise, dim=self.action_dim)

                action = np.array([action.item()])
                action = np.clip(action, -self.action_bound, self.action_bound)

                next_state, reward, done, _ = self.env.step(action)
                next_state = torch.from_numpy(next_state).type(
                    torch.FloatTensor)
                action = torch.from_numpy(action).type(torch.FloatTensor)
                reward = torch.FloatTensor([reward])
                train_reward = torch.FloatTensor([(reward + 8) / 8])

                state = state.view(1, self.state_dim)
                next_state = next_state.view(1, self.state_dim)
                action = action.view(1, self.action_dim)
                reward = reward.view(1, 1)
                train_reward = reward.view(1, 1)

                self.buffer.add_buffer(state, action, train_reward, next_state,
                                       done)
                if self.buffer.buffer_size > 1000:
                    states, actions, rewards, next_states, dones = self.buffer.sample_batch(
                        self.batch_size)

                    actions_ = self.actor.target_predict(next_states)
                    actions_ = actions_.view(next_states.shape[0],
                                             self.action_dim)
                    target_qs = self.critic.target_predict(
                        next_states, actions_)
                    y_i = self.td_target(rewards, target_qs, dones)
                    self.critic.train(states, actions, y_i)

                    s_actions = self.actor.predict(states)
                    policy_loss = self.critic.predict(states, s_actions)
                    self.actor.train(policy_loss)

                    self.actor.update_target_network()
                    self.critic.update_target_network()

                pre_noise = noise
                state = next_state[0]
                episode_reward += reward[0]
                time += 1

            self.save_epi_reward.append(episode_reward.item())

            if len(self.save_epi_reward) < 20:
                print('Episode:', episode + 1, 'Time:',
                      time, 'Reward(ave of recent20):',
                      np.mean(self.save_epi_reward))
            else:
                print('Episode:', episode + 1, 'Time:', time,
                      'Reward(ave of recent20):',
                      np.mean(self.save_epi_reward[-20:]))

            if episode % 10 == 0:
                self.actor.save(save_path, save_names[0])
                self.critic.save(save_path, save_names[1])