Example #1
class DQN(object):
    def __init__(self):
        self.pred_net, self.target_net = ConvNet(), ConvNet()
        # sync eval and target networks
        self.update_target(self.target_net, self.pred_net, 1.0)
        # use gpu
        if USE_GPU:
            self.pred_net.cuda()
            self.target_net.cuda()
            
        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0
        # loss function
        self.loss_function = nn.MSELoss()
        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)
        
        # define optimizer
        self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR)
        
    def update_target(self, target, pred, update_rate):
        # update target network parameters using the prediction network
        for target_param, pred_param in zip(target.parameters(), pred.parameters()):
            target_param.data.copy_((1.0 - update_rate) \
                                    * target_param.data + update_rate*pred_param.data)
    
    def save_model(self):
        # save prediction network and target network
        self.pred_net.save(PRED_PATH)
        self.target_net.save(TARGET_PATH)

    def load_model(self):
        # load prediction network and target network
        self.pred_net.load(PRED_PATH)
        self.target_net.load(TARGET_PATH)

    def choose_action(self, x, EPSILON):
    	# x:state
        x = torch.FloatTensor(x)
        # print(x.shape)
        if USE_GPU:
            x = x.cuda()

        # epsilon-greedy policy
        if np.random.uniform() >= EPSILON:
            # greedy case
            action_value = self.pred_net(x)  # (N_ENVS, N_ACTIONS)
            action = torch.argmax(action_value, dim=1).data.cpu().numpy()
        else:
            # random exploration case
            action = np.random.randint(0, N_ACTIONS, (x.size(0)))
        return action

    def store_transition(self, s, a, r, s_, done):
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def learn(self):
        self.learn_step_counter += 1
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.update_target(self.target_net, self.pred_net, 1e-2)
    
        b_s, b_a, b_r, b_s_, b_d = self.replay_buffer.sample(BATCH_SIZE)
        # b_w, b_idxes = np.ones_like(b_r), None
            
        b_s = torch.FloatTensor(b_s)
        b_a = torch.LongTensor(b_a)
        b_r = torch.FloatTensor(b_r)
        b_s_ = torch.FloatTensor(b_s_)
        b_d = torch.FloatTensor(b_d)

        if USE_GPU:
            b_s, b_a, b_r, b_s_, b_d = b_s.cuda(), b_a.cuda(), b_r.cuda(), b_s_.cuda(), b_d.cuda()

        # action value for current state 
        q_eval = self.pred_net(b_s) 	
        mb_size = q_eval.size(0)
        q_eval = torch.stack([q_eval[i][b_a[i]] for i in range(mb_size)])

        # optimal action value for current state 
        q_next = self.target_net(b_s_) 				
        # best_actions = q_next.argmax(dim=1) 		
        # q_next = torch.stack([q_next[i][best_actions[i]] for i in range(mb_size)])
        q_next = torch.max(q_next, -1)[0]
        q_target = b_r + GAMMA * (1. - b_d) * q_next
        q_target = q_target.detach()

        # loss
        loss = self.loss_function(q_eval, q_target)
        logger.store(loss=loss)
        # backprop loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss
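
The class above only defines the agent; nothing in the snippet drives it. Below is a minimal training-loop sketch, assuming a single Gym-style `env` and the same globals the class relies on (BATCH_SIZE, N_ACTIONS, ...). The names `run_dqn` and `eps` are illustrative, not part of the original code.

import numpy as np

# Minimal driving loop for the DQN agent above (illustrative sketch).
# choose_action expects a batch of states, so the single state is wrapped
# in a length-1 batch; eps here is the probability of a random action,
# matching the convention used in choose_action.
def run_dqn(env, dqn, n_steps=100_000, eps=0.1):
    s = env.reset()
    for step in range(n_steps):
        a = dqn.choose_action(np.expand_dims(s, 0), eps)[0]
        s_, r, done, _ = env.step(a)
        dqn.store_transition(s, a, r, s_, done)
        # start learning once the replay buffer holds at least one batch
        if dqn.memory_counter > BATCH_SIZE:
            dqn.learn()
        s = env.reset() if done else s_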
Example #2
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 behavior_name,
                 index_player,
                 replay_memory_size=1e4,
                 batch_size=512,
                 gamma=0.99,
                 learning_rate=1e-4,
                 target_tau=1e-3,
                 update_rate=100,
                 seed=0):  # random seed (differentiates this agent from other agents)
        self.state_size = state_size
        self.current_state = []
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        self.seed = random.seed(seed)
        self.behavior_name = behavior_name
        self.index_player = index_player
        self.close_ball_reward = 0
        self.touch_ball_reward = 0
        """
        Now we define two models: 
        (a) a network that is updated every (step % update_rate == 0),
        (b) a target network, with weights updated to equal network (a) at a slower (target_tau) rate.
        """

        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.learn_rate)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def load_model(self, path_model, path_target=None):
        self.network.load_state_dict(torch.load(path_model))
        if path_target is not None:
            self.target_network.load_state_dict(torch.load(path_target))

    def model_step(self, state, action, reward, next_state):
        # save experience in replay memory
        self.memory.add(state, action, reward, next_state)

        # learn every UPDATE_EVERY time steps
        self.t_step = self.t_step + 1
        if self.t_step % self.update_rate == 0:

            # if enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma, self.t_step)

    def choose_action(self, state, eps=0.0):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.network.eval()
        with torch.no_grad():
            action_values = self.network(state)
        self.network.train()

        # epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()
                             )  # return a number from 0 to action_size
        else:
            return random.choice(np.arange(
                self.action_size))  # return a number from 0 to action_size

    def learn(self, experiences, gamma, stp):
        states, actions, rewards, next_states = experiences

        # Get Q values from current observations (s,a) using model network
        # get max Q values for (s', a') from target model
        self.network.train()
        Q_sa = self.network(states).gather(1, actions)
        #print(Q_sa)
        Q_sa_prime_target_values = self.target_network(next_states).max(
            1)[0].to(device).float().detach()
        #Q_sa_prime_targets = Q_sa_prime_target_values.max(1)[0].unsqueeze(1)
        #print(Q_sa_prime_target_values)

        # compute Q targets for current states
        #print(rewards)

        Q_sa_targets = rewards + gamma * Q_sa_prime_target_values.unsqueeze(1)
        #print(Q_sa_targets)
        #input('train')

        #Q_sa_targets = Q_sa_targets.unsqueeze(1)

        # Compute loss (error)
        criterion = torch.nn.MSELoss(reduction='sum')
        loss = criterion(
            Q_sa.to(device),
            Q_sa_targets.to(device))  #F.mse_loss(Q_sa, Q_sa_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network
        if stp % 100 == 0:
            print('Updating Model')
            self.soft_update(self.network, self.target_network, self.tau)

    def soft_update(self, local_model, target_model, tau):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def Read(self):
        decision_steps, terminal_steps = env.get_steps(self.behavior_name)
        try:
            signal_front = np.array(
                sensor_front_sig(
                    decision_steps.obs[0][self.index_player, :]))  # 3 x 11 x 8
            signal_back = np.array(
                sensor_back_sig(
                    decision_steps.obs[1][self.index_player, :]))  # 3 x 3 x 8
            #pre_state = []
            signal_front = np.array(signal_front)
            #print(signal_front.shape)
            #print(signal_back.shape)
            r = np.concatenate((signal_front, signal_back), axis=1)
            #print(r.shape)
            #input('ff')
            #pre_state.extend(list(np.array(signal_front).flatten()))
            #pre_state.extend(list(np.array(signal_back).flatten()))
            #state = np.array(pre_state)
            self.current_state = r
            count_close_to_ball = 0
            count_touch_ball = 0
            count_back_touch = 0
            count_back_close = 0
            self.rew_d_to_our_post = 0
            self.rew_for_ball_dist = -0.1
            # Front Observation
            for i in range(len(signal_front[0])):
                if signal_front[0][i][0] == 1.0:
                    count_close_to_ball += 1
                    self.rew_for_ball_dist = max(
                        0.3 * (1 - signal_front[0][i][7]),
                        self.rew_for_ball_dist)

                    # Kicked the ball at the front
                    if signal_front[0][i][7] <= 0.03:
                        count_touch_ball += 1

                if signal_front[0][i][1] == 1.0:
                    self.rew_d_to_our_post = -0.1
                if signal_front[0][i][2] == 1.0:
                    self.rew_d_to_our_post = 0.1

            # Back observation
            for i in range(len(signal_back[0])):
                if signal_back[0][i][0] == 1.0:
                    count_back_close += 0.2

                    # Touches the ball at the back
                    if signal_back[0][i][7] <= 0.03:
                        count_back_touch += 0.3

            self.back_touch = 1 if count_back_touch > 0 else 0.2
            self.back_close = 1 if count_back_close > 0 else 0.1

            # add reward if kick the ball
            self.touch_ball_reward = 1 if count_touch_ball > 0 else -0.15
            # Penalize for back touching the ball
            if count_back_touch > 0:
                self.touch_ball_reward = -0.25

            # Penalize if the ball is not in view
            self.close_ball_reward = 0.25 if count_close_to_ball > 0 else -0.05
            # Penalize if the ball is behind the agent
            if count_back_close > 0:
                self.close_ball_reward = -0.1

            return self.current_state
        except Exception:
            self.touch_ball_reward = 0
            self.close_ball_reward = 0

        return self.current_state

    def upd_after_goal(self, n_upds):
        self.memory.upd_goal(n_upds)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma, self.t_step)

    def we_goll(self):
        self.memory.we_goll()
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma, self.t_step)
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma, self.t_step)

    def us_goll(self):
        self.memory.us_goll()
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma, self.t_step)
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma, self.t_step)
Example #3
def train(sess, env, actor, critic, noise, reward, discrete, saver,
          checkpoint_path):
    # Set up summary writer
    summary_writer = tf.summary.FileWriter("ddpg_summary")

    actor.update()
    critic.update()

    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    # Initialize noise
    ou_level = 0.

    for i in range(MAX_EPISODES):

        if i % 100 == 0:
            saver.save(sess, checkpoint_path)

        # s is the first state provided by the environment.
        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        # initialize the episode buffer
        episode_buffer = np.empty((0, 5), float)

        for j in range(MAX_EP_STEPS):

            # print(critic.w1.eval()[0,0])

            env.render()

            # a is the action predicted from the actor's current policy, shape [None x action_dim]
            a = actor.predict(np.reshape(s, (1, actor.state_dim)))

            # For exploration in the stochastic environment, add noise to the action.
            # See ornstein_uhlenbeck_level below; Ornstein-Uhlenbeck noise is a
            # common choice for exploration in continuous action spaces.
            if i < NOISE_MAX_EP:
                ou_level = noise.ornstein_uhlenbeck_level(ou_level)
                a = a + ou_level

                # Set action for discrete and continuous action spaces
            if discrete:
                action = np.argmax(a)
            else:
                action = a[0]

            # Step the environment with the selected action and
            # receive the result.
            s2, r, terminal, info = env.step(action)

            # accumulate the total reward of the episode
            ep_reward += r

            # ==========================================================================[important part]==============
            # Append this transition to the episode buffer:
            # the array [s, a, r, terminal, s2] is appended to the episode_buffer ndarray.
            episode_buffer = np.append(episode_buffer,
                                       [[s, a, r, terminal, s2]],
                                       axis=0)
            # ===================================================================================================

            # If the replay buffer holds more than a minibatch of data,
            # sample from it and train.
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Get the action-value function Q(s', a') from the target critic.
                # Since the critic evaluates the current policy, the target actor's
                # prediction for s2 is used as the action.
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Train the critic's action-value function.
                # The required data are the state batch, action batch, and target batch.
                # As in the DQN paper (DeepMind et al.), the target is the final
                # reward itself if the transition is terminal; otherwise it is
                # the reward at s plus the discounted q_value at s2.
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                # Track the average max Q value to monitor the model;
                # the DQN paper (DeepMind et al.) mentions it as a good
                # indicator of training progress.
                ep_ave_max_q += np.amax(predicted_q_value)

                # Using the state batch sampled from the replay buffer, get the
                # actions of the actor's current policy, then feed them together
                # with the states into the critic to obtain the action gradients
                # of Q(s, a).
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)

                # grads has shape (1, BATCH_SIZE, ACTION_DIM), so grads[0] is taken
                # for actor.train, which expects (BATCH_SIZE, ACTION_DIM).
                # The critic's gradients are used to update the actor's policy.
                actor.train(s_batch, grads[0])

                # Update both the actor and critic target networks.
                actor.update()
                critic.update()

            # continue with s2 as the new s
            s = s2

            if terminal:

                episode_buffer = reward.discount(episode_buffer)

                # Add episode to replay buffer
                for step in episode_buffer:
                    replay_buffer.add(
                        np.reshape(step[0], (actor.state_dim, )),
                        np.reshape(step[1], (actor.action_dim, )), step[2],
                        step[3], np.reshape(step[4], (actor.state_dim, )))

                summary = tf.Summary()
                summary.value.add(tag='Perf/Reward',
                                  simple_value=float(ep_reward))
                summary.value.add(tag='Perf/Qmax',
                                  simple_value=float(ep_ave_max_q / float(j)))
                summary_writer.add_summary(summary, i)

                summary_writer.flush()

                print('| Reward: %.2i' % int(ep_reward), " | Episode", i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))

                break
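
The comments above build the critic target y_i = r_i for terminal transitions and y_i = r_i + GAMMA * Q'(s2_i, mu'(s2_i)) otherwise. The per-sample loop can be written as one vectorized NumPy expression; the sketch below does exactly that, where `target_q` stands in for critic.predict_target(s2_batch, actor.predict_target(s2_batch)) and the function name is illustrative.

import numpy as np

# Vectorized form of the y_i loop above: no bootstrapping through terminal states.
def ddpg_targets(r_batch, t_batch, target_q, gamma=0.99):
    r = np.asarray(r_batch, dtype=np.float32).reshape(-1)
    terminal = np.asarray(t_batch, dtype=np.float32).reshape(-1)
    q_next = np.asarray(target_q, dtype=np.float32).reshape(-1)
    # y = r                      if terminal
    # y = r + gamma * Q'(s2, a2) otherwise
    y = r + gamma * (1.0 - terminal) * q_next
    return y.reshape(-1, 1)  # shape expected by critic.train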
Example #4
File: main.py  Project: twni2016/Meta-SAC
            # Number of updates per step in environment
            for i in range(config['updates_per_step']):
                # Update parameters of all the networks
                agent.train(memory, config['batch_size'])
                updates += 1

        next_state, reward, done, _ = env.step(action)  # Step
        episode_steps += 1
        total_numsteps += 1
        episode_reward += reward

        # Ignore the "done" signal if it comes from hitting the time horizon.
        # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
        done_bool = float(
            done) if episode_steps < env._max_episode_steps else 0
        memory.add(state, action, next_state, reward,
                   done_bool)  # Append transition to memory

        state = next_state

    if total_numsteps > config['num_steps']:
        break

    # writer.add_scalar('reward/train', episode_reward, i_episode)
    print("Episode: {}, total numsteps: {}, episode steps: {}, reward: {}".
          format(i_episode, total_numsteps, episode_steps,
                 round(episode_reward, 2)))

    if total_numsteps > test_step and config['eval']:
        test(env)
        test_step += 10000
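
The done_bool line above applies the usual time-limit trick from the Spinning Up SAC reference: an episode cut off by the environment's step limit is not stored as a true terminal, so the Q target can still bootstrap from the next state. A one-line sketch of that rule (the helper name is hypothetical):

def terminal_flag(done, episode_steps, max_episode_steps):
    # 1.0 only for a real terminal; 0.0 when the step limit was hit,
    # so the target r + gamma * Q(s', a') still bootstraps from s'.
    return float(done) if episode_steps < max_episode_steps else 0.0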
Example #5
class DQN(object):
    def __init__(self):
        self.pred_net, self.target_net = ConvNet(), ConvNet()
        # sync eval and target networks
        self.update_target(self.target_net, self.pred_net, 1.0)
        # use gpu
        if USE_GPU:
            self.pred_net.cuda()
            self.target_net.cuda()

        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0

        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)

        # define optimizer
        self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR)

    # Update target network
    def update_target(self, target, pred, update_rate):
        # update target network parameters using the prediction network
        for target_param, pred_param in zip(target.parameters(),
                                            pred.parameters()):
            target_param.data.copy_((1.0 - update_rate) \
                                    * target_param.data + update_rate*pred_param.data)

    def save_model(self):
        # save prediction network and target network
        self.pred_net.save(PRED_PATH)
        self.target_net.save(TARGET_PATH)

    def load_model(self):
        # load prediction network and target network
        self.pred_net.load(PRED_PATH)
        self.target_net.load(TARGET_PATH)

    def choose_action(self, x, EPSILON):
        # x:state
        x = torch.FloatTensor(x)
        # print(x.shape)
        if USE_GPU:
            x = x.cuda()

        # epsilon-greedy
        if np.random.uniform() >= EPSILON:
            # greedy case
            action_value, tau = self.pred_net(
                x)  # (N_ENVS, N_ACTIONS, N_QUANT)
            action_value = action_value.mean(dim=2)
            action = torch.argmax(action_value, dim=1).data.cpu().numpy()
            # print(action)
        else:
            # random exploration case
            action = np.random.randint(0, N_ACTIONS, (x.size(0)))
        return action

    def store_transition(self, s, a, r, s_, done):
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def learn(self):
        self.learn_step_counter += 1
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.update_target(self.target_net, self.pred_net, 1e-2)

        b_s, b_a, b_r, b_s_, b_d = self.replay_buffer.sample(BATCH_SIZE)
        print(b_d)
        b_w, b_idxes = np.ones_like(b_r), None

        b_s = torch.FloatTensor(b_s)
        b_a = torch.LongTensor(b_a)
        b_r = torch.FloatTensor(b_r)
        b_s_ = torch.FloatTensor(b_s_)
        b_d = torch.FloatTensor(b_d)

        if USE_GPU:
            b_s, b_a, b_r, b_s_, b_d = b_s.cuda(), b_a.cuda(), b_r.cuda(
            ), b_s_.cuda(), b_d.cuda()

        # action value distribution prediction
        q_eval, q_eval_tau = self.pred_net(
            b_s)  # (m, N_ACTIONS, N_QUANT), (N_QUANT, 1)
        mb_size = q_eval.size(0)
        # squeeze removes the extra dimension of size 1
        # torch.stack stacks the per-sample rows along a new dim 0
        # index_select picks the quantile row of the chosen action
        q_eval = torch.stack([
            q_eval[i].index_select(0, b_a[i]) for i in range(mb_size)
        ]).squeeze(1)
        # (m, N_QUANT)
        # add a trailing dimension to q_eval
        q_eval = q_eval.unsqueeze(2)  # (m, N_QUANT, 1)
        # note that dim 1 is for present quantile, dim 2 is for next quantile

        # get next state value
        q_next, q_next_tau = self.target_net(
            b_s_)  # (m, N_ACTIONS, N_QUANT), (N_QUANT, 1)
        best_actions = q_next.mean(dim=2).argmax(dim=1)  # (m)
        q_next = torch.stack([
            q_next[i].index_select(0, best_actions[i]) for i in range(mb_size)
        ]).squeeze(1)
        # q_next: (m, N_QUANT)
        # q_target = R + gamma * (1 - terminate) * q_next
        q_target = b_r.unsqueeze(1) + GAMMA * (1. - b_d.unsqueeze(1)) * q_next
        # q_target: (m, N_QUANT)
        # detach so that no gradient flows into the target
        q_target = q_target.unsqueeze(1).detach()  # (m , 1, N_QUANT)

        # quantile Huber loss
        print('q_target', q_target.shape)
        print('q_eval', q_eval.shape)
        print('q_target_', q_target.detach().shape)
        u = q_target.detach() - q_eval  # (m, N_QUANT, N_QUANT)
        tau = q_eval_tau.unsqueeze(0)  # (1, N_QUANT, 1)
        # note that tau is for present quantile
        # w = |tau - delta(u<0)|
        weight = torch.abs(tau - u.le(0.).float())  # (m, N_QUANT, N_QUANT)
        loss = F.smooth_l1_loss(q_eval, q_target.detach(), reduction='none')
        # (m, N_QUANT, N_QUANT)
        loss = torch.mean(weight * loss, dim=1).mean(dim=1)

        # calculate importance weighted loss
        b_w = torch.Tensor(b_w)
        if USE_GPU:
            b_w = b_w.cuda()
        loss = torch.mean(b_w * loss)

        # backprop loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss
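
Example #5 above is the quantile-regression loss with weight |tau - 1{u <= 0}| applied to a Huber error. The sketch below packages that loss as a standalone function: `theta` are the predicted quantiles of the chosen actions (m, N_QUANT), `target` the already-detached Bellman-target quantiles (m, N_QUANT), and `tau` the quantile fractions (N_QUANT,). It mirrors the reduction used in the example; with the tensors above it would roughly correspond to quantile_huber_loss(q_eval.squeeze(2), q_target.squeeze(1), q_eval_tau.view(-1)).

import torch
import torch.nn.functional as F

def quantile_huber_loss(theta, target, tau):
    # pairwise TD errors: u[b, i, j] = target_j - theta_i (target assumed detached)
    u = target.unsqueeze(1) - theta.unsqueeze(2)                  # (m, N, N)
    huber = F.smooth_l1_loss(theta.unsqueeze(2).expand_as(u),
                             target.unsqueeze(1).expand_as(u),
                             reduction='none')
    # quantile weight |tau_i - 1{u <= 0}|, tau aligned with the prediction dim
    weight = torch.abs(tau.view(1, -1, 1) - u.le(0.).float())
    # same reduction as the example: mean over both quantile dims, then batch
    return (weight * huber).mean(dim=2).mean(dim=1).mean()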
Example #6
class DQN(object):
    def __init__(self):
        self.pred_net, self.target_net = ConvNet(), ConvNet()
        # sync eval target
        self.update_target(self.target_net, self.pred_net, 1.0)
        # use gpu
        if USE_GPU:
            self.pred_net.cuda()
            self.target_net.cuda()

        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0

        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)

        # define optimizer
        self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR)

        # discrete values
        self.value_range = torch.FloatTensor(V_RANGE)  # (N_ATOM)
        if USE_GPU:
            self.value_range = self.value_range.cuda()

    def update_target(self, target, pred, update_rate):
        # update target network parameters using the prediction network
        for target_param, pred_param in zip(target.parameters(),
                                            pred.parameters()):
            target_param.data.copy_((1.0 - update_rate) \
                                    * target_param.data + update_rate*pred_param.data)

    def save_model(self):
        # save prediction network and target network
        self.pred_net.save(PRED_PATH)
        self.target_net.save(TARGET_PATH)

    def load_model(self):
        # load prediction network and target network
        self.pred_net.load(PRED_PATH)
        self.target_net.load(TARGET_PATH)

    def choose_action(self, x, EPSILON):
        x = torch.FloatTensor(x)
        if USE_GPU:
            x = x.cuda()

        if np.random.uniform() >= EPSILON:
            # greedy case
            action_value_dist = self.pred_net(x)  # (N_ENVS, N_ACTIONS, N_ATOM)
            action_value = torch.sum(action_value_dist *
                                     self.value_range.view(1, 1, -1),
                                     dim=2)  # (N_ENVS, N_ACTIONS)
            action = torch.argmax(action_value, dim=1).data.cpu().numpy()
        else:
            # random exploration case
            action = np.random.randint(0, N_ACTIONS, (x.size(0)))
        return action

    def store_transition(self, s, a, r, s_, done):
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def learn(self):
        self.learn_step_counter += 1
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.update_target(self.target_net, self.pred_net, 1e-2)

        b_s, b_a, b_r, b_s_, b_d = self.replay_buffer.sample(BATCH_SIZE)
        b_w, b_idxes = np.ones_like(b_r), None

        b_s = torch.FloatTensor(b_s)
        b_a = torch.LongTensor(b_a)
        b_s_ = torch.FloatTensor(b_s_)

        if USE_GPU:
            b_s, b_a, b_s_ = b_s.cuda(), b_a.cuda(), b_s_.cuda()

        # action value distribution prediction
        q_eval = self.pred_net(b_s)  # (m, N_ACTIONS, N_ATOM)
        mb_size = q_eval.size(0)
        q_eval = torch.stack([
            q_eval[i].index_select(0, b_a[i]) for i in range(mb_size)
        ]).squeeze(1)
        # (m, N_ATOM)

        # target distribution
        q_target = np.zeros((mb_size, N_ATOM))  # (m, N_ATOM)

        # get next state value
        q_next = self.target_net(b_s_).detach()  # (m, N_ACTIONS, N_ATOM)
        # next value mean
        q_next_mean = torch.sum(q_next * self.value_range.view(1, 1, -1),
                                dim=2)  # (m, N_ACTIONS)
        best_actions = q_next_mean.argmax(dim=1)  # (m)
        q_next = torch.stack([
            q_next[i].index_select(0, best_actions[i]) for i in range(mb_size)
        ]).squeeze(1)
        q_next = q_next.data.cpu().numpy()  # (m, N_ATOM)

        # categorical projection
        '''
        next_v_range : (z_j) i.e. values of possible return, shape : (m, N_ATOM)
        next_v_pos : relative position when offset of value is V_MIN, shape : (m, N_ATOM)
        '''
        # we vectorized the computation of support and position
        next_v_range = np.expand_dims(b_r, 1) + GAMMA * np.expand_dims((1. - b_d),1) \
        * np.expand_dims(self.value_range.data.cpu().numpy(),0)
        next_v_pos = np.zeros_like(next_v_range)
        # clip for categorical distribution
        next_v_range = np.clip(next_v_range, V_MIN, V_MAX)
        # calc relative position of possible value
        next_v_pos = (next_v_range - V_MIN) / V_STEP
        # get lower/upper bound of relative position
        lb = np.floor(next_v_pos).astype(int)
        ub = np.ceil(next_v_pos).astype(int)
        # we didn't vectorize the computation of target assignment.
        for i in range(mb_size):
            for j in range(N_ATOM):
                # calc prob mass of relative position weighted with distance
                q_target[i, lb[i, j]] += (q_next * (ub - next_v_pos))[i, j]
                q_target[i, ub[i, j]] += (q_next * (next_v_pos - lb))[i, j]

        q_target = torch.FloatTensor(q_target)
        if USE_GPU:
            q_target = q_target.cuda()

        # cross-entropy loss; keep a per-sample value so importance weights can be applied
        loss = torch.sum(q_target * (-torch.log(q_eval + 1e-8)), dim=1)  # (m,)

        # calc importance weighted loss
        b_w = torch.Tensor(b_w)
        if USE_GPU:
            b_w = b_w.cuda()
        loss = torch.mean(b_w * loss)

        # backprop loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
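
The double loop above scatters each projected atom's probability mass onto its lower and upper neighbours on the support. A vectorized NumPy sketch of that categorical (C51) projection is given below; it takes the support and clipping bounds as arguments instead of the globals V_MIN/V_MAX/V_STEP, and it also assigns the full mass when a projected value lands exactly on an atom (a case the loop above leaves at zero). Calling it as categorical_projection(q_next, b_r, b_d, self.value_range.data.cpu().numpy(), GAMMA, V_MIN, V_MAX) should roughly reproduce q_target above; the name is illustrative.

import numpy as np

def categorical_projection(q_next, rewards, dones, support, gamma, v_min, v_max):
    # q_next:  (m, N_ATOM) probabilities of the best next actions
    # rewards: (m,), dones: (m,), support: (N_ATOM,) atom values z_j
    m, n_atom = q_next.shape
    rewards = np.asarray(rewards, dtype=np.float64).reshape(-1, 1)
    dones = np.asarray(dones, dtype=np.float64).reshape(-1, 1)
    support = np.asarray(support, dtype=np.float64)
    v_step = (v_max - v_min) / (n_atom - 1)
    tz = rewards + gamma * (1.0 - dones) * support[None, :]
    tz = np.clip(tz, v_min, v_max)
    pos = (tz - v_min) / v_step                      # fractional atom index
    lb, ub = np.floor(pos).astype(int), np.ceil(pos).astype(int)
    l_w, u_w = ub - pos, pos - lb                    # distance-based weights
    l_w[lb == ub] = 1.0                              # exact hit: all mass to that atom
    target = np.zeros((m, n_atom))
    for i in range(m):                               # scatter the probability mass
        np.add.at(target[i], lb[i], q_next[i] * l_w[i])
        np.add.at(target[i], ub[i], q_next[i] * u_w[i])
    return target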
Example #7
class Agent():
    """
    Initialize Agent, including:
        DQN Hyperparameters
        Local and Target State-Action Policy Networks
        Replay Memory Buffer from the Replay Buffer class (defined below)
    """
    def __init__(self,
                 state_size,
                 action_size,
                 dqn_type='DQN',
                 replay_memory_size=1e5,
                 batch_size=64,
                 gamma=0.99,
                 learning_rate=1e-3,
                 target_tau=2e-3,
                 update_rate=4,
                 seed=0):
        """
        DQN Agent Parameters
        ====== 
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            dqn_type (string): can be either 'DQN' for vanilla DQN learning (default) or 'DDQN' for Double DQN.
            replay_memory_size (int): size of the replay memory buffer (typically 5e4 to 5e6)
            batch_size (int): size of the memory batch used for model updates (typically 32, 64 or 128)
            gamma (float): parameter for setting the discounted value of future rewards (typically .95 to .995)
            learning_rate (float): specifies the rate of model learning (typically 1e-4 to 1e-3)
            seed (int): random seed for initializing training.
        """
        self.dqn_type = dqn_type
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        self.seed = random.seed(seed)
        """
        # DQN Agent Q-Network
        # For DQN training, two neural network models are employed;
        # (a) A network that is updated every (step % update_rate == 0)
        # (b) A target network, with weights updated to equal the network at a slower (target_tau) rate.
        # The slower modulation of the target network weights operates to stabilize learning.
        """
        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.learn_rate,
                                    betas=BETAS)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    ########################################################
    # STEP() method
    #
    def step(self, state, action, reward, next_state, done, update=True):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_rate
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                if update:
                    self.learn(experiences, self.gamma)

########################################################
# ACT() method
#

    def act(self, state, eps=0.0):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.network.eval()
        with torch.no_grad():
            action_values = self.network(state)
        self.network.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

########################################################
# LEARN() method
# Update value parameters using given batch of experience tuples.

    def learn(self, experiences, gamma, DQN=True):
        """
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # Get Q values from current observations (s, a) using the model network
        Qsa = self.network(states).gather(1, actions)

        if (self.dqn_type == 'DDQN'):
            #Double DQN
            #************************
            # select a' with the online network, evaluate it with the target network
            Qsa_prime_actions = self.network(next_states).detach().max(
                1)[1].unsqueeze(1)
            Qsa_prime_targets = self.target_network(
                next_states).detach().gather(1, Qsa_prime_actions)

        else:
            #Regular (Vanilla) DQN
            #************************
            # Get max Q values for (s',a') from target model
            Qsa_prime_target_values = self.target_network(next_states).detach()
            Qsa_prime_targets = Qsa_prime_target_values.max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Qsa_targets = rewards + (gamma * Qsa_prime_targets * (1 - dones))

        # Compute loss (error)
        loss = F.mse_loss(Qsa, Qsa_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.network, self.target_network, self.tau)

    ########################################################
    """
    Soft update model parameters.
    θ_target = τ*θ_local + (1 - τ)*θ_target
    """

    def soft_update(self, local_model, target_model, tau):
        """
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_the_model(self, iteration, f_name):
        if not os.path.exists('./save/dqn/'):
            os.makedirs('./save/dqn/')
        f_name = 'dqn_param_' + str(iteration) + '_' + f_name + '_model.pth'
        torch.save(self.network.state_dict(), './save/dqn/' + f_name)
        print('DQN Model Saved')

    def load_the_model(self, iteration, f_name):
        f_path = './save/dqn/dqn_param_' + str(
            iteration) + '_' + f_name + '_model.pth'
        self.network.load_state_dict(torch.load(f_path))
        print('DQN Model Loaded')
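
In the 'DDQN' branch of learn() above, the online network selects a' while the target network evaluates it. The sketch below isolates that target computation; it assumes rewards and dones are column tensors of shape (m, 1), and the function name is illustrative.

import torch

@torch.no_grad()
def double_dqn_targets(online_net, target_net, rewards, next_states, dones, gamma=0.99):
    # pick a' = argmax_a Q_online(s', a), then evaluate it with the target network
    best_actions = online_net(next_states).argmax(dim=1, keepdim=True)   # (m, 1)
    q_next = target_net(next_states).gather(1, best_actions)             # (m, 1)
    return rewards + gamma * q_next * (1.0 - dones)                      # (m, 1)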
Example #8
class Smoothing_DQN(object):
    def __init__(self):
        self.pred_net_Q1, self.target_net_Q1 = ConvNet(), ConvNet()
        self.pred_net_Q2, self.target_net_Q2 = ConvNet(), ConvNet()
        # sync eval and target networks
        self.target_deque1 = deque(maxlen=n)
        self.target_deque2 = deque(maxlen=n)
        self.update_target(self.target_net_Q1, self.pred_net_Q1, 1.0)
        self.update_target(self.target_net_Q2, self.pred_net_Q2, 1.0)

        self.target_deque1.append(self.target_net_Q1)
        # use gpu
        if USE_GPU:
            self.pred_net_Q1.cuda()
            self.target_net_Q1.cuda()
            self.pred_net_Q2.cuda()
            self.target_net_Q2.cuda()
        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0
        # loss function
        self.loss_function = nn.MSELoss()
        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)

        # define optimizer
        self.optimizer = torch.optim.Adam(self.pred_net_Q1.parameters(), lr=LR)

        self.optimizer1 = torch.optim.Adam(self.pred_net_Q2.parameters(),
                                           lr=LR)

    def update_target(self, target, pred, update_rate):
        # update target network parameters using the prediction network
        for target_param, pred_param in zip(target.parameters(),
                                            pred.parameters()):
            target_param.data.copy_((1.0 - update_rate) \
                                    * target_param.data + update_rate*pred_param.data)

    def save_model(self):
        # save prediction network and target network
        self.pred_net_Q1.save(PRED_PATH)
        self.target_net_Q1.save(TARGET_PATH)
        self.pred_net_Q2.save(PRED_PATH1)
        self.target_net_Q2.save(TARGET_PATH)

    def load_model(self):
        # load prediction network and target network
        self.pred_net_Q1.load(PRED_PATH)
        self.target_net_Q1.load(TARGET_PATH)
        self.pred_net_Q2.load(PRED_PATH1)
        self.target_net_Q2.load(TARGET_PATH)

    def choose_action(self, x, EPSILON):
        # x:state
        x = torch.FloatTensor(x)
        # print(x.shape)
        if USE_GPU:
            x = x.cuda()

        # epsilon-greedy policy
        if np.random.uniform() >= EPSILON:
            # greedy case
            action_value = self.pred_net_Q1(x)
            action_value += self.pred_net_Q2(x)
            action = torch.argmax(action_value, dim=1).data.cpu().numpy()
        else:
            # random exploration case
            action = np.random.randint(0, N_ACTIONS, (x.size(0)))
        return action

    def store_transition(self, s, a, r, s_, done):
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def save_history(self):
        if self.memory_counter % dealy_interval == 0:
            self.target_deque1.append(self.pred_net_Q1)
        # stagger the second deque's snapshots by 100 steps
        if (self.memory_counter + 100) % dealy_interval == 0:
            self.target_deque2.append(self.pred_net_Q2)

    # def update_target(self):
    #     # weight=np.array([0.9,0.])
    #     if len(self.target_deque)<n:
    #         for target_param, pred_param in zip(self.target_net.parameters(), self.pred_net.parameters()):
    #           target_param.data.copy_((1.0 - 1e-2) \
    #                                   * target_param.data + 1e-2 * pred_param.data)
    #         return
    #     for i,net in enumerate(self.target_deque):
    #         for target_param, queue_net in zip(self.target_net.parameters(),net.parameters()):
    #           target_param.data.copy_( self.weight[i] * queue_net.data)
    def learn(self):
        self.learn_step_counter += 1
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.update_target(self.target_net_Q1, self.pred_net_Q1, 1e-2)
            self.update_target(self.target_net_Q2, self.pred_net_Q2, 1e-2)

        b_s, b_a, b_r, b_s_, b_d = self.replay_buffer.sample(BATCH_SIZE)
        # b_w, b_idxes = np.ones_like(b_r), None

        b_s = torch.FloatTensor(b_s)
        b_a = torch.LongTensor(b_a)
        b_r = torch.FloatTensor(b_r)
        b_s_ = torch.FloatTensor(b_s_)
        b_d = torch.FloatTensor(b_d)

        if USE_GPU:
            b_s, b_a, b_r, b_s_, b_d = b_s.cuda(), b_a.cuda(), b_r.cuda(
            ), b_s_.cuda(), b_d.cuda()

        # action value for current state
        q_eval1 = self.pred_net_Q1(b_s)
        mb_size = q_eval1.size(0)
        q_eval1 = torch.stack([q_eval1[i][b_a[i]] for i in range(mb_size)])

        q_eval2 = self.pred_net_Q2(b_s)
        mb_size = q_eval2.size(0)
        q_eval2 = torch.stack([q_eval2[i][b_a[i]] for i in range(mb_size)])
        # optimal action value for current state

        alpha = np.random.uniform(0, 1, len(self.target_deque1) + 1)
        alpha = alpha / alpha.sum()
        # print("alpha:",alpha,alpha.sum())
        q_next1 = self.target_net_Q1(b_s_)
        q_next1 = alpha[-1] * torch.max(q_next1, -1)[0]
        for i, target in enumerate(self.target_deque1):
            q_next_history = target(b_s_)
            q_next1 += alpha[i] * torch.max(q_next_history, -1)[0]

        alpha = np.random.uniform(0, 1, len(self.target_deque2) + 1)
        alpha = alpha / alpha.sum()
        # print("alpha:",alpha,alpha.sum())
        q_next2 = self.target_net_Q2(b_s_)
        q_next2 = alpha[-1] * torch.max(q_next2, -1)[0]
        for i, target in enumerate(self.target_deque2):
            q_next_history = target(b_s_)
            q_next2 += alpha[i] * torch.max(q_next_history, -1)[0]
        # print("q next:",q_next.shape)
        # best_actions = q_next.argmax(dim=1)
        # q_next = torch.stack([q_next[i][best_actions[i]] for i in range(mb_size)])

        # print("shape:",q_next.shape)

        q_target1 = b_r + GAMMA * (1. - b_d) * q_next1
        q_target1 = q_target1.detach()

        q_target2 = b_r + GAMMA * (1. - b_d) * q_next2
        q_target2 = q_target2.detach()

        # loss
        loss = self.loss_function(q_eval1, q_target2)
        logger.store(loss=loss)
        # backprop loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        loss = self.loss_function(q_eval2, q_target1)
        self.optimizer1.zero_grad()
        loss.backward()
        self.optimizer1.step()
        return loss
Example #9
class DQN(object):
    def __init__(self):
        if USE_CNN:
            if USE_GPU:
                self.eval_net, self.target_net = ConvNet().cuda(), ConvNet(
                ).cuda()
            else:
                self.eval_net, self.target_net = ConvNet(), ConvNet()
        else:
            if USE_GPU:
                self.eval_net, self.target_net = Net().cuda(), Net().cuda()
            else:
                self.eval_net, self.target_net = Net(), Net()

        self.learn_step_counter = 0  # for target updating
        self.memory_counter = 0

        # Create the replay buffer
        if MEMORY_MODE == 'PER':
            self.replay_buffer = PrioritizedReplayBuffer(MEMORY_CAPACITY,
                                                         alpha=PER_ALPHA)
        else:
            self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)

        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)

    def choose_action(self, x, EPSILON):
        if USE_GPU:
            x = Variable(torch.FloatTensor(x)).cuda()
        else:
            x = Variable(torch.FloatTensor(x))

        # input only one sample
        if np.random.uniform() < EPSILON:  # greedy
            actions_value = self.eval_net.forward(x.unsqueeze(0))
            if USE_GPU:
                action = torch.argmax(
                    actions_value).data.cpu().numpy()  # return the argmax
            else:
                action = torch.argmax(
                    actions_value).data.numpy()  # return the argmax;
        else:  # random
            action = np.random.randint(0, N_ACTIONS)
        return action

    def store_transition(self, s, a, r, s_, done):
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def learn(self, beta):
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
        if MEMORY_MODE == 'PER':
            experience = self.replay_buffer.sample(BATCH_SIZE, beta=beta)
            (b_state_memory, b_action_memory, b_reward_memory,
             b_next_state_memory, b_done, b_weights, b_idxes) = experience
        else:
            b_state_memory, b_action_memory, b_reward_memory, b_next_state_memory, b_done = self.replay_buffer.sample(
                BATCH_SIZE)
            b_weights, b_idxes = np.ones_like(b_reward_memory), None

        if USE_GPU:
            b_s = Variable(torch.FloatTensor(b_state_memory)).cuda()
            b_a = Variable(torch.LongTensor(b_action_memory)).cuda()
            b_r = Variable(torch.FloatTensor(b_reward_memory)).cuda()
            b_s_ = Variable(torch.FloatTensor(b_next_state_memory)).cuda()
            b_d = Variable(torch.FloatTensor(b_done)).cuda()
        else:
            b_s = Variable(torch.FloatTensor(b_state_memory))
            b_a = Variable(torch.LongTensor(b_action_memory))
            b_r = Variable(torch.FloatTensor(b_reward_memory))
            b_s_ = Variable(torch.FloatTensor(b_next_state_memory))
            b_d = Variable(torch.FloatTensor(b_done))

        # q_eval w.r.t the action in experience
        q_eval = self.eval_net(b_s).gather(1, b_a.unsqueeze(1)).view(
            -1)  # shape (batch, 1)

        if DOUBLE:
            _, best_actions = self.eval_net.forward(b_s_).detach().max(1)
            q_next = self.target_net(
                b_s_).detach()  # detach from graph, don't backpropagate
            q_target = b_r + GAMMA * (1. - b_d) * q_next.gather(
                1, best_actions.unsqueeze(1)).squeeze(1)  # shape (batch, 1)
        else:
            q_next = self.target_net(
                b_s_).detach()  # detach from graph, don't backpropagate
            q_target = b_r + GAMMA * (
                1. - b_d) * q_next.max(1)[0]  # shape (batch, 1)

        loss = F.smooth_l1_loss(q_eval, q_target, reduction='none')
        b_w = torch.FloatTensor(b_weights)
        if USE_GPU:
            b_w = b_w.cuda()
        loss = torch.mean(b_w * loss)
        td_error = (q_target - q_eval).data.cpu().numpy()

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.eval_net.parameters(), 10.)
        self.optimizer.step()

        if MEMORY_MODE == 'PER':
            new_priorities = np.abs(td_error) + PER_EPSILON
            self.replay_buffer.update_priorities(b_idxes, new_priorities)

    def save_model(self):
        # save evaluation network and target network simultaneously
        self.eval_net.save(EVAL_PATH)
        self.target_net.save(TARGET_PATH)

    def load_model(self):
        # load evaluation network and target network simultaneously
        self.eval_net.load(EVAL_PATH)
        self.target_net.load(TARGET_PATH)
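
learn(beta) above takes the importance-sampling exponent used by the prioritized replay buffer, which is commonly annealed from a small initial value toward 1.0 over training. One possible schedule is sketched below; PER_BETA_START and BETA_FRAMES are hypothetical constants, not part of the original snippet.

PER_BETA_START = 0.4     # initial importance-sampling exponent (hypothetical)
BETA_FRAMES = 100_000    # steps over which beta is annealed to 1.0 (hypothetical)

def per_beta(step):
    # linear anneal from PER_BETA_START to 1.0
    return min(1.0, PER_BETA_START + step * (1.0 - PER_BETA_START) / BETA_FRAMES)

# usage inside a training loop (sketch):
#   dqn.store_transition(s, a, r, s_, done)
#   if dqn.memory_counter > BATCH_SIZE:
#       dqn.learn(beta=per_beta(dqn.memory_counter))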
Example #10
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 dqn_type='DQN',
                 replay_memory_size=1e5,
                 batch_size=64,
                 gamma=0.99,
                 learning_rate=1e-3,
                 target_tau=2e-3,
                 update_rate=4,
                 seed=0):

        self.dqn_type = dqn_type
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        self.seed = random.seed(seed)
        """
        # DQN Agent Q-Network
        # For DQN training, two neural network models are employed;
        # (a) A network that is updated every (step % update_rate == 0)
        # (b) A target network, with weights updated to equal the network at a slower (target_tau) rate.
        # The slower modulation of the target network weights operates to stabilize learning.
        """
        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.learn_rate)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, actions, rewards, next_state, dones):
        # Save experience in replay memory
        for i in range(len(actions)):
            # print("Step ACTIONS", actions, actions[i], state[i])
            self.memory.add(state[i], actions[i], rewards[i], next_state[i],
                            dones[i])

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_rate
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.0):
        """Returns actions for given state as per current policy.

            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.network.eval()
        with torch.no_grad():
            action_values = self.network(state)
        self.network.train()

        num_agents = len(action_values[0])

        # print("AGENT ACT VALUES", action_values,  np.argmax(action_values.cpu().data.numpy()[0], 1),  np.array([random.choice(np.arange(self.action_size)) for i in range(num_agents)]))

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()[0], 1)
        else:
            return np.array([
                random.choice(np.arange(self.action_size))
                for i in range(num_agents)
            ])

    # Update value parameters using given batch of experience tuples.
    def learn(self, experiences, gamma, DQN=True):
        """
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # Get Q values from current observations (s, a) using the model network
        Qsa = self.network(states).gather(1, actions)

        if (self.dqn_type == 'DDQN'):
            #Double DQN
            #************************
            # select a' with the online network, evaluate it with the target network
            Qsa_prime_actions = self.network(next_states).detach().max(
                1)[1].unsqueeze(1)
            Qsa_prime_targets = self.target_network(
                next_states).detach().gather(1, Qsa_prime_actions)

        else:
            #Regular (Vanilla) DQN
            #************************
            # Get max Q values for (s',a') from target model
            Qsa_prime_target_values = self.target_network(next_states).detach()
            Qsa_prime_targets = Qsa_prime_target_values.max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Qsa_targets = rewards + (gamma * Qsa_prime_targets * (1 - dones))

        # Compute loss (error)
        loss = F.mse_loss(Qsa, Qsa_targets)

        # print(Qsa, Qsa_targets)
        # print(loss)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.network, self.target_network, self.tau)

    """
    Soft update model parameters.
    θ_target = τ*θ_local + (1 - τ)*θ_target
    """

    def soft_update(self, local_model, target_model, tau):
        """
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #11
class QR_DQN(object):
    def __init__(self):
        self.pred_net, self.target_net = ConvNet(), ConvNet()
        # sync eval target
        self.update_target(self.target_net, self.pred_net, 1.0)
        # use gpu
        if USE_GPU:
            self.pred_net.cuda()
            self.target_net.cuda()
            
        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0
        
        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)
        
        # define optimizer
        self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR)
        
    def update_target(self, target, pred, update_rate):
        # update target network parameters using the prediction network
        for target_param, pred_param in zip(target.parameters(), pred.parameters()):
            target_param.data.copy_((1.0 - update_rate) \
                                    * target_param.data + update_rate*pred_param.data)
            
    def save_model(self):
        # save prediction network and target network
        self.pred_net.save(PRED_PATH)
        self.target_net.save(TARGET_PATH)

    def load_model(self):
        # load prediction network and target network
        self.pred_net.load(PRED_PATH)
        self.target_net.load(TARGET_PATH)

    def choose_action(self, x, EPSILON):
        x = torch.FloatTensor(x)
        if USE_GPU:
            x = x.cuda()

        if np.random.uniform() >= EPSILON:
            # greedy case
            action_value = self.pred_net(x).mean(dim=2) # (N_ENVS, N_ACTIONS)
            action = torch.argmax(action_value, dim=1).data.cpu().numpy()
        else:
            # random exploration case
            action = np.random.randint(0, N_ACTIONS, (x.size(0)))
        return action

    def store_transition(self, s, a, r, s_, done):
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def learn(self):
        self.learn_step_counter += 1
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.update_target(self.target_net, self.pred_net, 1e-2)
    
        b_s, b_a, b_r, b_s_, b_d = self.replay_buffer.sample(BATCH_SIZE)
        b_w, b_idxes = np.ones_like(b_r), None
            
        b_s = torch.FloatTensor(b_s)
        b_a = torch.LongTensor(b_a)
        b_r = torch.FloatTensor(b_r)
        b_s_ = torch.FloatTensor(b_s_)
        b_d = torch.FloatTensor(b_d)

        if USE_GPU:
            b_s, b_a, b_r, b_s_, b_d = b_s.cuda(), b_a.cuda(), b_r.cuda(), b_s_.cuda(), b_d.cuda()

        # action value distribution prediction
        q_eval = self.pred_net(b_s) # (m, N_ACTIONS, N_QUANT)
        mb_size = q_eval.size(0)
        q_eval = torch.stack([q_eval[i].index_select(0, b_a[i]) for i in range(mb_size)]).squeeze(1) 
        # (m, N_QUANT)
        q_eval = q_eval.unsqueeze(2) # (m, N_QUANT, 1)
        # note that dim 1 is for present quantile, dim 2 is for next quantile
        
        # get next state value
        q_next = self.target_net(b_s_).detach() # (m, N_ACTIONS, N_QUANT)
        best_actions = q_next.mean(dim=2).argmax(dim=1) # (m)
        q_next = torch.stack([q_next[i].index_select(0, best_actions[i]) for i in range(mb_size)]).squeeze(1)
        # (m, N_QUANT)
        q_target = b_r.unsqueeze(1) + GAMMA * (1. - b_d.unsqueeze(1)) * q_next
        # (m, N_QUANT)
        q_target = q_target.unsqueeze(1) # (m , 1, N_QUANT)

        # quantile Huber loss
        u = q_target.detach() - q_eval # (m, N_QUANT, N_QUANT)
        tau = torch.FloatTensor(QUANTS_TARGET).view(1, -1, 1) # (1, N_QUANT, 1)
        # note that tau is for present quantile
        if USE_GPU:
            tau = tau.cuda()
        weight = torch.abs(tau - u.le(0.).float()) # (m, N_QUANT, N_QUANT)
        loss = F.smooth_l1_loss(q_eval, q_target.detach(), reduction='none')
        # (m, N_QUANT, N_QUANT)
        loss = torch.mean(weight * loss, dim=1).mean(dim=1)

        # apply importance-sampling weights (uniform here) and reduce to a scalar
        b_w = torch.Tensor(b_w)
        if USE_GPU:
            b_w = b_w.cuda()
        loss = torch.mean(b_w * loss)
        
        # backprop loss
        self.optimizer.zero_grad()
        loss.backward()
#         torch.nn.utils.clip_grad_norm_(self.pred_net.parameters(),0.1)
        self.optimizer.step()
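
The learn() above computes the QR-DQN quantile Huber loss by broadcasting over an (m, N_QUANT, N_QUANT) grid: dim 1 indexes the predicted quantile, dim 2 the target quantile, and QUANTS_TARGET is assumed to hold the quantile midpoints (2i + 1) / (2 * N_QUANT). The standalone sketch below is an assumption-laden rewrite, not the original code; it does the same shape bookkeeping explicitly and sums over the predicted-quantile dimension as in the QR-DQN paper, which differs from the double mean above only by a constant factor.

import torch
import torch.nn.functional as F

def quantile_huber_loss(q_eval, q_target, taus):
    # q_eval:   (m, N_QUANT) predicted quantiles of the chosen action
    # q_target: (m, N_QUANT) target quantiles (already detached)
    # taus:     (N_QUANT,)   quantile midpoints, e.g. (2i + 1) / (2 * N_QUANT)
    u = q_target.unsqueeze(1) - q_eval.unsqueeze(2)        # (m, N_QUANT, N_QUANT)
    tau = taus.view(1, -1, 1)                              # indexes the predicted-quantile dim
    weight = torch.abs(tau - (u < 0.).float())             # asymmetric quantile weight
    huber = F.smooth_l1_loss(q_eval.unsqueeze(2).expand_as(u),
                             q_target.unsqueeze(1).expand_as(u),
                             reduction='none')             # (m, N_QUANT, N_QUANT)
    return (weight * huber).mean(dim=2).sum(dim=1).mean()

# toy shape check with random tensors
m, n_quant = 32, 8
taus = (torch.arange(n_quant, dtype=torch.float32) + 0.5) / n_quant
loss = quantile_huber_loss(torch.randn(m, n_quant), torch.randn(m, n_quant), taus)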
Example #12
0
class Agent():
    '''Interact with and learn from the environment.'''
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0  # counter for activating learning every few steps
        self.running_c_loss = 0
        self.running_a_loss = 0
        self.training_cnt = 0

        # Actor network (w/ target network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network (w/ target network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    def act(self, state, mode):
        '''Return actions for the given state as per the current policy.

        Params
        ======
            state (array): current state
            mode (string): 'train' (adds OU noise) or 'test' (deterministic)
        '''
        state = torch.from_numpy(state).unsqueeze(0).float().to(
            device)  # shape of state (1, state_size)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if mode == 'test':
            return np.clip(action, -1, 1)

        elif mode == 'train':  # if train, then add OUNoise in action
            action += self.noise.sample()
            return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # activate learning every few steps
        self.t_step = self.t_step + 1
        if self.t_step % LEARN_EVERY_STEP == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:
                for _ in range(10):  # update 10 times per learning
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.running_c_loss += float(critic_loss.cpu().data.numpy())
        self.training_cnt += 1
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.running_a_loss += float(actor_loss.cpu().data.numpy())
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        #torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)  # clip gradient to max 1
        self.actor_optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
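
For context, here is a minimal training-loop sketch for the Agent above; the environment handle env, n_episodes, and max_t are assumptions (any gym-style continuous-control environment with matching state/action sizes), not part of the original example.

def train_ddpg(env, agent, n_episodes=200, max_t=1000):
    scores = []
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        agent.reset()                                    # reset the OU noise process
        score = 0.0
        for _ in range(max_t):
            action = agent.act(state, mode='train')[0]   # act() returns shape (1, action_size)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
    return scores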
Example #13
0
class DQNAgent():
    def __init__(self, input_shape, action_size, buffer_size, batch_size,
                 gamma, lr, tau, update_every, device):
        """Initialize an Agent object.
        
        Params
        ======
            input_shape (tuple): dimension of each state
            action_size (int): dimension of each action
            buffer_size (int): replay buffer size
            batch_size (int):  minibatch size
            gamma (float): discount factor
            lr (float): learning rate 
            tau (float): interpolation parameter for the soft target update
            update_every (int): how often to update the network
            device (string): device to run on, e.g. 'cuda' or 'cpu'
        """
        self.input_shape = input_shape
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.lr = lr
        self.update_every = update_every
        self.tau = tau
        self.device = device

        # Q-Network
        self.policy_net = DQNLinear(input_shape, action_size).to(self.device)
        self.target_net = DQNLinear(input_shape, action_size).to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size,
                                   self.device)

        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every

        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=0.01):
        state = torch.from_numpy(state).unsqueeze(0).to(self.device)
        self.policy_net.eval()
        with torch.no_grad():
            action_values = self.policy_net(state)
        self.policy_net.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # Get expected Q values from policy model
        Q_expected_current = self.policy_net(states)
        Q_expected = Q_expected_current.gather(1,
                                               actions.unsqueeze(1)).squeeze(1)

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.target_net(next_states).detach().max(1)[0]

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.policy_net, self.target_net, self.tau)

    # θ'=θ×τ+θ'×(1−τ)
    def soft_update(self, policy_model, target_model, tau):
        for target_param, policy_param in zip(target_model.parameters(),
                                              policy_model.parameters()):
            target_param.data.copy_(tau * policy_param.data +
                                    (1.0 - tau) * target_param.data)

    def load_model(self, path):
        checkpoint = torch.load(path)
        self.policy_net.load_state_dict(checkpoint['state_dict'])
        self.target_net.load_state_dict(checkpoint['state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        scores = checkpoint['scores']

        return scores

    def save_model(self, path, scores):
        model = {
            "state_dict": self.policy_net.state_dict(),
            "optimizer": self.optimizer.state_dict(),
            "scores": scores
        }
        torch.save(model, path)
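
A minimal epsilon-greedy training-loop sketch for DQNAgent follows; the environment and the decay schedule (eps_start, eps_end, eps_decay) are assumptions, not part of the original example.

def train_dqn(env, agent, n_episodes=500, max_t=1000,
              eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    scores, eps = [], eps_start
    for _ in range(n_episodes):
        state = env.reset()
        score = 0.0
        for _ in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)   # decay exploration rate
    return scores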
Example #14
0
class AgentD4PG():
    """
    Agent implementing noisy agent
    """
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 device=device,
                 epsilon=0.3):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.device = device
        self.epsilon = epsilon

        self.t_step = 0  # counter for activating learning every few steps
        self.running_c_loss = 0
        self.running_a_loss = 0
        self.training_cnt = 0

        # Actor network (w/ target network)
        self.actor_local = DDPGActor(state_size, action_size, seed).to(device)
        self.actor_target = DDPGActor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network (w/ target network)
        self.critic_local = D4PGCritic(state_size, action_size, seed, N_ATOMS,
                                       Vmin, Vmax).to(device)
        self.critic_target = D4PGCritic(state_size, action_size, seed, N_ATOMS,
                                        Vmin, Vmax).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    def act(self, states, mode):
        states_v = torch.Tensor(np.array(states,
                                         dtype=np.float32)).to(self.device)

        self.actor_local.eval()
        with torch.no_grad():
            mu_v = self.actor_local(states_v)
            actions = mu_v.data.cpu().numpy()
        self.actor_local.train()

        if mode == "test":
            return np.clip(actions, -1, 1)

        elif mode == "train":
            actions += self.epsilon * np.random.normal(size=actions.shape)
            actions = np.clip(actions, -1, 1)
            return actions

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # activate learning every few steps
        self.t_step = self.t_step + 1
        if self.t_step % LEARN_EVERY_STEP == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:
                for _ in range(10):  # update 10 times per learning
                    experiences = self.memory.sample2()
                    self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        crt_distr_v = self.critic_local(states, actions)
        last_act_v = self.actor_target(next_states)
        last_distr_v = F.softmax(self.critic_target(next_states, last_act_v),
                                 dim=1)

        proj_distr_v = distr_projection(last_distr_v,
                                        rewards,
                                        dones,
                                        gamma=gamma**REWARD_STEPS,
                                        device=device)

        prob_dist_v = -F.log_softmax(crt_distr_v, dim=1) * proj_distr_v
        critic_loss_v = prob_dist_v.sum(dim=1).mean()

        self.running_c_loss += float(critic_loss_v.cpu().data.numpy())
        self.training_cnt += 1

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss_v.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        crt_distr_v = self.critic_local(states, actions_pred)
        actor_loss_v = -self.critic_local.distr_to_q(crt_distr_v)
        actor_loss_v = actor_loss_v.mean()
        self.running_a_loss += float(actor_loss_v.cpu().data.numpy())

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss_v.backward()
        # torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)  # clip gradient to max 1
        self.actor_optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
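
The learn() above relies on a distr_projection helper that is not shown in this example. Below is a minimal sketch of the standard categorical projection used by C51/D4PG, assuming rewards and dones are (batch,) float tensors and that n_atoms, v_min, v_max match the N_ATOMS, Vmin, Vmax passed to D4PGCritic; treat it as an illustration, not the original helper.

import torch

def distr_projection(next_distr, rewards, dones, gamma, device='cpu',
                     n_atoms=51, v_min=-10.0, v_max=10.0):
    # Project the target distribution r + gamma * Z(s', a') back onto the
    # fixed support of n_atoms evenly spaced atoms in [v_min, v_max].
    batch_size = next_distr.size(0)
    delta_z = (v_max - v_min) / (n_atoms - 1)
    proj_distr = torch.zeros(batch_size, n_atoms, device=device)
    for atom in range(n_atoms):
        z_j = v_min + atom * delta_z
        tz_j = (rewards + (1.0 - dones) * gamma * z_j).clamp(v_min, v_max)
        b_j = (tz_j - v_min) / delta_z                # fractional index on the support
        l, u = b_j.floor().long(), b_j.ceil().long()
        eq = (u == l)                                 # lands exactly on an atom
        proj_distr[eq, l[eq]] += next_distr[eq, atom]
        ne = ~eq                                      # split mass between neighbours
        proj_distr[ne, l[ne]] += next_distr[ne, atom] * (u.float() - b_j)[ne]
        proj_distr[ne, u[ne]] += next_distr[ne, atom] * (b_j - l.float())[ne]
    return proj_distr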