Example #1
def generate_memory(size, game='Pendulum'):
    """Fill a replay memory with `size` random one-step transitions."""
    if game.startswith('Pendulum'):
        env = PendulumWrapper()
    elif game.startswith('LunarLander'):
        env = LunarWrapper()
    else:
        raise ValueError('Unsupported game: {}'.format(game))

    memory = ReplayMemory(100000)

    for _ in range(size):
        s = env.reset()
        a = env.action_space.sample()
        s_, r, d, _ = env.step(a)

        # store (state, action, reward, next_state, not_done)
        memory.push(s, a, r, s_, 1 - int(d))

    return memory
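
The ReplayMemory class used throughout these examples is never shown. The sketch below is a minimal, assumed implementation that is only meant to be consistent with how the examples call it (push(*fields), sample(batch_size), len(memory)); the field layout pushed differs from example to example, so it stores whatever tuple it is given.

import random


class ReplayMemory:
    """Minimal sketch of a fixed-capacity replay buffer (an assumption,
    not the original implementation behind these examples)."""

    def __init__(self, capacity=100000):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *fields):
        # The examples push different field layouts, e.g. (s, a, r, s_, not_done)
        # or (state, action, next_state, reward); store the tuple as given.
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = fields
        self.position = (self.position + 1) % self.capacity  # overwrite oldest first

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)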
Example #2
class TransitionSaver:
    def __init__(self):
        self.processor = PreprocessImage(None)
        self.memory = ReplayMemory()
        self.transitions = []
        self.index = 0
        self.nsteps = 10

    def new_episode(self, first_state):
        self.state = self.processor._observation(first_state)

    def add_transition(self, action, next_state, reward, done):
        if not done and self.index < self.nsteps:
            next_state = self.processor._observation(next_state)
            self.transitions.insert(
                0,
                Transition(self.state, self.add_noop(action), next_state,
                           torch.FloatTensor([reward]), torch.zeros(1)))

            transitions = []
            gamma = 1
            for trans in self.transitions:
                transitions.append(trans._replace(n_reward=trans.n_reward + gamma * reward))
                gamma = gamma * GAMMA
            self.transitions = transitions
        else:
            for trans in self.transitions:
                self.memory.push(trans)
            self.transitions = []
        self.state = next_state
    
    def add_noop(self, actions):
        actions.insert(0, 0)
        actions = torch.LongTensor(actions)
        actions[0] = (1 - actions[1:].max(0)[0])[0]
        return actions.max(0)[1]

    def save(self, fname):
        with open(fname, 'wb') as memory_file:
            pickle.dump(self.memory, memory_file)
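
The bookkeeping in add_transition above accumulates an n-step discounted return into each stored transition's n_reward field: the transition recorded at step t ends up holding r_t + GAMMA*r_{t+1} + GAMMA^2*r_{t+2} + ... A standalone illustration of that accumulation, assuming GAMMA = 0.99:

GAMMA = 0.99
rewards = [1.0, 0.0, 2.0]      # per-step rewards, oldest first
n_returns = []                 # newest transition first, mirroring self.transitions
for r in rewards:
    n_returns.insert(0, 0.0)   # a new transition starts with n_reward = 0
    # every stored transition of age k receives GAMMA**k times the new reward
    n_returns = [g + (GAMMA ** k) * r for k, g in enumerate(n_returns)]
print(n_returns)  # [r2, r1 + GAMMA*r2, r0 + GAMMA*r1 + GAMMA**2*r2]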
Example #3
    env.reset()

    episode_record = []  # use this to record temporarily for one episode
    # for t in count():
    for t in range(2999):
        steps_done += 1
        # Select and perform an action
        # print(state.shape)
        action = select_action(torch.tensor(state).to(device))
        # print(action.item())
        next_state, reward, terminal, _ = env.step([action.item()])
        episode_record.append((next_state, reward))
        # print(next_state.shape)
        reward = torch.tensor([reward], device=device)
        # Store the transition in memory
        memory.push(torch.tensor([state]), torch.tensor([action]),
                    torch.tensor([next_state]), reward)
        # print("reward",reward)
        # Move to the next state
        state = next_state
        # Perform one step of the optimization (on the target network)
        optimize_model()
        if terminal:
            print('terminal')
            episode_durations.append(t + 1)
            break
        # Update the target network, copying all weights and biases in DQN
        if steps_done % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
    average_reward = evaluate_episode(episode_record)
    print("episode:", i_episode, 'average reward:', average_reward)
    torch.save(target_net.state_dict(),
Example #4
         rad = np.linalg.norm(s_next - kwargs["emb_goal"], 2)
         threshold = 3.5
         kwargs["emb_threshold"] = threshold
     else:
         rad = np.linalg.norm(ts_next - goal.reshape(-1), 2)
         threshold = 0.5
     r = -1
     if rad < threshold:
         count += 1
         # print(ts_next)
         r = 0
         s_next = None
     if is_shapedreward:
         r -= rad
     if not is_image:
         memory.push(ts, a, ts_next, r)
     else:
         memory.push(s, a, s_next, r)
 print("Number of goals reached in transitions: %d" % count)
 """
 Training Q-function
 """
 n_iters = len(transitions) // BATCH_SIZE
 for epoch in range(N_EPOCHS):
     loss = 0
     for it in range(n_iters):
         loss += optimize_model(memory, policy_net, target_net, optimizer,
                                GAMMA, BATCH_SIZE)
         if it % TARGET_UPDATE == 0:
             target_net.load_state_dict(policy_net.state_dict())
     pred_v, real_dist, emb_dist, reward, emb_reward = eval_task(
def run_dq_pole(num_episodes):
    logg = logging.getLogger(f"c.{__name__}.run_dq_pole")
    logg.debug(f"Start run_dq_pole")

    env = gym.make("CartPole-v0").unwrapped

    plt.ion()

    # if gpu is to be used
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logg.debug(f"Using {device} as device")

    #  show_frame(env)

    # hyperparameters
    BATCH_SIZE = 128
    GAMMA = 0.999
    EPS_START = 0.9
    EPS_END = 0.05
    EPS_DECAY = 200
    TARGET_UPDATE = 10

    env.reset()
    # Get screen size so that we can initialize layers correctly based on shape
    # returned from AI gym. Typical dimensions at this point are close to 3x40x90
    # which is the result of a clamped and down-scaled render buffer in get_screen()
    init_screen = get_screen(env, device)
    _, _, screen_height, screen_width = init_screen.shape

    # Get number of actions from gym action space
    n_actions = env.action_space.n

    policy_net = DQN(screen_height, screen_width, n_actions).to(device)
    target_net = DQN(screen_height, screen_width, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(10000)

    steps_done = 0

    # main training loop. At the beginning we reset the environment and
    # initialize the state Tensor. Then, we sample an action, execute it,
    # observe the next screen and the reward (always 1), and optimize our model
    # once. When the episode ends (our model fails), we restart the loop.

    #  num_episodes = 50
    episode_durations = []

    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
        last_screen = get_screen(env, device)
        current_screen = get_screen(env, device)
        state = current_screen - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(
                state,
                n_actions,
                steps_done,
                device,
                policy_net,
                EPS_START,
                EPS_END,
                EPS_DECAY,
            )
            _, reward, done, _ = env.step(action.item())
            reward = torch.tensor([reward], device=device)

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env, device)
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the target network)
            optimize_model(BATCH_SIZE, memory, device, policy_net, target_net,
                           GAMMA, optimizer)
            if done:
                episode_durations.append(t + 1)
                plot_durations(episode_durations)
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

    print("Complete")
    env.render()
    # remember to close the env, avoid sys.meta_path undefined
    env.close()
    plt.ioff()
    plt.show()
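
Example #5 relies on two helpers that are not shown, get_screen and optimize_model. Below is a minimal optimize_model sketch that matches the call signature used above; the Transition namedtuple with fields (state, action, next_state, reward) and the standard DQN update it performs are assumptions, not the original code.

from collections import namedtuple

import torch
import torch.nn.functional as F

# Assumed field layout, matching how memory.push(...) is called above.
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


def optimize_model(BATCH_SIZE, memory, device, policy_net, target_net, GAMMA, optimizer):
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    # Mask of non-final states; terminal transitions store next_state == None.
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s_t, a) for the actions actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # max_a Q_target(s_{t+1}, a), with 0 for terminal states.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)  # clip gradients element-wise
    optimizer.step()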
Example #6
class DQNAgent(Agent):
    def __init__(self, model, env, **kwargs):
        Agent.__init__(self, **kwargs)
        self.update_step = 0
        self.eps = self.EPS_START
        self.global_step = 0
        self.model = model
        self.target_model = copy.deepcopy(model)
        self.in_size = model.in_size
        self.out_size = model.out_size
        self.memory = ReplayMemory(self.REPLAY_CAPACITY)
        self.opt = torch.optim.Adam(self.model.parameters(), lr=self.LR)
        self.env = env
        self.container = Container(self.model.SAVE_MODEL_NAME)

    def select_action(self, state):
        if self.is_training:
            self.global_step += 1
            self.eps = self.EPS_START - (self.EPS_START - self.EPS_END
                                         ) / self.EPS_DECAY * self.global_step
            if self.eps < self.EPS_END:
                self.eps = self.EPS_END

        if self.is_training and np.random.rand() < self.eps:
            return LongTensor([[np.random.randint(self.out_size)]])
        else:
            var = Variable(state).type(FloatTensor)
            out = self.model(var)
            return out.max(1)[1].data.view(1, 1)

    def _DQ_loss(self, y_pred, reward_batch, non_final_mask,
                 non_final_next_states):
        q_next = Variable(torch.zeros(self.BATCH_SIZE).type(FloatTensor))
        target_q = self.target_model(non_final_next_states)
        if self.DOUBLE_DQN:
            max_act = self.model(non_final_next_states).max(1)[1].view(-1, 1)
            q_next[non_final_mask] = target_q.gather(1, max_act).data.view(-1)
        else:
            q_next[non_final_mask] = target_q.max(1)[0].data

        # next_state_values.volatile = False
        y = q_next * self.GAMMA + reward_batch
        loss = nn.functional.mse_loss(y_pred, y)
        return loss

    def _calc_loss(self):
        batch = self.memory.sample(self.BATCH_SIZE)
        non_final_mask = ByteTensor(
            tuple([s is not None for s in batch.next_state]))
        non_final_next_states = Variable(
            torch.cat([s for s in batch.next_state if s is not None]))

        state_batch = Variable(
            torch.cat([s for s in batch.state if s is not None]))
        action_batch = Variable(
            torch.cat([s for s in batch.action if s is not None]))
        reward_batch = Variable(
            torch.cat([s for s in batch.reward if s is not None]))

        y_pred = self.model(state_batch).gather(1, action_batch).squeeze()
        loss = self._DQ_loss(y_pred, reward_batch, non_final_mask,
                             non_final_next_states)
        self.container.add("y_pred", torch.mean(y_pred.data))
        self.container.add("loss", loss.data.item())
        return loss

    def update_policy(self):
        loss = self._calc_loss()
        self.opt.zero_grad()
        loss.backward()
        if self.GRADIENT_CLIPPING:
            for param in self.model.parameters():
                param.grad.data.clamp_(-self.GRADIENT_CLIPPING,
                                       self.GRADIENT_CLIPPING)
        self.opt.step()

    def update_target_network(self):
        if not self.SOFT_UPDATE:
            self.update_step = (self.update_step + 1) % self.TARGET_UPDATE_FREQ
            if self.update_step == 0:
                state_dict = self.model.state_dict()
                self.target_model.load_state_dict(copy.deepcopy(state_dict))
        else:
            tw = self.target_model.state_dict().values()
            sw = self.model.state_dict().values()
            for t, s in zip(tw, sw):
                t.add_(self.TARGET_UPDATE_FREQ * (s - t))

    def _forward(self, obs, is_train, update_memory):
        if self.state_processor:
            state = self.state_processor(obs)
        else:
            temp = obs[None, :] if len(obs.shape) == 1 else obs[None, None, :]
            state = torch.from_numpy(temp).type(FloatTensor)

        if self.GET_DEMO:
            action = self.rule_processor(obs)
        else:
            action = self.select_action(state)

        act = action.numpy().squeeze()
        if self.VERBOSE:
            print("action: {}".format(act))
        action_step = self.ACTION_REPEAT
        reward = 0
        done = False
        while action_step > 0:
            action_step -= 1
            next_obs, r, done, _ = self.env.step(act)

            # CartPole reward
            # x, x_dot, theta, theta_dot = next_obs
            # r1 = (self.env.x_threshold - abs(x)) / self.env.x_threshold - 0.8
            # r2 = (self.env.theta_threshold_radians - abs(theta)) / self.env.theta_threshold_radians - 0.5
            # r = r1 + r2

            # MountainCar reward
            # position, velocity = next_obs
            # r = abs(position - (-0.5))

            reward += r
            if done:
                break

        self.reward_episode += reward
        if update_memory:
            reward = FloatTensor([reward])
            self.memory.push(state, action, reward)
            if done:
                self.memory.push(None, None, None)

        if len(self.memory) >= self.REPLAY_START and is_train:
            self.update_policy()
            self.update_target_network()

        if self.is_render:
            self.env.render()

        return next_obs, done

    def fit(self,
            is_train,
            update_memory=True,
            num_step=np.inf,
            num_episode=np.inf,
            max_episode_length=np.inf,
            is_render=False):
        if num_step == np.inf and num_episode == np.inf:
            raise Exception("Specify either num_step or num_episode")
        if num_step != np.inf and num_episode != np.inf:
            raise Exception("Specify only one of num_step and num_episode")

        self.is_render = is_render
        while self.i_episode < num_episode and self.i_step < num_step:
            self.i_episode += 1
            print("------------------------")
            print("episode: {}, step: {}".format(self.i_episode, self.i_step))
            obs = self.env.reset()
            self.reward_episode = 0
            episode_step = 0
            while episode_step < max_episode_length:
                episode_step += 1
                self.i_step += 1
                obs, done = self._forward(obs, is_train, update_memory)
                if done:
                    self.reward_step_pairs.push(self.reward_episode,
                                                self.i_step)
                    if self.is_test:
                        self.container.add("reward", self.reward_episode,
                                           self.record_i_step)
                    self.print(is_train)
                    break

    def train(self, **kwargs):
        self.is_training = True
        if kwargs.pop("clear", True):
            self.i_episode = 0
            self.i_step = 0
            self.reward_step_pairs.reset()
        print("Training starts...")
        self.fit(True, **kwargs)
        # self.model.save()
        self.container.save()

    def run(self, **kwargs):
        self.is_training = False
        if kwargs.pop("clear", True):
            self.i_episode = 0
            self.i_step = 0
            self.reward_step_pairs.reset()
        print("Running starts...")
        self.fit(False, **kwargs)

    def _test(self, num_step):
        self.record_i_episode = self.i_episode
        self.record_i_step = self.i_step
        self.is_test = True
        self.run(num_step=num_step)
        self.i_episode = self.record_i_episode
        self.i_step = self.record_i_step
        self.is_test = False

    def train_test(self, num_step, test_period=1000, test_step=100):
        self.i_episode = 0
        self.i_step = 0
        while self.i_step < num_step:
            self._test(test_step)
            self.train(num_step=self.record_i_step + test_period, clear=False)
        self._test(test_step)

    def print(self, is_train):
        print("reward_episode {}".format(self.reward_episode))
        print("eps {}".format(self.eps))
        if is_train:
            print("loss_episode {}".format(self.container.get("loss")))
            print("y_pred_episode {}".format(self.container.get("y_pred")))
Example #7
class DQNagent(object):
    def __init__(self, filename='dqn0'):
        self.filename = './trained_agents/' + filename
        self.policy_net = DQN(self.filename + '.cfg')
        self.target_net = DQN(self.filename + '.cfg')
        self.memory = ReplayMemory(16384)
        self.gamma = 0.999

    def select_action(self, state, epsilon):
        if np.random.rand() < epsilon:
            idx = LongTensor([[random.randrange(self.policy_net.output_size)]])
        else:
            idx = self.policy_net(
                Variable(state,
                         volatile=True).type(FloatTensor)).data.max(1)[1].view(
                             1, 1)
        return idx

    def update(self, batch_size=16):
        if len(self.memory.memory) < batch_size:
            batch_size = len(self.memory.memory)

        transitions = self.memory.sample(batch_size)
        batch = Transition(*zip(*transitions))

        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))

        non_final_mask = ByteTensor(
            tuple(map(lambda s: s is not None, batch.next_state)))
        non_final_next_states = Variable(torch.cat(
            [s for s in batch.next_state if s is not None]),
                                         volatile=True)

        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        next_state_values = Variable(torch.zeros(batch_size).type(Tensor))
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0]

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch
        expected_state_action_values = Variable(
            expected_state_action_values.data)

        loss = F.mse_loss(state_action_values, expected_state_action_values)

        old_params = freeze_as_np_dict(self.policy_net.state_dict())
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            logging.debug(param.grad.data.sum())
            param.grad.data.clamp_(-1., 1.)
        self.optimizer.step()

        new_params = freeze_as_np_dict(self.policy_net.state_dict())
        check_params_changed(old_params, new_params)
        return loss.data[0]

    def train(self,
              env,
              n_epochs=30,
              epsilon_init=1.,
              epsilon_schedule='exp',
              eps_decay=None,
              lr=0.001,
              batch_size=32):
        if epsilon_schedule == 'linear':
            eps_range = np.linspace(epsilon_init, 0., n_epochs)
        elif epsilon_schedule == 'constant':
            eps_range = [epsilon_init for _ in range(n_epochs)]
        elif epsilon_schedule == 'exp':
            if not eps_decay:
                eps_decay = n_epochs // 4
            eps_range = [
                epsilon_init * math.exp(-1. * i / eps_decay)
                for i in range(n_epochs)
            ]
        else:
            raise ValueError(
                'Unknown epsilon_schedule: {}'.format(epsilon_schedule))

        history_file = open(self.filename + 'history', mode='a+')
        self.policy_net = self.policy_net.cuda()
        self.target_net = self.target_net.cuda()
        self.target_net.eval()
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)

        losses, rewards, change_history = [], [], []

        for epoch in range(n_epochs):
            env.reset()
            last_screen = get_screen(env)
            current_screen = get_screen(env)
            state = current_screen - last_screen
            done = False
            epoch_losses = []
            epoch_rewards = []
            video = []

            while not done:
                if epoch % 10 == 1:
                    video.append(last_screen)
                action = self.select_action(state, eps_range[epoch])

                _, reward, done, _ = env.step(action[0, 0])

                last_screen = current_screen
                current_screen = get_screen(env)

                reward = Tensor([reward])
                if not done:
                    next_state = current_screen - last_screen
                else:
                    next_state = None

                self.memory.push(state, action, next_state, reward)
                state = next_state
                loss = self.update(batch_size=batch_size)

                epoch_losses.append(loss)
                epoch_rewards.append(reward)

            history_file.write(
                'Epoch {}: loss= {}, reward= {}, duration= {}\n'.format(
                    epoch, np.mean(epoch_losses), np.sum(epoch_rewards),
                    len(epoch_rewards)))

            losses.append(np.mean(epoch_losses))
            rewards.append(np.sum(epoch_rewards))

            if epoch % 10 == 1:
                self.target_net.load_state_dict(self.policy_net.state_dict())
                self.save(ext=str(epoch))
                self.make_video(video, ext='_train_' + str(epoch))

                with open(self.filename + '.train_losses', 'a+') as f:
                    for l in losses:
                        f.write(str(l) + '\n')
                losses = []
                with open(self.filename + '.train_rewards', 'a+') as f:
                    for r in rewards:
                        f.write(str(r) + '\n')
                rewards = []
        self.save()

    def test(self, env, n_epochs=30, verbose=False):
        rewards = []
        self.policy_net = self.policy_net.cuda()
        self.target_net = self.target_net.cuda()
        self.target_net.eval()

        for epoch in range(n_epochs):
            env.reset()
            done = False
            epoch_rewards = []
            video = []

            last_screen = get_screen(env)
            current_screen = get_screen(env)
            state = current_screen - last_screen

            while not done:
                if epoch % 5 == 0:
                    video.append(last_screen)
                action = self.select_action(state, 0.)

                _, reward, done, _ = env.step(action[0, 0])
                last_screen = current_screen
                current_screen = get_screen(env)

                if not done:
                    next_state = current_screen - last_screen
                else:
                    next_state = None

                epoch_rewards.append(reward)
                reward = Tensor([reward])
                state = next_state

                logging.debug(
                    'Test epoch {} :  reward= {}, duration= {}'.format(
                        epoch, np.sum(epoch_rewards), len(epoch_rewards)))
            rewards.append(np.sum(epoch_rewards))

            if epoch % 5 == 0:
                self.make_video(video, ext='_test_' + str(epoch))

            logging.info('Performance estimate : {} pm {}'.format(
                np.mean(rewards), np.std(rewards)))

    def make_video(self, replay, ext=''):
        n_frames = len(replay)
        b_s, n_channels, n_w, n_h = replay[0].shape
        writer = VideoWriter(self.filename + ext + '.mp4')
        for i in range(n_frames):
            writer.writeFrame(replay[i][0][[1, 2, 0]] * 255)
        writer.close()

    def save(self, ext=''):
        torch.save(self.policy_net.state_dict(),
                   self.filename + ext + '.pol.ckpt')
        torch.save(self.target_net.state_dict(),
                   self.filename + ext + '.tgt.ckpt')

    def load(self, filename):
        self.policy_net.load_state_dict(
            torch.load('./trained_agents/' + filename + '.pol.ckpt'))
        self.target_net.load_state_dict(
            torch.load('./trained_agents/' + filename + '.tgt.ckpt'))
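
A hedged usage sketch for DQNagent: the gym environment, the get_screen helper it renders through, and the ./trained_agents/dqn0.cfg network definition are assumptions carried over from the code above.

import gym

env = gym.make('CartPole-v0').unwrapped   # any env the assumed get_screen() can render
agent = DQNagent(filename='dqn0')         # expects ./trained_agents/dqn0.cfg
agent.train(env, n_epochs=30, epsilon_schedule='exp', lr=0.001, batch_size=32)
agent.test(env, n_epochs=10)
agent.save()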
def train(agent, env, num_episode=50, test_interval=25, num_test=20, num_iteration=200, iteration_cutoff=0, 
          BATCH_SIZE=128, num_sample=50, action_space=[-1,1], debug=True, memory=None, seed=2020,
          update_mode=UPDATE_PER_ITERATION, reward_mode=FUTURE_REWARD_NO, gamma=0.99, 
          loss_history=[], loss_historyA=[], lr_history=[], lr_historyA=[], reward_mean_var=(0,-1),
          save_sim_intv=50, save_sim_fnames=[], imdir='screencaps/', useVid=False, save_intm_models=False,
          not_use_rand_in_action=False, not_use_rand_in_test=True, 
         return_memory=False):
    test_hists = []
    steps = 0
    if memory is None:
        ### Update 11/05: changed memory size based on the number of agents
        memory = ReplayMemory(1000 * env.N)
    if iteration_cutoff <= 0:
        iteration_cutoff = num_iteration # Save all iterations into the memory
    
    # Values that would be useful
    N = env.N
    # Note that the seed only controls the numpy random, which affects the environment.
    # To affect pytorch, refer to further documentations: https://github.com/pytorch/pytorch/issues/7068
    np.random.seed(seed)
#     torch.manual_seed(seed)
    test_seeds = np.random.randint(0, 5392644, size=int(num_episode // test_interval)+1)
    
#     rmean = 0
#     rvar = -1
    (rmean, rvar) = reward_mean_var

    for e in range(num_episode):
        steps = 0
        state = env.reset()
        if agent.centralized:
            state = env.state
        state = torch.from_numpy(state).float()
        state = Variable(state)
        if debug:
            env.render()
        # Train History
        state_pool = []
        action_pool = []
        reward_pool = []
        next_state_pool = []
        loss_history.append([])
        loss_historyA.append([])

        for t in range(num_iteration):
#             agent.net.train()
            agent.set_train(True)
            # Try to pick an action, react, and store the resulting behavior in the pool here
            if agent.centralized:
                action = agent.select_action(state, **{
                        'steps_done':t, 'num_sample':50, 'action_space':action_space, 'rand':not_use_rand_in_action
                    }).T
            else:
                actions = []
                for i in range(N):
                    action = agent.select_action(state[i], **{
                        'steps_done':t, 'num_sample':50, 'action_space':action_space, 'rand':not_use_rand_in_action
                    })
                    actions.append(action)
                if torch.is_tensor(action):
                    action = torch.cat(actions).view(-1,env.N)#.T
                else:
                    action = np.array(actions).T # Shape would become (2,N)

            if torch.is_tensor(action):
                next_state, reward, done, _ = env.step(action.detach().numpy())
            else:
                next_state, reward, done, _ = env.step(action)
                
            if agent.centralized:
                next_state = env.state
            next_state = Variable(torch.from_numpy(next_state).float()) # The float() probably avoids bug in net.forward()
            action = action.T # Turn shape back to (N,2)

            if agent.needsExpert:
                # If we need to use expert input during training, then we consult it and get the best action for this state
                actions = env.controller()
                action = actions.T # Shape should already be (2,N), so we turn it into (N,2)
            
            if not(agent.centralized):
                # if reward_mode & FUTURE_REWARD_YES == 0:
                #     # Push everything directly inside if we don't use future discounts
                #     for i in range(N):
                #         memory.push(state[i], action[i], next_state[i], reward[i])
                # else:
                #     # Store and push them outside the loop
                #     state_pool.append(state)
                #     action_pool.append(action)
                #     reward_pool.append(reward)
                #     next_state_pool.append(next_state)
                pass
            else:
                # if reward_mode & FUTURE_REWARD_YES == 0:
                #     # Push everything directly inside if we don't use future discounts
                #     memory.push(state, action, next_state, reward)
                # else:
                #     # Store and push them outside the loop
                #     state_pool.append(state)
                #     action_pool.append(action)
                #     reward_pool.append(reward)
                #     next_state_pool.append(next_state)
                # Centralized training should directly use the real states, instead of observations
                reward = np.sum(reward)

            # Update 1028: Moved this training step outside the loop
            if update_mode == UPDATE_PER_ITERATION:
                # Added 1214: Push the samples to memory if no need for extra processing
                if reward_mode & FUTURE_REWARD_YES == 0 and reward_mode & FUTURE_REWARD_NORMALIZE == 0:
                    if agent.centralized:
                        memory.push(state, action, next_state, reward, reward)
                    else:
                        for i in range(N):
                            memory.push(state[i], action[i], next_state[i], reward[i], reward[i])
                # Learn
                if len(memory) >= BATCH_SIZE:
                    transitions = memory.sample(BATCH_SIZE)
                    batch = Transition(*zip(*transitions))
                    agent.optimize_model(batch, **{'B':BATCH_SIZE})
                elif len(memory) > 0:
                    transitions = memory.sample(len(memory))
                    batch = Transition(*zip(*transitions))
                    agent.optimize_model(batch, **{'B':len(memory)})
                loss_history[-1].append(agent.losses[:])
#                 print(e,t,agent.losses)
                agent.losses=[]
                # Also record scheduler history for learning rate. If the scheduler is a Plateau one, then
                # we can know from the learning rate if we're in a flatter area.
                # https://discuss.pytorch.org/t/how-to-retrieve-learning-rate-from-reducelronplateau-scheduler/54234/2
                # The scheduler requires the validation loss - can I just use the average training loss instead?
#                 try:
#                     agent.scheduler.step(np.mean(loss_history[-1]))
#                     lr_history.append(agent.optimizer.param_groups[0]['lr'])
#                 except:
#                     agent.schedulerC.step(np.mean(loss_history[-1]))
#                     lr_history.append(agent.optimizerC.param_groups[0]['lr'])
                try:
                    loss_historyA[-1].append(agent.lossesA[:])
                    agent.lossesA=[]
#                     agent.schedulerA.step(np.mean(loss_historyA[-1]))
#                     lr_historyA.append(agent.optimizerA.param_groups[0]['lr'])
                except:
                    pass
            elif update_mode == UPDATE_ON_POLICY:
                # This case would ditch sampling, and just update by the current thing.
                # Note that methods that use future cumulative reward would be highly incompatible with this...
                if not(agent.centralized) or reward_mode & FUTURE_REWARD_YES != 0:
                    print("Error: Update-on-policy might be incompatible with decentralized planning or cumulative reward")
                    return None
                if rvar == -1 and rmean == 0 and reward_mode & FUTURE_REWARD_NORMALIZE != 0:
                    rvar = np.abs(reward)
                    rmean = reward
                reward = (reward - rmean) / rvar
                
                batch = Transition(state, action, next_state, [[reward]], [[reward]])
                agent.optimize_model(batch, **{'B':1})
#                 batch = Transition(state, action, next_state, reward, reward)
# #                 transitions = [batch,batch]
# #                 agent.optimize_model(Transition(*zip(*transitions)), **{'B':2})
#                 transitions = [batch,batch]
#                 agent.optimize_model(batch, **{'B':1})
                loss_history[-1].append(agent.losses[:])
                agent.losses=[]
                try:
                    loss_historyA[-1].append(agent.lossesA[:])
                    agent.lossesA=[]
                except:
                    pass
                
            else:
                # Store and push them outside the loop
                state_pool.append(state)
                if torch.is_tensor(action):
                    action_pool.append(action.detach().numpy())
                else:
                    action_pool.append(action)
                reward_pool.append(reward)
                next_state_pool.append(next_state)
                    
            state = next_state
            steps += 1

            if debug:
                env.render()

            if debug and done:
                print("Took ", t, " steps to converge")
                break
        
        # Now outside the iteration loop - prepare for per-episode trainings
        if update_mode == UPDATE_ON_POLICY:
            pass
        elif update_mode == UPDATE_PER_EPISODE:
            inst_reward = torch.tensor(reward_pool)
            if reward_mode & FUTURE_REWARD_YES != 0:
                for j in range(len(reward_pool)): ### This previously iterated over "reward" by mistake, a bug that may have had effects
                    if j > 0:
                        reward_pool[-j-1] += gamma * reward_pool[-j]
            reward_pool = torch.tensor(reward_pool)
            if reward_mode & FUTURE_REWARD_NORMALIZE != 0:
                if rvar == -1 and rmean == 0:
                    rmean = reward_pool.mean()
                    rvar = reward_pool.std()
                    print("Updated mean and stdev: {0} and {1}".format(rmean.numpy(), rvar.numpy()))
                reward_pool = (reward_pool - rmean) / rvar
                inst_reward = (inst_reward - rmean) / rvar

            # Update: 0106 added option to only push the first few iterations into the memory.
            # if agent.centralized:
            # #             print(state_pool[0].shape, action_pool[0].shape)
            #     for j in range(len(reward_pool)):
            #         memory.push(state_pool[-j-1], action_pool[-j-1], 
            #                     next_state_pool[-j-1], reward_pool[-j-1], inst_reward[-j-1])
            # else:
            #     for j in range(len(reward_pool)):
            #         for i in range(N):
            #             memory.push(state_pool[-j-1][i], action_pool[-j-1][i], 
            #                         next_state_pool[-j-1][i], reward_pool[-j-1][i], inst_reward[-j-1][i])
            if agent.centralized:
                for j in range(iteration_cutoff):
                    print(j, len(reward_pool))
                    memory.push(state_pool[j], action_pool[j], 
                                next_state_pool[j], reward_pool[j], inst_reward[j])
            else:
                for j in range(iteration_cutoff):
                    for i in range(N):
                        memory.push(state_pool[j][i], action_pool[j][i], 
                                    next_state_pool[j][i], reward_pool[j][i], inst_reward[j][i])
            

        if update_mode == UPDATE_PER_EPISODE:
            if len(memory) >= BATCH_SIZE:
                transitions = memory.sample(BATCH_SIZE)
                batch = Transition(*zip(*transitions))
                agent.optimize_model(batch, **{'B':BATCH_SIZE})
            elif len(memory) > 0:
                transitions = memory.sample(len(memory))
                batch = Transition(*zip(*transitions))
                agent.optimize_model(batch, **{'B':len(memory)})
            loss_history[-1].append(agent.losses[:])
            agent.losses=[]
            # Also record scheduler history for learning rate. If the scheduler is a Plateau one, then
            # we can know from the learning rate if we're in a flatter area.
            # https://discuss.pytorch.org/t/how-to-retrieve-learning-rate-from-reducelronplateau-scheduler/54234/2
#             try:
#                 agent.scheduler.step(np.mean(loss_history[-1]))
#                 lr_history.append(agent.optimizer.param_groups[0]['lr'])
#             except:
#                 agent.schedulerC.step(np.mean(loss_history[-1]))
#                 lr_history.append(agent.optimizerC.param_groups[0]['lr'])
            try:
                loss_historyA[-1].append(agent.lossesA[:])
                agent.lossesA=[]
#                 agent.schedulerA.step(np.mean(loss_historyA[-1]))
#                 lr_historyA.append(agent.optimizerA.param_groups[0]['lr'])
            except:
                pass
        
        if debug:
            print("Episode ", e, " finished; t = ", t)
        
        if e % test_interval == 0:
            print("Test result at episode ", e, ": ")
            test_hist = test(agent, env, num_test, num_iteration, num_sample, action_space, 
                             seed=test_seeds[int(e/test_interval)], debug=debug, not_use_rand_in_action=not_use_rand_in_test)
            test_hists.append(test_hist)
        
        # Save demos of simulation if wanted
        if e % save_sim_intv == (save_sim_intv-1) and e > 0:
            try:
                fnames = [f+'_{0}'.format(e) for f in save_sim_fnames]
                plot_test(agent, env, fnames=fnames,
                    num_iteration=num_iteration, action_space=action_space, imdir=imdir,
                    debug=debug, useVid=useVid, not_use_rand=not_use_rand_in_test)
                for f in fnames:
                    os.system('ffmpeg -y -pattern_type glob -i "'+imdir+f+'*.jpg" '+f+'.gif')
            except:
                print("Failed to save simulation at e={0}".format(e))
            if save_intm_models and len(save_sim_fnames) > 0:
                agent.save_model(save_sim_fnames[0]+'_{0}'.format(e))
    if return_memory:
        return test_hists, memory
    else:
        return test_hists
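
When reward_mode has FUTURE_REWARD_YES set, the backward loop above turns per-step rewards into discounted returns in a single pass. A standalone illustration of that pass, assuming gamma = 0.99:

gamma = 0.99
rewards = [1.0, 0.0, 2.0]
returns = rewards[:]                 # copy, oldest step first
for j in range(1, len(returns)):
    # walk backwards: each step adds the discounted return of the step after it
    returns[-j - 1] += gamma * returns[-j]
print(returns)  # [2.9602, 1.98, 2.0] == [r0 + g*r1 + g**2*r2, r1 + g*r2, r2]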
Example #9
class Agent(object):
    def __init__(self,
                 state_space,
                 n_actions,
                 replay_buffer_size=50000,
                 batch_size=32,
                 hidden_size=12,
                 gamma=0.98):
        self.n_actions = n_actions
        self.state_space_dim = state_space
        self.policy_net = DQN(state_space, n_actions, hidden_size)
        self.target_net = DQN(state_space, n_actions, hidden_size)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=1e-3)
        self.memory = ReplayMemory(replay_buffer_size)
        self.batch_size = batch_size
        self.gamma = gamma

    def update_network(self, updates=1):
        for _ in range(updates):
            self._do_network_update()

    def _do_network_update(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = ~torch.tensor(batch.done, dtype=torch.bool)
        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal > 0
        ]
        non_final_next_states = torch.stack(non_final_next_states)
        state_batch = torch.stack(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.batch_size)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        # Compute the expected Q values: r + gamma * max_a Q_target(s', a)
        expected_state_action_values = reward_batch + self.gamma * next_state_values

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values.squeeze(),
                                expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1e-1, 1e-1)
        self.optimizer.step()

    def get_action(self, state, epsilon=0.05):
        sample = random.random()
        if sample > epsilon:
            with torch.no_grad():
                state = torch.from_numpy(state).float()
                q_values = self.policy_net(state)
                return torch.argmax(q_values).item()
        else:
            return random.randrange(self.n_actions)

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def store_transition(self, state, action, next_state, reward, done):
        action = torch.Tensor([[action]]).long()
        reward = torch.tensor([reward], dtype=torch.float32)
        next_state = torch.from_numpy(next_state).float()
        state = torch.from_numpy(state).float()
        self.memory.push(state, action, next_state, reward, done)
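
A hedged usage sketch wiring the Agent above to an environment. CartPole-v1 and the episode loop are illustrative assumptions, and the old gym step API returning four values is assumed, as in the snippets above.

import gym

env = gym.make('CartPole-v1')
agent = Agent(state_space=env.observation_space.shape[0],
              n_actions=env.action_space.n)

for episode in range(10):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state, epsilon=0.1)
        next_state, reward, done, _ = env.step(action)
        agent.store_transition(state, action, next_state, reward, done)
        agent.update_network()        # one gradient step per environment step
        state = next_state
    agent.update_target_network()     # sync the target network after each episode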
Example #10
class DDPG_Agent:
    def __init__(self, ob_sp, act_sp, alow, ahigh, writer, args):
        self.args = args
        self.alow = alow
        self.ahigh = ahigh
        self.policy = Policy_net(ob_sp, act_sp)
        self.policy_targ = Policy_net(ob_sp, act_sp)
        self.qnet = Q_net(ob_sp, act_sp)
        self.qnet_targ = Q_net(ob_sp, act_sp)

        self.policy.to(device)
        self.qnet.to(device)
        self.policy_targ.to(device)
        self.qnet_targ.to(device)
        self.MSE_loss = nn.MSELoss()
        self.noise = OUNoise(1, 1)

        hard_update(self.policy_targ, self.policy)
        hard_update(self.qnet_targ, self.qnet)

        self.p_optimizer = optim.Adam(self.policy.parameters(), lr=LR)
        self.q_optimizer = optim.Adam(self.qnet.parameters(), lr=LR)
        self.memory = ReplayMemory(int(1e6))
        self.epsilon_scheduler = LinearSchedule(E_GREEDY_STEPS,
                                                FINAL_STD,
                                                INITIAL_STD,
                                                warmup_steps=WARMUP_STEPS)
        self.n_steps = 0
        self.n_updates = 0
        self.writer = writer

    def get_action(self, state):
        if self.args.use_ounoise:
            noise = self.noise.sample()[0]
        else:
            noise = np.random.normal(
                0, self.epsilon_scheduler.value(self.n_steps))
        st = torch.from_numpy(state).view(1, -1).float()
        action = self.policy(st)
        action_with_noise = np.clip(action.item() + noise, self.alow,
                                    self.ahigh)
        if self.args.use_writer:
            self.writer.add_scalar("action mean", action.item(), self.n_steps)
            self.writer.add_scalar("action noise", noise, self.n_steps)
            self.writer.add_scalar("epsilon",
                                   self.epsilon_scheduler.value(self.n_steps),
                                   self.n_steps)
            self.writer.add_scalar("action", action_with_noise, self.n_steps)
        self.n_steps += 1
        return action_with_noise

    def store_transition(self, state, action, reward, next_state, done):

        self.memory.push(torch.from_numpy(state), torch.tensor(action),
                         torch.tensor(reward), torch.from_numpy(next_state),
                         torch.tensor(done))

    def reset(self):
        self.noise.reset()

    def train(self):
        batch = self.memory.sample(min(BATCH_SIZE, len(self.memory)))
        b_dict = [torch.stack(elem) for elem in Transition(*zip(*batch))]
        states, actions, rewards, next_states, dones = \
            b_dict[0], b_dict[1].view(-1, 1), \
            b_dict[2].view(-1, 1).float().to(device), b_dict[3], \
            b_dict[4].view(-1, 1).float().to(device)

        #  CRITIC target: y = r + gamma * (1 - done) * Q'(s', π'(s'));  minimize MSE(Q(s, a), y)
        # inputs computation
        inputs_critic = self.qnet(states, actions)
        # targets
        with torch.no_grad():
            policy_acts = self.policy_targ(next_states)
            targ_values = self.qnet_targ(next_states, policy_acts)
        targets_critics = rewards + GAMMA * (1 - dones) * targ_values
        loss_critic = self.MSE_loss(inputs_critic, targets_critics)
        self.q_optimizer.zero_grad()
        loss_critic.backward()
        # nn.utils.clip_grad_norm_(self.qnet.parameters(), GRAD_CLIP)
        self.q_optimizer.step()

        # ACTOR objective: derivative of Q(s, π(s | ø)) with respect to ø
        actor_loss = -self.qnet(states, self.policy(states)).mean()
        self.p_optimizer.zero_grad()
        actor_loss.backward()
        # nn.utils.clip_grad_norm_(self.policy.parameters(), GRAD_CLIP)
        self.p_optimizer.step()
        soft_update(self.policy_targ, self.policy, TAU)
        soft_update(self.qnet_targ, self.qnet, TAU)
        if self.args.use_writer:
            self.writer.add_scalar("critic_loss", loss_critic.item(),
                                   self.n_updates)
            self.writer.add_scalar("actor_loss", actor_loss.item(),
                                   self.n_updates)
        self.n_updates += 1
class Agent(nn.Module):
    def __init__(self, q_models, target_model, hyperbolic, k, gamma,
                 model_params, replay_buffer_size, batch_size, inp_dim, lr):
        super(Agent, self).__init__()
        if hyperbolic:
            self.q_models = torch.nn.ModuleList(q_models)
            self.target_models = torch.nn.ModuleList(target_model)
        else:
            self.q_models = q_models
            self.target_models = target_model
        self.optimizer = optim.RMSprop(self.q_models.parameters(), lr=lr)
        self.hyperbolic = hyperbolic
        self.n_actions = model_params.act_space
        self.k = k
        self.gamma = gamma
        self.memory = ReplayMemory(replay_buffer_size)
        self.batch_size = batch_size
        self.inp_dim = inp_dim

    def update_network(self, updates=1):
        for _ in range(updates):
            self._do_network_update()

    @staticmethod
    def get_hyperbolic_train_coeffs(k, num_models):
        coeffs = []
        gamma_intervals = np.linspace(0, 1, num_models + 2)
        for i in range(1, num_models + 1):
            coeffs.append(((gamma_intervals[i + 1] - gamma_intervals[i]) *
                           (1 / k) * gamma_intervals[i]**((1 / k) - 1)))
        return torch.tensor(coeffs) / sum(coeffs)

    def get_action(self, state_batch, epsilon=0.05):
        # Epsilon-greedy: act randomly with probability epsilon, otherwise act
        # greedily on the (hyperbolically mixed) Q-values, as in Example #9.
        if random.random() < epsilon:
            return random.randrange(self.n_actions)
        with torch.no_grad():
            state_batch = torch.tensor(state_batch,
                                       dtype=torch.float32).view(
                                           -1, self.inp_dim)
            if self.hyperbolic:
                model_outputs = [mdl(state_batch) for mdl in self.q_models]
                coeff = self.get_hyperbolic_train_coeffs(
                    self.k, len(self.q_models))
                model_outputs = torch.cat(model_outputs, 1).reshape(
                    -1, len(self.q_models))
                model_outputs = (model_outputs * coeff).sum(dim=1)
                return torch.argmax(model_outputs).item()
            return torch.argmax(self.q_models(state_batch)).item()

    def get_state_act_vals(self, state_batch, action_batch=None):
        if self.hyperbolic:
            model_outputs = []
            for ind, mdl in enumerate(self.q_models):
                model_outputs.append(mdl(state_batch).gather(1, action_batch))
            model_outputs = torch.cat(model_outputs,
                                      1).reshape(-1, len(self.q_models))
            coeffs = self.get_hyperbolic_train_coeffs(self.k,
                                                      len(self.q_models))
            model_outputs = model_outputs * coeffs
            return model_outputs.sum(dim=1).reshape(-1, 1)
        else:
            model_output = self.q_models(state_batch).gather(1, action_batch)
            return model_output

    def get_max_next_state_vals(self, non_final_mask, non_final_next_states):
        if self.hyperbolic:
            target_outptus = []
            gammas = torch.tensor(np.linspace(0, 1,
                                              len(self.q_models) + 1),
                                  dtype=torch.float)[1:]
            for ind, mdl in enumerate(self.target_models):
                next_state_values = torch.zeros(self.batch_size)
                next_state_values[non_final_mask] = mdl(
                    non_final_next_states).max(1)[0].detach()
                target_outptus.append(next_state_values)
            target_outptus = torch.cat(target_outptus,
                                       0).reshape(-1, len(self.target_models))
            target_outptus = target_outptus * gammas
            return target_outptus

    def _do_network_update(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        non_final_mask = ~torch.tensor(batch.done, dtype=torch.bool)
        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal > 0
        ]
        non_final_next_states = torch.stack(non_final_next_states)
        state_batch = torch.stack(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.get_state_act_vals(state_batch,
                                                      action_batch)
        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        state_action_values = state_action_values.view(-1, 1).repeat(
            1, len(self.q_models))
        next_state_values = self.get_max_next_state_vals(
            non_final_mask, non_final_next_states)
        expected_state_action_values = next_state_values + reward_batch.view(
            -1, 1).repeat(1, len(self.q_models))
        loss = (state_action_values - expected_state_action_values)**2
        coefs = self.get_hyperbolic_train_coeffs(self.k, len(self.q_models))
        loss = torch.sum(loss * coefs)
        # loss = F.smooth_l1_loss(state_action_values.squeeze(),
        #                         expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_network(self):
        self.target_models.load_state_dict(self.q_models.state_dict())

    def store_transition(self, state, action, next_state, reward, done):
        action = torch.Tensor([[action]]).long()
        reward = torch.tensor([reward], dtype=torch.float32)
        next_state = torch.from_numpy(next_state).float()
        state = torch.from_numpy(state).float()
        self.memory.push(state, action, next_state, reward, done)
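
A quick standalone check of get_hyperbolic_train_coeffs above: the returned weights are normalized to sum to one before they are used to mix the per-head Q-values. The values k = 0.1 and 5 heads are assumed purely for illustration.

coeffs = Agent.get_hyperbolic_train_coeffs(k=0.1, num_models=5)
print(coeffs)        # five non-negative weights, one per Q-head
print(coeffs.sum())  # tensor(1.) up to floating-point error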
Example #12
class Agent:
    
    def __init__(self, args):

        # which Atari environment to load from the OpenAI Gym registry
        self.env_id = "PongNoFrameskip-v4"
        # create the environment
        self.env = Environment(self.env_id)

        # part of the q-value formula
        self.discount_factor = 0.99
        self.batch_size = 64
        # how often to update the network (backpropagation)
        self.update_frequency = 4
        # how often to synchronize with the target network
        self.target_network_update_freq = 1000

        # keeps track of the frames for training, and retrieves them in batches 
        self.agent_history_length = 4
        self.memory = ReplayMemory(capacity=10000, batch_size=self.batch_size)

        # two neural networks. One for main and one for target
        self.main_network = PongNetwork(num_actions=self.env.get_action_space_size(), agent_history_length=self.agent_history_length)
        self.target_network = PongNetwork(num_actions=self.env.get_action_space_size(), agent_history_length=self.agent_history_length)
        
        # adam optimizer. just a standard procedure
        self.optimizer = Adam(learning_rate=1e-4, epsilon=1e-6)
        # we start with a high exploration rate then slowly decrease it
        self.init_explr = 1.0
        self.final_explr = 0.1
        self.final_explr_frame = 1000000
        self.replay_start_size = 10000

        # Huber loss on the TD error
        self.loss = tf.keras.losses.Huber()
        # running mean of the training loss
        self.loss_metric = tf.keras.metrics.Mean(name="loss")
        # running mean of the predicted Q-values
        self.q_metric = tf.keras.metrics.Mean(name="Q_value")

        # maximum number of frames to train for; training usually stops before this
        self.training_frames = int(1e7)

        # path to save the checkpoints, logs and the weights
        self.checkpoint_path = "./checkpoints/" + args.run_name
        self.tensorboard_writer = tf.summary.create_file_writer(self.checkpoint_path + "/runs/")
        self.print_log_interval = 10
        self.save_weight_interval = 10
        self.env.reset()
           

     # calculate the network loss on the replay buffer (Q-learning)
    def update_main_q_network(self, state_batch, action_batch, reward_batch, next_state_batch, terminal_batch):
       
        with tf.GradientTape() as tape:
            ## THIS IS WHERE THE MAGIC HAPPENS!
            ## L = Huber(Q(s, a), r + discount_factor * max_a' Q_target(s', a'))
            next_state_q = self.target_network(next_state_batch)
            next_state_max_q = tf.math.reduce_max(next_state_q, axis=1)
            expected_q = reward_batch + self.discount_factor * next_state_max_q * (1.0 - tf.cast(terminal_batch, tf.float32))
            main_q = tf.reduce_sum(self.main_network(state_batch) * tf.one_hot(action_batch, self.env.get_action_space_size(), 1.0, 0.0), axis=1)
            loss = self.loss(tf.stop_gradient(expected_q), main_q)

        gradients = tape.gradient(loss, self.main_network.trainable_variables)
        clipped_gradients = [tf.clip_by_norm(grad, 10) for grad in gradients]
        self.optimizer.apply_gradients(zip(clipped_gradients, self.main_network.trainable_variables))

        self.loss_metric.update_state(loss)
        self.q_metric.update_state(main_q)

        return loss

    
     # calculate the network loss on the replay buffer (Double Q-learning)
    def update_main_dq_network(self, state_batch, action_batch, reward_batch, next_state_batch, terminal_batch):
        
        with tf.GradientTape() as tape:
            # THIS IS WHERE THE MAGIC HAPPENS!
            ## two Q estimates are combined: the online network picks the best next action, the target network evaluates it
            q_online = self.main_network(next_state_batch)  # Use q values from online network
            action_q_online = tf.math.argmax(q_online, axis=1)  # optimal actions from the q_online
            q_target = self.target_network(next_state_batch)  # q values from the target network
            ddqn_q = tf.reduce_sum(q_target * tf.one_hot(action_q_online, self.env.get_action_space_size(), 1.0, 0.0), axis=1)
            expected_q = reward_batch + self.discount_factor * ddqn_q * (1.0 - tf.cast(terminal_batch, tf.float32))  # Corresponds to equation (4) in ddqn paper
            main_q = tf.reduce_sum(self.main_network(state_batch) * tf.one_hot(action_batch, self.env.get_action_space_size(), 1.0, 0.0), axis=1)
            loss = self.loss(tf.stop_gradient(expected_q), main_q)

        gradients = tape.gradient(loss, self.main_network.trainable_variables)
        clipped_gradients = [tf.clip_by_norm(grad, 10) for grad in gradients]
        self.optimizer.apply_gradients(zip(clipped_gradients, self.main_network.trainable_variables))

        self.loss_metric.update_state(loss)
        self.q_metric.update_state(main_q)

        return loss



    # get the next action index based on the state (84,84,4) and exploration rate
    def get_action(self, state, exploration_rate):
        recent_state = tf.expand_dims(state, axis=0)
        if tf.random.uniform((), minval=0, maxval=1, dtype=tf.float32) < exploration_rate:
            action = tf.random.uniform((), minval=0, maxval=self.env.get_action_space_size(), dtype=tf.int32)
        else:
            q_value = self.main_network(tf.cast(recent_state, tf.float32))
            action = tf.cast(tf.squeeze(tf.math.argmax(q_value, axis=1)), dtype=tf.int32)
        return action
        
    
    # get the epsilon value for the current step. Similar to https://openai.com/blog/openai-baselines-dqn/
    def get_eps(self, current_step, terminal_eps=0.01, terminal_frame_factor=25):
    
        terminal_eps_frame = self.final_explr_frame * terminal_frame_factor

        if current_step < self.replay_start_size:
            eps = self.init_explr
        elif self.replay_start_size <= current_step and current_step < self.final_explr_frame:
            eps = (self.final_explr - self.init_explr) / (self.final_explr_frame - self.replay_start_size) * (current_step - self.replay_start_size) + self.init_explr
        elif self.final_explr_frame <= current_step and current_step < terminal_eps_frame:
            eps = (terminal_eps - self.final_explr) / (terminal_eps_frame - self.final_explr_frame) * (current_step - self.final_explr_frame) + self.final_explr
        else:
            eps = terminal_eps
        return eps
    
        
    # copy the main network weights into the target network to keep them synchronized
    def update_target_network(self):
        main_vars = self.main_network.trainable_variables
        target_vars = self.target_network.trainable_variables
        for main_var, target_var in zip(main_vars, target_vars):
            target_var.assign(main_var)

    def train(self, algorithm='q'):
    
        total_step = 0
        episode = 0
        latest_mean_score = -99.99
        latest_100_score = deque(maxlen=100)
        # somewhat arbitrary threshold: well-trained agents typically reach an average score of about 20 in this game
        max_reward = 20.0

        # train until the mean reward reaches 20
        while latest_mean_score < max_reward:
            
            # reset the variable for the upcoming episode
            state = self.env.reset()
            episode_step = 0
            episode_score = 0.0
            done = False


            while not done:
                # while the episode is not done, calculate the epsilon and get the next action
                eps = self.get_eps(tf.constant(total_step, tf.float32))
                action = self.get_action(tf.constant(state), tf.constant(eps, tf.float32))
            
                next_state, reward, done, info = self.env.step(action)
                episode_score += reward

                self.memory.push(state, action, reward, next_state, done)
                state = next_state

                # update the network
                if (total_step % self.update_frequency == 0) and (total_step > self.replay_start_size):
                    indices = self.memory.get_minibatch_indices()
                    state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.memory.generate_minibatch_samples(indices)
                    if algorithm == 'q':
                        self.update_main_q_network(state_batch, action_batch, reward_batch, next_state_batch, terminal_batch)
                    else:
                        self.update_main_dq_network(state_batch, action_batch, reward_batch, next_state_batch, terminal_batch)

                if (total_step % self.target_network_update_freq == 0) and (total_step > self.replay_start_size):
                    self.update_target_network()
                
                total_step += 1
                episode_step += 1

                if done:
                    latest_100_score.append(episode_score)
                    # update the stopping criterion every episode, not only on logging episodes
                    latest_mean_score = np.mean(latest_100_score)
                    self.write_summary(episode, latest_100_score, episode_score, total_step, eps)
                    episode += 1

                    if episode % self.print_log_interval == 0:
                        print("Episode: ", episode)
                        print("Latest 100 avg: {:.4f}".format(latest_mean_score))
                        print("Progress: {} / {} ( {:.2f} % )".format(
                            total_step, self.training_frames,
                            total_step / self.training_frames * 100))

                    if episode % self.save_weight_interval == 0:
                        print("Saving weights...")
                        self.main_network.save_weights(self.checkpoint_path + "/weights/episode_{}".format(episode))


    # write the summaries back to the tensorboard
    def write_summary(self, episode, latest_100_score, episode_score, total_step, eps):

        with self.tensorboard_writer.as_default():
            tf.summary.scalar("Reward", episode_score, step=episode)
            tf.summary.scalar("Latest 100 avg rewards", np.mean(latest_100_score), step=episode)
            tf.summary.scalar("Loss", self.loss_metric.result(), step=episode)
            tf.summary.scalar("Average Q", self.q_metric.result(), step=episode)
            tf.summary.scalar("Total Frames", total_step, step=episode)
            tf.summary.scalar("Epsilon", eps, step=episode)

        self.loss_metric.reset_states()
        self.q_metric.reset_states()
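
The epsilon schedule in get_eps above anneals linearly in two segments (1.0 to 0.1 over the first million frames after the replay warm-up, then 0.1 to 0.01 much more slowly). Below is a minimal standalone sketch of the same piecewise-linear schedule; the function name and the quick check at the end are additions for illustration, while the constants mirror the attributes set in __init__.

# standalone sketch of the piecewise-linear epsilon annealing used by get_eps()
def epsilon_schedule(step,
                     init_explr=1.0, final_explr=0.1, terminal_eps=0.01,
                     replay_start_size=10_000, final_explr_frame=1_000_000,
                     terminal_frame_factor=25):
    terminal_eps_frame = final_explr_frame * terminal_frame_factor
    if step < replay_start_size:
        return init_explr                      # pure exploration while the buffer fills
    if step < final_explr_frame:               # linear 1.0 -> 0.1
        slope = (final_explr - init_explr) / (final_explr_frame - replay_start_size)
        return init_explr + slope * (step - replay_start_size)
    if step < terminal_eps_frame:              # slower linear 0.1 -> 0.01
        slope = (terminal_eps - final_explr) / (terminal_eps_frame - final_explr_frame)
        return final_explr + slope * (step - final_explr_frame)
    return terminal_eps

# quick check at a few frame counts
for s in (0, 10_000, 500_000, 1_000_000, 25_000_000):
    print(s, round(epsilon_schedule(s), 4))
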
Example #13
class Agent:
    """Definition of the Agent that will interact with the environment.

    Attributes:
        REPLAY_MEM_SIZE (:obj:`int`): max capacity of Replay Memory

        BATCH_SIZE (:obj:`int`): Batch size. Default is 40 as specified in the paper.

        GAMMA (:obj:`float`): The discount, should be a constant between 0 and 1
            that ensures the sum converges. It also controls the importance of future
            expected reward.

        EPS_START(:obj:`float`): initial value for epsilon of the e-greedy action
            selection

        EPS_END(:obj:`float`): final value for epsilon of the e-greedy action
            selection

        LEARNING_RATE(:obj:`float`): learning rate of the optimizer
            (Adam)

        INPUT_DIM (:obj:`int`): input dimensionality without considering batch size.

        HIDDEN_DIM (:obj:`int`): hidden layer dimensionality (for Linear models only)

        ACTION_NUMBER (:obj:`int`): dimensionality of the output layer of the Q network

        TARGET_UPDATE (:obj:`int`): period of Q target network updates

        MODEL (:obj:`string`): type of the model.

        DOUBLE (:obj:`bool`): Type of Q function computation.
    """
    def __init__(self,
                 REPLAY_MEM_SIZE=10000,
                 BATCH_SIZE=40,
                 GAMMA=0.98,
                 EPS_START=1,
                 EPS_END=0.12,
                 EPS_STEPS=300,
                 LEARNING_RATE=0.001,
                 INPUT_DIM=24,
                 HIDDEN_DIM=120,
                 ACTION_NUMBER=3,
                 TARGET_UPDATE=10,
                 MODEL='ddqn',
                 DOUBLE=True):

        self.REPLAY_MEM_SIZE = REPLAY_MEM_SIZE
        self.BATCH_SIZE = BATCH_SIZE
        self.GAMMA = GAMMA
        self.EPS_START = EPS_START
        self.EPS_END = EPS_END
        self.EPS_STEPS = EPS_STEPS
        self.LEARNING_RATE = LEARNING_RATE
        self.INPUT_DIM = INPUT_DIM
        self.HIDDEN_DIM = HIDDEN_DIM
        self.ACTION_NUMBER = ACTION_NUMBER
        self.TARGET_UPDATE = TARGET_UPDATE
        self.MODEL = MODEL  # deep Q-network (dqn) or dueling deep Q-network (ddqn)
        self.DOUBLE = DOUBLE  # whether to use the 'Double' Q-learning update (regularization)
        self.TRAINING = True  # when False (testing), epsilon is fixed to EPS_END
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print("Agent is using device:\t" + str(self.device))
        '''elif self.MODEL == 'lin_ddqn':
            self.policy_net = DuelingDQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device)
            self.target_net = DuelingDQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device)
        elif self.MODEL == 'lin_dqn':
            self.policy_net = DQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device)
            self.target_net = DQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device)
        '''

        if self.MODEL == 'ddqn':
            self.policy_net = ConvDuelingDQN(
                self.INPUT_DIM, self.ACTION_NUMBER).to(self.device)
            self.target_net = ConvDuelingDQN(
                self.INPUT_DIM, self.ACTION_NUMBER).to(self.device)
        elif self.MODEL == 'dqn':
            self.policy_net = ConvDQN(self.INPUT_DIM,
                                      self.ACTION_NUMBER).to(self.device)
            self.target_net = ConvDQN(self.INPUT_DIM,
                                      self.ACTION_NUMBER).to(self.device)

        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.LEARNING_RATE)
        self.memory = ReplayMemory(self.REPLAY_MEM_SIZE)
        self.steps_done = 0
        self.training_cumulative_reward = []

    def select_action(self, state):
        """ the epsilon-greedy action selection"""
        state = state.unsqueeze(0).unsqueeze(1)
        sample = random.random()
        if self.TRAINING:
            if self.steps_done > self.EPS_STEPS:
                eps_threshold = self.EPS_END
            else:
                eps_threshold = self.EPS_START
        else:
            eps_threshold = self.EPS_END

        self.steps_done += 1
        # [Exploitation] pick the best action according to current Q approx.
        if sample > eps_threshold:
            with torch.no_grad():
                # Return the number of the action with highest non normalized probability
                # TODO: decide if diverge from paper and normalize probabilities with
                # softmax or at least compare the architectures
                return torch.tensor([self.policy_net(state).argmax()],
                                    device=self.device,
                                    dtype=torch.long)

        # [Exploration]  pick a random action from the action space
        else:
            return torch.tensor([random.randrange(self.ACTION_NUMBER)],
                                device=self.device,
                                dtype=torch.long)

    def optimize_model(self):
        if len(self.memory) < self.BATCH_SIZE:
            # return without doing anything if there is not enough data to sample
            return
        transitions = self.memory.sample(self.BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        # Transition is the named tuple defined above.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        #
        # non_final_mask is a boolean vector marking which sampled next states are non-final
        # non_final_next_states contains all the non-final states sampled
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        nfns = [s for s in batch.next_state if s is not None]
        non_final_next_states = torch.cat(nfns).view(len(nfns), -1)
        non_final_next_states = non_final_next_states.unsqueeze(1)

        state_batch = torch.cat(batch.state).view(self.BATCH_SIZE, -1)
        state_batch = state_batch.unsqueeze(1)
        action_batch = torch.cat(batch.action).view(self.BATCH_SIZE, -1)
        reward_batch = torch.cat(batch.reward).view(self.BATCH_SIZE, -1)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        # detach removes the tensor from the graph -> no gradient computation is
        # required
        next_state_values = torch.zeros(self.BATCH_SIZE, device=self.device)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()
        next_state_values = next_state_values.view(self.BATCH_SIZE, -1)

        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        self.GAMMA) + reward_batch
        # print("expected_state_action_values.shape:\t%s"%str(expected_state_action_values.shape))

        # Compute MSE loss
        loss = F.mse_loss(state_action_values, expected_state_action_values
                          )  # expected_state_action_values.unsqueeze(1)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def optimize_double_dqn_model(self):
        if len(self.memory) < self.BATCH_SIZE:
            # return without doing anything if there is not enough data to sample
            return
        transitions = self.memory.sample(self.BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        # Transition is the named tuple defined above.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        #
        # non_final_mask is a boolean vector marking which sampled next states are non-final
        # non_final_next_states contains all the non-final states sampled
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        nfns = [s for s in batch.next_state if s is not None]
        non_final_next_states = torch.cat(nfns).view(len(nfns), -1)
        non_final_next_states = non_final_next_states.unsqueeze(1)

        state_batch = torch.cat(batch.state).view(self.BATCH_SIZE, -1)
        state_batch = state_batch.unsqueeze(1)
        action_batch = torch.cat(batch.action).view(self.BATCH_SIZE, -1)
        reward_batch = torch.cat(batch.reward).view(self.BATCH_SIZE, -1)
        # print("state_batch shape: %s\nstate_batch[0]:%s\nactionbatch shape: %s\nreward_batch shape: %s"%(str(state_batch.view(40,-1).shape),str(state_batch.view(40,-1)[0]),str(action_batch.shape),str(reward_batch.shape)))

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # ---------- D-DQN Extra Line---------------
        # Double DQN: the online network selects the greedy action for the *next* state
        _, next_state_action = self.policy_net(non_final_next_states).max(
            1, keepdim=True)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the actions given by policynet.
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        # detach removes the tensor from the graph -> no gradient computation is
        # required
        next_state_values = torch.zeros(self.BATCH_SIZE,
                                        device=self.device).view(
                                            self.BATCH_SIZE, -1)

        out = self.target_net(non_final_next_states)
        next_state_values[non_final_mask] = out.gather(1, next_state_action)
        # next_state_values = next_state_values.view(self.BATCH_SIZE, -1)
        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        self.GAMMA) + reward_batch

        # Compute MSE loss
        loss = F.mse_loss(state_action_values, expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def train(self, env, path, num_episodes=40):
        self.TRAINING = True
        cumulative_reward = [0 for t in range(num_episodes)]
        print("Training:")
        for i_episode in tqdm(range(num_episodes)):
            # Initialize the environment and state
            env.reset(
            )  # reset the env so that it starts at the beginning of the time series
            self.steps_done = 0
            state = env.get_state()
            for t in range(len(env.data)):  # while not env.done

                # Select and perform an action
                action = self.select_action(state)
                reward, done, _ = env.step(action)

                cumulative_reward[i_episode] += reward.item()

                # Observe new state: it will be None if env.done = True. It is the next
                # state since env.step() has been called two rows above.
                next_state = env.get_state()

                # Store the transition in memory
                self.memory.push(state, action, next_state, reward)

                # Move to the next state
                state = next_state

                # Perform one step of the optimization (on the policy network): note that
                # it will return without doing anything if there is not enough data to sample

                if self.DOUBLE:
                    self.optimize_double_dqn_model()
                else:
                    self.optimize_model()

                if done:
                    break

            # Update the target network, copying all weights and biases of policy_net
            if i_episode % self.TARGET_UPDATE == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())

        # save the model
        if self.DOUBLE:
            model_name = env.reward_f + '_reward_double_' + self.MODEL + '_model'
            count = 0
            while os.path.exists(path +
                                 model_name):  # avoid overwriting existing models
                count += 1
                model_name = model_name + "_" + str(count)

        else:
            model_name = env.reward_f + '_reward_' + self.MODEL + '_model'
            count = 0
            while os.path.exists(path +
                                 model_name):  # avoid overwriting existing models
                count += 1
                model_name = model_name + "_" + str(count)

        torch.save(self.policy_net.state_dict(), path + model_name)

        return cumulative_reward

    def test(self, env_test, model_name=None, path=None):
        self.TRAINING = False
        cumulative_reward = [0 for t in range(len(env_test.data))]
        reward_list = [0 for t in range(len(env_test.data))]

        if model_name is None:
            pass
        elif path is not None:
            if re.match(".*_dqn_.*", model_name):
                self.policy_net = ConvDQN(self.INPUT_DIM,
                                          self.ACTION_NUMBER).to(self.device)
                if str(self.device) == "cuda":
                    self.policy_net.load_state_dict(
                        torch.load(path + model_name))
                else:
                    self.policy_net.load_state_dict(
                        torch.load(path + model_name,
                                   map_location=torch.device('cpu')))
            elif re.match(".*_ddqn_.*", model_name):
                self.policy_net = ConvDuelingDQN(
                    self.INPUT_DIM, self.ACTION_NUMBER).to(self.device)
                if str(self.device) == "cuda":
                    self.policy_net.load_state_dict(
                        torch.load(path + model_name))
                else:
                    self.policy_net.load_state_dict(
                        torch.load(path + model_name,
                                   map_location=torch.device('cpu')))
            else:
                raise RuntimeError(
                    "Please Provide a valid model name or valid path.")
        else:
            raise RuntimeError(
                'Path can not be None if model Name is not None.')

        env_test.reset(
        )  # reset the env so that it starts at the beginning of the time series
        state = env_test.get_state()
        for t in tqdm(range(len(env_test.data))):  # while not env.done

            # Select and perform an action
            action = self.select_action(state)

            reward, done, _ = env_test.step(action)

            cumulative_reward[t] += reward.item(
            ) + cumulative_reward[t - 1 if t - 1 > 0 else 0]
            reward_list[t] = reward

            # Observe new state: it will be None if env.done = True. It is the next
            # state since env.step() has been called two rows above.
            next_state = env_test.get_state()

            # Move to the next state
            state = next_state

            if done:
                break

        return cumulative_reward, reward_list
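
The Double-DQN branch above (optimize_double_dqn_model) decouples action selection from action evaluation: the policy network picks the greedy action for the next state and the target network scores it. The following is a minimal self-contained sketch of just that target computation on random tensors; the toy linear networks, batch size, and gamma are made up for the demo and are not part of the agent above.

import torch
import torch.nn as nn

# toy stand-ins for policy_net / target_net (4 state features, 3 actions)
policy_net = nn.Linear(4, 3)
target_net = nn.Linear(4, 3)
target_net.load_state_dict(policy_net.state_dict())

batch_size, gamma = 5, 0.98
next_states = torch.randn(batch_size, 4)
rewards = torch.randn(batch_size, 1)
done = torch.tensor([0., 0., 1., 0., 1.]).view(-1, 1)

with torch.no_grad():
    # 1) action selection with the online (policy) network
    next_actions = policy_net(next_states).argmax(dim=1, keepdim=True)
    # 2) action evaluation with the target network
    next_q = target_net(next_states).gather(1, next_actions)
    # terminal transitions do not bootstrap
    target = rewards + gamma * next_q * (1.0 - done)

print(target.shape)  # torch.Size([5, 1])
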
class Agent(object):
    def __init__(self,
                 num_actions,
                 gamma=0.98,
                 memory_size=5000,
                 batch_size=32):
        self.scaler = None
        self.featurizer = None
        self.q_functions = None
        self.gamma = gamma
        self.batch_size = batch_size
        self.num_actions = num_actions
        self.memory = ReplayMemory(memory_size)
        self.initialize_model()

    def initialize_model(self):
        # Draw some samples from the observation range and initialize the scaler
        obs_limit = np.array([4.8, 5, 0.5, 5])
        samples = np.random.uniform(-obs_limit, obs_limit,
                                    (1000, obs_limit.shape[0]))
        self.scaler = StandardScaler()
        self.scaler.fit(samples)

        # Initialize the RBF featurizer
        self.featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=80)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=50)),
        ])
        self.featurizer.fit(self.scaler.transform(samples))

        # Create a value approximator for each action
        self.q_functions = [
            SGDRegressor(learning_rate="constant", max_iter=500, tol=1e-3)
            for _ in range(self.num_actions)
        ]

        # Initialize it to whatever values; implementation detail
        for q_a in self.q_functions:
            q_a.partial_fit(self.featurize(samples),
                            np.zeros((samples.shape[0], )))

    def featurize(self, state):
        """ Test two different features for state representations
        """
        if len(state.shape) == 1:
            state = state.reshape(1, -1)
        # Task 1a: TODO: Use (s, abs(s)) as features # handcrafted feature vector: s = [1, -2, 3, -4], then (s, abs(s)) = [1, -2, 3, -4, 1, 2, 3, 4] (see slack discussion)
        #return np.concatenate((state, abs(state)), axis=1)
        # Task 1b: RBF features # radial basis function representations
        return self.featurizer.transform(self.scaler.transform(state))

    def get_action(self, state, epsilon=0.0):
        if np.random.random() < epsilon:
            a = int(np.random.random() * self.num_actions)
            return a
        else:
            featurized = self.featurize(state)
            qs = [q.predict(featurized)[0] for q in self.q_functions]
            qs = np.array(qs)
            a = np.argmax(qs, axis=0)
            return a

    def single_update(self, state, action, next_state, reward, done):
        # Calculate feature representations of the current state and the next state
        # Task 1: TODO: Set the feature state and feature next state
        featurized_state = self.featurize(state)
        featurized_next_state = self.featurize(next_state)

        # Task 1:  TODO Get Q(s', a) for the next state
        predictions = []
        for q_func in self.q_functions:  # one function approximator for each of the two actions
            predictions.append(
                q_func.predict(featurized_next_state)
            )  # calculate prediction for every function approximator q_function
        next_qs = np.max(predictions)  # choose the highest predicted value

        # Calculate the updated target Q- values
        # Task 1: TODO: Calculate target based on rewards and next_qs
        if done:  # terminal state
            target = [reward + self.gamma * 0]
        else:  # not terminal state
            target = [reward + self.gamma * next_qs]

        # Update Q-value estimation
        self.q_functions[action].partial_fit(
            featurized_state,
            target)  # partial_fit() for mini-batch learning (see sklearn docs)

    def update_estimator(self):
        if len(self.memory) < self.batch_size:
            # Use the whole memory
            samples = self.memory.memory
        else:
            # Sample some data
            samples = self.memory.sample(
                self.batch_size
            )  # return random sample; length=32 # print("", )

        # Task 2: TODO: Reformat data in the minibatch
        states = np.array(
            [sample.state for sample in samples]
        )  # all the states from the sampled minibatch
        action = np.array([
            sample.action for sample in samples
        ])  # array with batch_size elements
        next_states = np.array([sample.next_state for sample in samples])
        rewards = np.array([sample.reward for sample in samples])
        dones = np.array([sample.done for sample in samples])

        # Task 2: TODO: Calculate Q(s', a)
        featurized_next_states = self.featurize(next_states)
        # we need to do the same for next_qs as in single_update but for every sample in the batch
        next_qs = []  # 32x1 (#samples x #functions)
        for s in featurized_next_states:
            arr = np.array([q.predict([s]) for q in self.q_functions])
            next_qs.append(np.max(arr))
        next_qs = np.array(next_qs)

        # Calculate the updated target values
        # Task 2: TODO: Calculate target based on rewards and next_qs
        targets = rewards + self.gamma * next_qs * (1 - dones)

        # Calculate featurized states
        featurized_states = self.featurize(states)

        # Get new weights for each action separately
        for a in range(self.num_actions):
            # Find states where a was taken
            idx = action == a

            # If a is not present in the batch, skip it and move to the next action
            if np.any(idx):
                act_states = featurized_states[idx]
                act_targets = targets[idx]

                # Perform a single SGD step on the Q-function params
                self.q_functions[a].partial_fit(act_states, act_targets)

    def store_transition(self, *args):
        self.memory.push(*args)
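
A possible way to drive the RBF-featurized agent above is a plain episode loop that stores each transition and then calls update_estimator. The sketch below is hedged: the environment name (the observation limits in initialize_model suggest CartPole), the fixed epsilon, the episode count, the older 4-tuple gym step API, and the assumption that ReplayMemory's Transition stores (state, action, next_state, reward, done) in that order are all illustrative choices, not taken from the snippet itself.

import gym

# hypothetical driver loop for the RBF Agent defined above
env = gym.make("CartPole-v1")
agent = Agent(num_actions=env.action_space.n)

for episode in range(10):
    state, ep_return, done = env.reset(), 0.0, False
    while not done:
        action = agent.get_action(state, epsilon=0.2)
        next_state, reward, done, _ = env.step(action)
        # assumed Transition field order: (state, action, next_state, reward, done)
        agent.store_transition(state, action, next_state, reward, done)
        agent.update_estimator()          # mini-batch update on the replay memory
        state, ep_return = next_state, ep_return + reward
    print("episode", episode, "return", ep_return)
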
class DQNagent:

    def __init__(self, mem_size, epsilon, mini_batch_size, learning_rate, gamma):

        self.epsilon = epsilon
        self.mini_batch_size = mini_batch_size
        self.gamma = gamma

        self.update_counter = 0

        self.net = nn.Sequential(
            nn.Linear(2, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 3)
        ).float()

        self.net_target = copy.deepcopy(self.net)

        self.net = self.net.cuda()
        self.net_target = self.net_target.cuda()

        # self.net_target = nn.Sequential(
        #     nn.Linear(2, 128),
        #     nn.ReLU(),
        #     nn.Linear(128, 128),
        #     nn.ReLU(),
        #     nn.Linear(128, 3)
        # ).float()

        self.replay_memory = ReplayMemory(max_size=mem_size)

        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
    
    def get_action(self, obs, mode='e-greedy'):
        if mode == 'random':
            action = random.choice([0, 1, 2])
        elif mode == 'greedy':
            obs = torch.tensor(obs, dtype=torch.float).cuda()
            with torch.no_grad():
                action = torch.argmax(self.net(obs)).cpu().numpy().tolist()
        elif mode == 'e-greedy':
            action = random.choice([0, 1, 2])
            if random.random() >= self.epsilon:
                obs = torch.tensor(obs, dtype=torch.float).cuda()
                with torch.no_grad():
                    action = torch.argmax(self.net(obs)).cpu().numpy().tolist()
        # if not explore and random.random() >= self.epsilon:
        #     obs = torch.tensor(obs, dtype=torch.float).cuda()
        #     with torch.no_grad():
        #         action = torch.argmax(self.net(obs)).cpu().numpy().tolist()
        
        assert type(action) == int
        return action
    
    def store_transition(self, obs, action, reward, new_obs, done):
        self.replay_memory.push(obs, action, reward, new_obs, done)
    
    def update(self):
        
        if len(self.replay_memory) < self.mini_batch_size:
            return

        obs_batch, action_batch, reward_batch, new_obs_batch, done_batch = self.replay_memory.sample(self.mini_batch_size)

        new_obs_batch = torch.tensor(new_obs_batch, dtype=torch.float).cuda()
        # print(new_obs_batch.shape)
        # time.sleep(5)
        with torch.no_grad():
            target_batch = torch.tensor(reward_batch, dtype=torch.float).cuda()
            # print(target_batch.shape)
            # time.sleep(5)
            vals_new_obs = torch.max(self.net_target(new_obs_batch), dim=1)[0]
            # print(vals_new_obs.shape)
            # time.sleep(5)
            for i in range(self.mini_batch_size):
                if not done_batch[i]:
                    target_batch[i] += self.gamma * vals_new_obs[i]
            # target_batch = target_batch + self.gamma * vals_new_obs
        
        obs_batch = torch.tensor(obs_batch, dtype=torch.float).cuda()
        pred_batch = self.net(obs_batch)
        # print(pred_batch[:5])
        # print(pred_batch.size(0))
        # print(action_batch)
        # pred_batch_ = pred_batch[torch.arange(pred_batch.size(0)), action_batch]
        action_batch = torch.tensor(action_batch, dtype=torch.long).cuda()
        # print(action_batch[:5])
        pred_batch_ = pred_batch.gather(1, action_batch.unsqueeze(1)).squeeze(1)
        # print(pred_batch_[:5])
        # time.sleep(5)

        loss = self.criterion(pred_batch_, target_batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.update_counter += 1
        if self.update_counter % 20 == 0:
            self.update_counter = 0
            for target_param, param in zip(self.net_target.parameters(), self.net.parameters()):
                target_param.data.copy_(param)
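
The 2-dimensional input and 3 discrete actions of the network above match an environment such as MountainCar-v0; the loop below is a hedged usage sketch (the environment choice, hyperparameters, episode count, and the older 4-tuple gym step API are assumptions), showing how store_transition and update are interleaved. Note that the class as written moves its networks to CUDA, so a GPU is required.

import gym

# hypothetical training loop for the DQNagent defined above
env = gym.make("MountainCar-v0")
agent = DQNagent(mem_size=50_000, epsilon=0.1, mini_batch_size=64,
                 learning_rate=1e-3, gamma=0.99)

for episode in range(5):
    obs, done, ep_return = env.reset(), False, 0.0
    while not done:
        action = agent.get_action(obs, mode='e-greedy')
        new_obs, reward, done, _ = env.step(action)
        agent.store_transition(obs, action, reward, new_obs, done)
        agent.update()                     # one gradient step + periodic target sync
        obs, ep_return = new_obs, ep_return + reward
    print("episode", episode, "return", ep_return)
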
Example #16
class Agent(object):
    def __init__(self,
                 env_name,
                 state_space,
                 n_actions,
                 replay_buffer_size=500000,
                 batch_size=32,
                 hidden_size=64,
                 gamma=0.99):
        self.env_name = env_name
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.train_device = device
        self.n_actions = n_actions
        self.state_space_dim = state_space
        if "CartPole" in self.env_name:
            self.policy_net = CartpoleDQN(state_space, n_actions, 4)
            self.target_net = CartpoleDQN(state_space, n_actions, 4)
            self.target_net.load_state_dict(self.policy_net.state_dict())
            self.target_net.eval()
            self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1e-4)
        elif "WimblepongVisualSimpleAI-v0" in self.env_name:
            self.policy_net = Policy(state_space, n_actions, 4)
            self.target_net = Policy(state_space, n_actions, 4)
            self.target_net.load_state_dict(self.policy_net.state_dict())
            self.target_net.eval()
            self.optimizer = optim.Adam(self.policy_net.parameters(), lr=5e-4)
        else:
            raise ValueError(
                "Wrong environment. An agent has not been specified for %s" %
                env_name)
        self.memory = ReplayMemory(replay_buffer_size)
        self.batch_size = batch_size
        self.gamma = gamma

    def update_network(self, updates=1):
        for _ in range(updates):
            self._do_network_update()

    def _do_network_update(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = 1 - torch.tensor(batch.done, dtype=torch.uint8).to(
            self.train_device)
        non_final_mask = non_final_mask.type(torch.bool)
        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal > 0
        ]
        non_final_next_states = torch.stack(non_final_next_states).to(
            self.train_device)
        state_batch = torch.stack(batch.state).to(self.train_device)
        action_batch = torch.cat(batch.action).to(self.train_device)
        reward_batch = torch.cat(batch.reward).to(self.train_device)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch).to(self.train_device)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.batch_size).to(self.train_device)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        # Task 4: TODO: Compute the expected Q values
        expected_state_action_values = reward_batch + self.gamma * next_state_values

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values.squeeze(),
                                expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1e-1, 1e-1)
        self.optimizer.step()

    def get_action(self, state, epsilon=0.05):
        #print('initial get action',state.shape)

        #print('final get action',state.shape)
        sample = random.random()
        if sample > epsilon:
            with torch.no_grad():
                #print('a',state)
                state = torch.from_numpy(state)
                #print('b',state)
                state = state.unsqueeze(0)
                q_values = self.policy_net(state)
                return torch.argmax(q_values).item()
        else:
            return random.randrange(3)

    def preprocessing(self, observation):
        """ Preprocess the received information: 1) Grayscaling 2) Reducing quality (resizing)
        Params:
            observation: image of pong
        """
        # Grayscaling
        #img_gray = rgb2gray(observation)
        img_gray = np.dot(observation,
                          [0.2989, 0.5870, 0.1140]).astype(np.uint8)

        # Normalize pixel values
        img_norm = img_gray / 255.0

        # Downsampling: we receive squared image (e.g. 200x200) and downsample by x2.5 to (80x80)
        img_resized = cv2.resize(img_norm, dsize=(80, 80))
        #img_resized = img_norm[::2.5,::2.5]
        return img_resized

    def stack_images(self, observation, img_collection, timestep):
        """ Stack up to four frames together
        """
        # image preprocessing
        img_preprocessed = self.preprocessing(observation)

        if (timestep == 0):  # start of new episode
            # img_collection get filled with zeros again
            img_collection = deque(
                [np.zeros((80, 80), dtype=np.float32) for i in range(4)], maxlen=4)
            # fill img_collection 4x with the first frame
            img_collection.append(img_preprocessed)
            img_collection.append(img_preprocessed)
            img_collection.append(img_preprocessed)
            img_collection.append(img_preprocessed)
            # Stack the images in img_collection
            img_stacked = np.stack(img_collection, axis=2)
        else:
            # Delete first/oldest entry and append new image
            #img_collection.pop(0)
            img_collection.append(img_preprocessed)

            # Stack the images in img_collection
            img_stacked = np.stack(img_collection,
                                   axis=2)  # TODO: right axis??

        return img_stacked, img_collection

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def store_transition(self, state, action, next_state, reward, done):
        action = torch.Tensor([[action]]).long().to(self.train_device)
        reward = torch.tensor([reward],
                              dtype=torch.float32).to(self.train_device)
        next_state = torch.from_numpy(next_state).float().to(self.train_device)
        state = torch.from_numpy(state).float().to(self.train_device)
        self.memory.push(state, action, next_state, reward, done)

    def load_model(self):
        #load_path = '/home/isaac/codes/autonomous_driving/highway-env/data/2020_09_03/Intersection_egoattention_dqn_ego_attention_1_22:00:25/models'
        #policy.load_state_dict(torch.load("./model50000ep_WimblepongVisualSimpleAI-v0_0.mdl"))
        """ Load already created model
        return:
            none
        """
        weights = torch.load("FROM2100v2WimblepongVisualSimpleAI-v0_1900.mdl",
                             map_location=self.train_device)
        self.policy_net.load_state_dict(weights, strict=False)

    def get_name(self):
        """ Interface function to retrieve the agents name
        """
        return self.name

    def reset(self):
        """ Resets the agent’s state after an episode is finished
Example #17
def train(eps_decay, gamma, lr, network, seed=131):
    id = 'LunarLander-v2'
    env = gym.make(id).unwrapped
    n_actions = env.action_space.n
    n_states = env.observation_space.shape[0]
    # set seed
    random.seed(seed)
    env.seed(seed)

    # initiate the network
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if network not in NETWORK.keys():
        raise ValueError('Network key does not exist!')

    fc1_unit, fc2_unit = NETWORK.get(network)
    policy_net = DQN(state_size=n_states,
                     action_size=n_actions,
                     fc1_unit=fc1_unit,
                     fc2_unit=fc2_unit,
                     seed=131).to(device)
    target_net = DQN(state_size=n_states,
                     action_size=n_actions,
                     fc1_unit=fc1_unit,
                     fc2_unit=fc2_unit,
                     seed=1).to(device)
    target_net.load_state_dict(policy_net.state_dict())

    # initiate the memory replayer and optimizer
    memory = ReplayMemory(MEMORY_CAPACITY)
    # optimizer = optim.RMSprop(policy_net.parameters())
    optimizer = optim.Adam(policy_net.parameters(), lr=lr)

    # initiate the global steps
    steps_done = 0
    # start the training loop and track per-episode rewards
    rewards = []
    for i_episode in range(N_EPISODES):
        cumulative_reward = 0
        state = env.reset()
        state = torch.tensor([state])
        for t in count():
            if t > N_STEPS_TIMEOUT:
                break
            action, steps_done = select_action(state=state,
                                               policy_net=policy_net,
                                               n_actions=n_actions,
                                               steps_done=steps_done,
                                               device=device,
                                               eps_end=EPS_END,
                                               eps_start=EPS_START,
                                               eps_decay=eps_decay)

            state_next, reward, done, _ = env.step(action.item())
            # env.render()
            cumulative_reward = cumulative_reward + reward
            # convert it to tensor
            state_next = torch.tensor([state_next], device=device)
            reward = torch.tensor([reward], device=device, dtype=torch.float32)
            memory.push(state, action, state_next, reward)
            state = state_next

            # every step update the weights in the policy net
            optimize_model(memory=memory,
                           batch_size=BATCH_SIZE,
                           device=device,
                           policy_net=policy_net,
                           target_net=target_net,
                           optimizer=optimizer,
                           gamma=gamma)

            if done:
                break

        rewards.append(cumulative_reward)

        # update the target net after a while
        if i_episode % TARGET_UPDATE == 0:
            # If you want to soft-update the weights instead:
            #         soft_update(local_model=policy_net, target_model=target_net, tau=TAU)
            target_net.load_state_dict(policy_net.state_dict())

        if np.min(rewards[-5:]) >= 200:
            break

    # save the rewards
    rewards_path = 'training_rewards_{lr}_{eps_decay}_{gamma}_{network}.pkl'.format(
        lr=lr, eps_decay=eps_decay, gamma=gamma, network=network)
    save_rewards(rewards=rewards, path=rewards_path, option='training_rewards')

    # save the policy net
    model_path = 'model_{lr}_{eps_decay}_{gamma}_{network}.pt'.format(
        lr=lr, eps_decay=eps_decay, gamma=gamma, network=network)
    save_model(model=policy_net, path=model_path)
    print("Finished parameter combo: {params}".format(
        params=[eps_decay, gamma, lr, network]))
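
select_action is not shown in this snippet, but its arguments (eps_start, eps_end, eps_decay, steps_done) suggest the usual annealed epsilon-greedy rule. The helper below is only an assumption about that formula, using the common exponential decay; the constants are illustrative.

import math

# assumed exponential epsilon decay driven by the global step counter
def epsilon_by_step(steps_done, eps_start=1.0, eps_end=0.01, eps_decay=5000):
    return eps_end + (eps_start - eps_end) * math.exp(-steps_done / eps_decay)

for step in (0, 1_000, 5_000, 20_000):
    print(step, round(epsilon_by_step(step), 4))
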
Example #18
class Agent:
    def __init__(self,
                 state_space,
                 n_actions,
                 replay_buffer_size=50000,
                 batch_size=32,
                 hidden_size=64,
                 gamma=0.99):
        self.n_actions = n_actions
        self.state_space_dim = state_space
        self.policy_net = GenericNetwork(state_space,
                                         n_actions,
                                         hidden_size,
                                         name='dqn_network_')
        self.target_net = GenericNetwork(state_space,
                                         n_actions,
                                         hidden_size,
                                         name='target_dqn_network_')
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.memory = ReplayMemory(replay_buffer_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.action = {}
        self.j = 0

    def learn(self):
        """
        Learning function
        :return:
        """
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        non_final_mask = (1 - T.tensor(batch.done, dtype=T.uint8)).bool()

        # avoid a batch in which every sampled transition is terminal (no non-final states)
        test_tensor = T.zeros(self.batch_size)
        while T.all(T.eq(test_tensor, non_final_mask)).item() is True:
            transitions = self.memory.sample(self.batch_size)
            batch = Transition(*zip(*transitions))
            non_final_mask = (1 - T.tensor(batch.done, dtype=T.uint8)).bool()

        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal > 0
        ]
        non_final_next_states = T.stack(non_final_next_states)
        state_batch = T.stack(batch.state)
        action_batch = T.cat(batch.action)
        reward_batch = T.cat(batch.reward)

        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        next_state_values = T.zeros(self.batch_size)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch
        # Compute mse loss
        loss = F.mse_loss(state_action_values.squeeze(),
                          expected_state_action_values)
        # Optimize the model
        self.policy_net.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1e-1, 1e-1)
        self.policy_net.optimizer.step()

    def get_action(self, state, epsilon=0.05):
        """
        Used to select actions
        :param state:
        :param epsilon:
        :return: action
        """
        sample = random.random()
        if sample > epsilon:
            with T.no_grad():
                state = T.from_numpy(state).float()
                q_values = self.policy_net(state)
                self.action[self.j] = {
                    'list_of_actions': q_values,
                    'max': T.argmax(q_values).item()
                }
                self.j += 1
                return T.argmax(q_values).item() + 1
        else:
            action = random.randrange(self.n_actions)
            return action + 1

    def update_target_network(self):
        """
        Used to update target networks
        :return:
        """
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def store_transition(self, state, action, reward, next_state, done):
        """
        Used for memory replay purposes
        :param state:
        :param action:
        :param reward:
        :param next_state:
        :param done:
        :return:
        """
        action = T.Tensor([[action]]).long()
        reward = T.tensor([reward], dtype=T.float32)
        next_state = T.from_numpy(next_state).float()
        state = T.from_numpy(state).float()
        self.memory.push(state, action, reward, next_state, done)

    def save_models(self):
        """
        Used to save models
        :return:
        """
        self.policy_net.save_checkpoint()
        self.target_net.save_checkpoint()

    def load_models(self):
        """
        Used to load models
        :return:
        """
        self.policy_net.load_checkpoint()
Example #19
class Agent(nn.Module):
    def __init__(self,
                 q_models,
                 target_model,
                 hyperbolic,
                 k,
                 gamma,
                 model_params,
                 replay_buffer_size,
                 batch_size,
                 inp_dim,
                 lr,
                 no_models,
                 act_space,
                 hidden_size,
                 loss_type,
                 target_update=False):
        super(Agent, self).__init__()
        if hyperbolic:
            self.q_models = DQN(state_space_dim=inp_dim,
                                action_space_dim=act_space,
                                hidden=hidden_size,
                                no_models=no_models)
            self.target_models = DQN(state_space_dim=inp_dim,
                                     action_space_dim=act_space,
                                     hidden=hidden_size,
                                     no_models=no_models)
            self.target_models.load_state_dict(self.q_models.state_dict())
            self.target_models.eval()
        else:
            self.q_models = q_models
        self.optimizer = optim.RMSprop(self.q_models.parameters(), lr=lr)
        self.hyperbolic = hyperbolic
        self.n_actions = model_params.act_space
        self.k = k
        # self.gammas = torch.tensor(np.linspace(0, 1, self.q_models.no_models + 1), dtype=torch.float)[1:]
        self.gammas = np.sort(
            np.random.uniform(0, 1, self.q_models.no_models + 1))
        self.gammas = np.append(self.gammas, 0.98)
        self.gammas = torch.tensor(np.sort(self.gammas))
        self.memory = ReplayMemory(replay_buffer_size)
        self.batch_size = batch_size
        self.inp_dim = inp_dim
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.target_models.to(self.device)
        self.q_models.to(self.device)
        self.gammas = self.gammas.to(self.device)
        self.loss_type = loss_type
        self.criterion = nn.MSELoss()
        self.use_target_network = target_update

    def update_network(self, updates=1):
        for _ in range(updates):
            loss = self._do_network_update()
        return loss

    def get_hyperbolic_train_coeffs(self, k, num_models):
        coeffs = []
        for i in range(1, num_models + 1):
            coeffs.append(((self.gammas[i + 1] - self.gammas[i]) * (1 / k) *
                           self.gammas[i]**((1 / k) - 1)))
        return torch.tensor(coeffs).to(self.device) / sum(coeffs)

    def get_action(self, state_batch, epsilon=0.05, get_among_last=False):
        # epsilon gets smaller as time goes by.
        # (glie_a/(glie_a + eps)) with eps in range(0, no_episodes)
        take_random_action = random.random()
        if take_random_action < epsilon:
            return random.randrange(self.n_actions)
        elif get_among_last:
            state_batch = torch.tensor(state_batch,
                                       dtype=torch.float32,
                                       device=self.device).view(
                                           -1, self.inp_dim)
            model_outputs = self.q_models(state_batch).reshape(
                2, self.q_models.no_models)
            # NOTE: this early return makes the hyperbolic averaging below unreachable
            return torch.argmax(model_outputs[:, -10].view(-1)).item()
            model_outputs = model_outputs * self.get_hyperbolic_train_coeffs(
                self.k, self.q_models.no_models)
            actions = torch.argmax(torch.sum(model_outputs, dim=1))
            return actions.item()
        elif self.hyperbolic:
            with torch.no_grad():
                state_batch = torch.tensor(state_batch,
                                           dtype=torch.float32,
                                           device=self.device).view(
                                               -1, self.inp_dim)
                model_outputs = self.q_models(state_batch.double()).reshape(
                    -1, 2)
                coeffs = self.get_hyperbolic_train_coeffs(
                    self.k, self.q_models.no_models).reshape(-1, 1)
                model_outputs = model_outputs * coeffs
                actions = torch.argmax(torch.sum(model_outputs, dim=0))
            return actions.item()

    def get_state_act_vals(self, state_batch, action_batch=None):
        if self.hyperbolic:
            action_batch = action_batch.repeat(
                1, self.q_models.no_models).reshape(-1, 1)
            model_outputs = self.q_models(state_batch.to(self.device).double())
            model_outputs = model_outputs.reshape(-1, self.n_actions)
            model_outputs = model_outputs.gather(1, action_batch)
            # .reshape(self.q_models.no_models * state_batch.shape[0],
            #          2).gather(1, action_batch.reshape(-1))
            return model_outputs
        else:
            model_output = self.q_models(state_batch).gather(1, action_batch)
            return model_output

    def get_max_next_state_vals(self, non_final_mask, non_final_next_states):
        if self.hyperbolic:
            with torch.no_grad():
                next_state_values = torch.zeros(self.batch_size).to(
                    self.device)
                # doing it like this, the model_no will come first and then the batch_no (b1m1, b1m2, b1m3..., b2m1,
                # ...b10m1, b10m2...
                # if False in non_final_mask:
                #     print(non_final_mask)
                #     print(len(non_final_next_states))
                non_final_mask = non_final_mask.reshape(-1, 1).repeat(
                    1, self.q_models.no_models).view(-1)
                # if False in non_final_mask:
                #     print([nf for nf in non_final_mask])
                next_state_values = next_state_values.view(-1, 1).repeat(
                    1, self.q_models.no_models).view(-1)
                if self.use_target_network:
                    # [b1m1o1, b1m1o2], -> max -> [b1m1]
                    # [b1m2o1, b1m2o2],           [b1m2]
                    # [b1m3o1, b1m3o3],           [b1m3]
                    # ...                         ...
                    #
                    next_state_values[non_final_mask] = \
                        self.target_models(non_final_next_states.to(self.device)).reshape(-1, self.n_actions).max(1)[0]
                    # if False in non_final_mask:
                    #     print("first", self.target_models(non_final_next_states.to(self.device)))
                    #     print("after reshaping", self.target_models(non_final_next_states.to(self.device)).reshape(-1, self.n_actions))
                    #     print(self.target_models(non_final_next_states.to(self.device)).shape)
                    #     print("next_state_values", next_state_values)
                else:
                    next_state_values[non_final_mask] = \
                        self.q_models(non_final_next_states.to(self.device)).reshape(-1, self.n_actions).max(1)[0]
                target_outputs = next_state_values
                return target_outputs * self.gammas[2:].repeat(self.batch_size)

    def _do_network_update(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        non_final_mask = ~torch.tensor(batch.done, dtype=torch.bool)
        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal
        ]
        non_final_next_states = torch.stack(non_final_next_states).to(
            self.device)
        state_batch = torch.stack(batch.state).to(self.device)
        action_batch = torch.cat(batch.action).to(self.device)
        reward_batch = torch.cat(batch.reward).to(self.device)
        state_action_values = self.get_state_act_vals(state_batch,
                                                      action_batch).view(-1)
        next_state_values = self.get_max_next_state_vals(
            non_final_mask, non_final_next_states)
        # this should be perfect
        expected_state_action_values = next_state_values + \
                                       reward_batch.view(-1, 1).repeat(1, self.q_models.no_models).view(-1)
        # print(reward_batch.view(-1, 1).repeat(1, self.q_models.no_models).view(-1).shape)
        if self.loss_type == "weighted_loss":
            loss = (state_action_values - expected_state_action_values)**2
            hyp_coef = self.get_hyperbolic_train_coeffs(
                self.k, self.q_models.no_models).repeat(self.batch_size)
            loss = loss.view(-1) * hyp_coef
            loss = torch.mean(loss)
        elif self.loss_type == "separate_summarized_loss":
            loss = F.smooth_l1_loss(state_action_values,
                                    expected_state_action_values).double()
            # loss = (state_action_values - expected_state_action_values) ** 2
            # loss = torch.sum(loss)
        elif self.loss_type == "one_output_loss":
            hyp_coef = self.get_hyperbolic_train_coeffs(
                self.k, self.q_models.no_models)
            state_action_values = state_action_values.reshape(
                self.batch_size, -1) * hyp_coef
            state_action_values = torch.sum(state_action_values, dim=1)
            expected_state_action_values = expected_state_action_values.reshape(
                self.batch_size, -1) * hyp_coef
            expected_state_action_values = torch.sum(
                expected_state_action_values, dim=1)
            loss = self.criterion(state_action_values,
                                  expected_state_action_values)

        loss_item = loss.item()
        # print(hyp_coef.repeat(self.batch_size).shape)
        # print(loss.shape)
        # loss = (state_action_values - expected_state_action_values) ** 2 * self.get_hyperbolic_train_coeffs(self.k,
        #                                                                                                     self.q_models.no_models).repeat(
        #     self.batch_size)
        # # loss = torch.sum(loss)
        # loss = F.smooth_l1_loss(state_action_values.squeeze(),
        #                         expected_state_action_values)
        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
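        # Clamp each gradient element to [-0.1, 0.1] before taking the optimizer step.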
        for param in self.q_models.parameters():
            param.grad.data.clamp_(-1e-1, 1e-1)
        self.optimizer.step()
        return loss_item

    def update_target_network(self):
        self.target_models.load_state_dict(self.q_models.state_dict())

    def store_transition(self, state, action, next_state, reward, done):
        action = torch.Tensor([[action]]).long()
        reward = torch.tensor([reward], dtype=torch.float32)
        next_state = torch.from_numpy(next_state).float()
        state = torch.from_numpy(state).float()
        self.memory.push(state, action, next_state, reward, done)
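
The agent above relies on a replay buffer exposing push, sample and __len__, storing transitions as a namedtuple with state, action, next_state, reward and done fields (this is what the Transition(*zip(*transitions)) unpacking in _do_network_update assumes). The actual ReplayMemory class is not shown in these snippets; a minimal sketch of a compatible ring buffer could look like this:

import random
from collections import namedtuple

# Hypothetical buffer matching the push/sample/len calls used above; the real
# ReplayMemory class is not included in these snippets.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'done'))

class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # Overwrite the oldest transition once the buffer is full (ring buffer).
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)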
Example #20
0
        observation, reward, done, _ = env.step(action.item())
        env.render()

        # record reward
        running_reward += reward
        reward = torch.tensor([reward], device=device)

        if not done:
            next_state = torch.tensor([observation],
                                      device=device,
                                      dtype=torch.float32)
        else:
            next_state = None

        # Store the transition in memory
        memory.push(current_state, action, next_state, reward)
        training_info["memory"] = memory

        # Compute the TD loss of current transition and store it into episode loss
        if not done:
            current_q = policy_net(current_state)[:, action].squeeze()
            target_q = policy_net(next_state).max() + reward.squeeze()
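            # This diagnostic target bootstraps from the policy network without a
            # discount factor; the resulting TD loss is only recorded for logging.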
            target_q = torch.tensor(target_q.item(), device=device)
            trans_loss = F.smooth_l1_loss(current_q, target_q).item()
            # Record the TD loss
            running_episode_loss += trans_loss
            if trans_loss > training_info["max TD loss recorded"]:
                training_info["max TD loss recorded"] = trans_loss

        # Move to the next state
        current_state = next_state
Example #21
0
def main():
    parser = argparse.ArgumentParser(description='DQN Breakout Script')
    parser.add_argument('--use-cuda',
                        action='store_true',
                        default=False,
                        help='whether to use CUDA (default: False)')
    parser.add_argument('--batch-size',
                        type=int,
                        default=128,
                        metavar='M',
                        help='batch size (default: 128)')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.999,
                        metavar='M',
                        help='gamma (default: 0.999)')
    parser.add_argument('--eps-start',
                        type=float,
                        default=0.9,
                        metavar='M',
                        help='eps start (default: 0.9)')
    parser.add_argument('--eps-end',
                        type=float,
                        default=0.05,
                        metavar='M',
                        help='eps end (default: 0.05)')
    parser.add_argument('--eps-decay',
                        type=int,
                        default=200,
                        metavar='M',
                        help='eps decay (default: 200)')
    parser.add_argument('--num-obs-in-state',
                        type=int,
                        default=4,
                        metavar='M',
                        help='num observations in state (default: 4)')
    parser.add_argument('--replay-memory-capacity',
                        type=int,
                        default=10000,
                        metavar='M',
                        help='replay memory capacity (default: 10000)')
    parser.add_argument('--num-episodes',
                        type=int,
                        default=10,
                        metavar='M',
                        help='num of episodes (default: 10)')
    parser.add_argument('--reset-period',
                        type=int,
                        default=5,
                        metavar='M',
                        help='period to reset target network (default: 5)')
    parser.add_argument('--atari-env',
                        type=str,
                        default='Breakout-v0',
                        metavar='M',
                        help='Atari environment to use (default: Breakout-v0)')
    args = parser.parse_args()

    env = gym.envs.make(args.atari_env)

    model = DQN(args.num_obs_in_state, (84, 84), env.action_space.n)
    model_target = DQN(args.num_obs_in_state, (84, 84),
                       env.action_space.n)

    if args.use_cuda:
        model.cuda()
        model_target.cuda()

    optimizer = optim.RMSprop(model.parameters())
    memory = ReplayMemory(args.replay_memory_capacity)

    epsilons = np.linspace(args.eps_start, args.eps_end, args.eps_decay)
    step_idx = 1
    reset_idx = 1

    tfs = get_transforms()

    episode_reward = 0.
    episode_length = 0

    for i_episode in range(args.num_episodes):
        # Initialize the environment and state
        obs = env.reset()
        state_processor = StateProcessor(args.num_obs_in_state, tfs, obs)
        state = state_processor.get_state()

        while True:
            episode_length += 1
            if step_idx < args.eps_decay:
                eps = epsilons[step_idx]
            else:
                eps = args.eps_end

            action = select_action(model, state, env.action_space.n,
                                   eps, args.use_cuda)
            # print('%d %d' % (episode_length, action[0,0]))
            next_obs, reward, done, info = env.step(action[0, 0])
            episode_reward += reward
            reward = torch.Tensor([reward])
            if args.use_cuda:
                reward = reward.cuda()

            if not done:
                state_processor.push_obs(next_obs)
                next_state = state_processor.get_state()
            else:
                next_state = None  # None next_state marks done

            memory.push(state, action, next_state, reward)

            # optimize
            optimize_model(optimizer, memory, model, model_target,
                           args.batch_size, args.gamma, args.use_cuda)

            step_idx += 1
            reset_idx += 1
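            # Copy the online network into the target network every reset_period steps.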
            if reset_idx == args.reset_period:
                reset_idx = 1
                model_target.load_state_dict(model.state_dict())

            if done:
                break

        print(episode_reward)
        print(episode_length)
        episode_reward = 0.
        episode_length = 0
Example #22
0
            action, steps_done = select_action(state=state,
                                               policy_net=policy_net,
                                               n_actions=n_actions,
                                               steps_done=steps_done,
                                               device=device,
                                               eps_end=EPS_END,
                                               eps_start=EPS_START,
                                               eps_decay=EPS_DECAY)

            state_next, reward, done, _ = env.step(action.item())
            # env.render()
            cumulative_reward = cumulative_reward + reward
            # convert it to tensor
            state_next = torch.tensor([state_next], device=device)
            reward = torch.tensor([reward], device=device, dtype=torch.float32)
            memory.push(state, action, state_next, reward)
            state = state_next

            # every step update the weights in the policy net
            optimize_model(memory=memory,
                           batch_size=BATCH_SIZE,
                           device=device,
                           policy_net=policy_net,
                           target_net=target_net,
                           optimizer=optimizer,
                           gamma=GAMMA)

            if done:
                break

        rewards.append(cumulative_reward)
Example #23
0
class RaLLy():
    def __init__(self, name, env):
        self.name = name
        self.env = env
        self.eps = 0.005
        self.max_timesteps = 10000
        self.explore_noise = 0.5
        self.batch_size = 32
        self.discount = 0.99
        self.tau = 0.005
        self.max_episode_steps = 200
        self.memory = ReplayMemory(10000)

    def train(self):
        policy = DDPGTrainer()
        total_timesteps = 0
        episode_timesteps = 0
        episode_num = 0
        episode_done = True
        episode_reward = 0

        while total_timesteps < self.max_timesteps:
            if episode_done:
                if total_timesteps != 0:
                    print(
                        f"Total steps: {total_timesteps:12} | Episodes: {episode_num:3} | Total reward: {episode_reward}"
                    )
                    # TODO: get training stats
                    policy.train(self.memory, episode_timesteps,
                                 self.batch_size, self.discount, self.tau)

                # Reset environment
                episode_done = False
                episode_num += 1
                episode_timesteps = 0
                episode_reward = 0
                obs = self.env.reset()

            control, jump, boost, handbrake = policy.actor(torch.tensor(obs))
            action = torch.cat([control, jump, boost, handbrake])

            if self.explore_noise != 0:
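                # Noise is added to the continuous control output only; the
                # jump/boost/handbrake entries are concatenated with zero noise.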
                noise = np.random.normal(0, self.explore_noise, size=1)
                noise = torch.clamp(torch.Tensor(noise), -1, 1)
                noise = torch.cat([noise, torch.zeros(3)])
                action = action + noise
                action = torch.clamp(action, -1, 1)

            print(action)

            # Perform action
            new_obs, reward, done, _ = self.env.step(action.detach())
            episode_done = True if episode_timesteps + 1 == self.max_episode_steps else done
            done_bool = float(done)
            episode_reward += reward

            # Store data in replay buffer
            self.memory.push((obs, new_obs, action, reward, done_bool))

            obs = new_obs
            episode_timesteps += 1
            total_timesteps += 1
Example #24
0
class Agent(object):
    def __init__(self,
                 num_actions,
                 gamma=0.98,
                 memory_size=5000,
                 batch_size=32):
        self.scaler = None
        self.featurizer = None
        self.q_functions = None
        self.gamma = gamma
        self.batch_size = batch_size
        self.num_actions = num_actions
        self.memory = ReplayMemory(memory_size)
        self.initialize_model()

    def initialize_model(self):
        # Draw some samples from the observation range and initialize the scaler
        obs_limit = np.array([4.8, 5, 0.5, 5])
        samples = np.random.uniform(-obs_limit, obs_limit,
                                    (1000, obs_limit.shape[0]))
        self.scaler = StandardScaler()
        self.scaler.fit(samples)

        # Initialize the RBF featurizer
        self.featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=80)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=50)),
        ])
        self.featurizer.fit(self.scaler.transform(samples))

        # Create a value approximator for each action
        self.q_functions = [
            SGDRegressor(learning_rate="constant", max_iter=500, tol=1e-3)
            for _ in range(self.num_actions)
        ]

        # Initialize it to whatever values; implementation detail
        for q_a in self.q_functions:
            q_a.partial_fit(self.featurize(samples),
                            np.zeros((samples.shape[0], )))

    def featurize(self, state):
        if len(state.shape) == 1:
            state = state.reshape(1, -1)
        # Task 1: TODO: Use (s, abs(s)) as features
        #return np.concatenate((state, np.abs(state)), axis=1)
        # RBF features
        return self.featurizer.transform(self.scaler.transform(state))

    def get_action(self, state, epsilon=0.0):
        if np.random.random() < epsilon:
            a = int(np.random.random() * self.num_actions)
            return a
        else:
            featurized = self.featurize(state)
            qs = [q.predict(featurized)[0] for q in self.q_functions]
            qs = np.array(qs)
            a = np.argmax(qs, axis=0)
            return a

    def single_update(self, state, action, next_state, reward, done):
        # Calculate feature representations of the current and next state
        # Task 1: TODO: Set the feature state and feature next state

        featurized_state = self.featurize(state)
        featurized_next_state = self.featurize(next_state)

        # Task 1:  TODO Get Q(s', a) for the next state
        next_qs = [
            q.predict(featurized_next_state)[0] for q in self.q_functions
        ]

        # Calculate the updated target Q- values
        # Task 1: TODO: Calculate target based on rewards and next_qs
        if done:
            target = reward
        else:
            target = reward + self.gamma * np.max(next_qs)
        # Update Q-value estimation
        self.q_functions[action].partial_fit(featurized_state, [target])

    def update_estimator(self):
        if len(self.memory) < self.batch_size:
            # Use the whole memory
            samples = self.memory.memory
        else:
            # Sample some data
            samples = self.memory.sample(self.batch_size)
        # Task 2: TODO: Reformat data in the minibatch
        states = []
        action = []
        next_states = []
        rewards = []
        dones = []
        for s in samples:
            states.append(s.state)
            action.append(s.action)
            next_states.append(s.next_state)
            rewards.append(s.reward)
            dones.append(s.done)
        states = np.array(states)
        next_states = np.array(next_states)
        action = np.array(action)
        rewards = np.array(rewards)
        dones = np.array(dones)

        # Task 2: TODO: Calculate Q(s', a)
        featurized_next_states = self.featurize(next_states)
        next_qs = np.max(np.array(
            [q.predict(featurized_next_states) for q in self.q_functions]).T,
                         axis=1)

        # Calculate the updated target values
        # Task 2: TODO: Calculate target based on rewards and next_qs
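        # With a boolean dones array, np.invert acts as a logical NOT and zeroes
        # the bootstrap term for terminal transitions.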
        targets = rewards + self.gamma * next_qs * np.invert(dones)

        # Calculate featurized states
        featurized_states = self.featurize(states)
        # Get new weights for each action separately
        for a in range(self.num_actions):
            # Find states where a was taken
            idx = action == a

            # If a is not present in the batch, skip it and move to the next action
            if np.any(idx):
                act_states = featurized_states[idx]
                act_targets = targets[idx]
                # Perform a single SGD step on the Q-function params
                self.q_functions[a].partial_fit(act_states, act_targets)

    def store_transition(self, *args):
        self.memory.push(*args)
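
A driver loop for this Agent is not included in the snippet; a minimal sketch is shown below, where the CartPole-v0 environment, the episode count and the epsilon schedule are illustrative assumptions rather than values from the original script:

import gym

env = gym.make("CartPole-v0")            # assumed environment (matches obs_limit above)
agent = Agent(num_actions=env.action_space.n)

for episode in range(200):
    state = env.reset()
    done = False
    epsilon = max(0.05, 0.5 * 0.99 ** episode)   # illustrative decaying exploration
    while not done:
        action = agent.get_action(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        agent.store_transition(state, action, next_state, reward, done)
        agent.update_estimator()                 # minibatch fitted-Q update on the buffer
        state = next_state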
Example #25
0
        test_step = 0
        test_reward = 0
        done = False
        test_memory = ReplayMemory(10000, verbose=False)

        while not done:

            frames.append(test_env.render())

            action = get_action(net, tf.constant(state, tf.float32),
                                tf.constant(0.0, tf.float32))

            next_state, reward, done, info = test_env.step(action)
            test_reward += reward

            test_memory.push(state, action, reward, next_state, done)
            state = next_state

            test_step += 1

            if done and (info["ale.lives"] != 0):
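                # The episode flag fired but lives remain: reset and keep
                # accumulating reward until all lives are gone.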
                test_env.reset()
                test_step = 0
                done = False

        reward_set.append(test_reward)
        frame_set.append(frames)

    best_score = np.max(reward_set)
    print("Best score of current network ({} trials): {}".format(
        trial, best_score))
Example #26
0
def test_arb(arb_env, modules_list, n_epi=250, max_steps=500):
    s_dim, a_dim = 16, 4
    n_modules = len(modules_list)

    pi_tensors = get_pi(modules_list)
    arb = Arbitrator().to(device)
    returns = []
    all_rets = []
    memory = ReplayMemory(10000)
    for epi in range(n_epi):
        arb_env.reset()
        r_list = []
        steps = 0
        while steps < max_steps:
            state = get_state_vector(arb_env.cur_state)
            coeff = arb(state)
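            # Blend the per-module policies with the arbitrator's mixing
            # coefficients to form the behaviour policy for this state.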
            pi_k = torch.zeros(s_dim, a_dim)
            for m in range(n_modules):
                pi_k += coeff[0][m] * pi_tensors[m]
            a = np.random.choice(
                4, p=pi_k[arb_env.cur_state].detach().cpu().numpy())
            s, a, s_, r, done = arb_env.step(a)
            r_list.append(r)
            reward = torch.tensor([r], dtype=torch.float32, device=device)
            next_state = get_state_vector(s_)
            steps += 1
            memory.push(state, torch.tensor([a], dtype=torch.float32, device=device),
                        next_state, reward)

            if done:
                state = get_state_vector(arb_env.cur_state)
                coeff = arb(state)
                pi_k = torch.zeros(s_dim, a_dim)
                for m in range(n_modules):
                    pi_k += coeff[0][m] * pi_tensors[m]

                a = np.random.choice(
                    4, p=pi_k[arb_env.cur_state].detach().cpu().numpy())
                # state = get_state_vector(arb_env.cur_state)
                next_state = state
                r = 100.
                steps += 1
                reward = torch.tensor([r], dtype=torch.float32, device=device)
                r_list.append(r)
                memory.push(state, torch.tensor([a], dtype=torch.float32, device=device),
                            next_state, reward)
                break

        rets = []
        return_so_far = 0
        for t in range(len(r_list) - 1, -1, -1):
            return_so_far = r_list[t] + 0.9 * return_so_far
            rets.append(return_so_far)
        # The returns are stored backwards in time, so we need to revert it
        rets = list(reversed(rets))
        all_rets.extend(rets)
        print("epi {} over".format(epi))
        if epi % 7 == 0:
            arb.optimize(memory, pi_tensors, torch.FloatTensor(all_rets))
            all_rets = []
            memory = ReplayMemory(10000)
        returns.append(sum(r_list))

    return returns
Example #27
0
    for i_step in tqdm(range(STEPS_PER_EPOCH)):

        # Take random exploratory actions for the first START_STEPS steps,
        # then switch to the noise-perturbed policy action.
        if i_episode * STEPS_PER_EPOCH + i_step < START_STEPS:
            action = torch.randn(
                env.action_space())  # should be implemented as actionspace BOX
        else:
            action = select_action(observation, ACTION_NOISE)

        # Stepping the Environment
        obs_prime, reward, done, _ = env.step(action)
        episode_reward += reward

        if done:
            print("Got one")

        # pushes the performed action, state and reward into the cache
        cache.push(observation.unsqueeze(0), action.unsqueeze(0),
                   reward.unsqueeze(0).float(), obs_prime.unsqueeze(0),
                   done.unsqueeze(0).float())

        #Update to the most recent observation
        observation = obs_prime
        status = optimize_model()

    if status:
        test_policy()

print('Complete')
plt.show()