Example #1
class TrainDQN:
    def __init__(self,
                 env,
                 sess,
                 learning_rate=1e-3,
                 seed=1234,
                 gamma=0.99,
                 max_eps=1.0,
                 min_eps=0.1,
                 render=False,
                 print_freq=20,
                 load_path=None,
                 save_path=None,
                 batch_size=32,
                 log_dir='logs/train',
                 max_steps=100000,
                 buffer_capacity=None,
                 max_episode_len=2000,
                 eps_decay_rate=-0.0001,
                 target_update_freq=1000,
                 ):
        """Trains an openai gym-like environment with deep q learning.
        Args:
            env: gym.Env where our agent resides
            seed: Random seed for reproducibility
            gamma: Discount factor
            max_eps: Starting exploration factor
            min_eps: Exploration factor to decay towards
            max_episode_len: Maximum length of an individual episode
            render: True to render the environment, else False
            print_freq: Displays logging information every 'print_freq' episodes
            load_path: (str) Path to load existing model from
            save_path: (str) Path to save model during training
            max_steps: maximum number of times to sample the environment
            buffer_capacity: How many state, action, next state, reward tuples the replay buffer should store
            max_episode_len: Maximum number of timesteps in an episode
            eps_decay_rate: lambda parameter in exponential decay for epsilon
            target_update_fraction: Fraction of max_steps update the target network
        """
        np.random.seed(seed)
        self.sess = sess
        self.env = env
        self.input_dim = env.observation_space.shape[0]
        self.output_dim = env.action_space.n
        self.max_steps = max_steps
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.eps_decay_rate = eps_decay_rate
        self.max_episode_len = max_episode_len
        self.render = render
        self.print_freq = print_freq
        self.rewards = []
        self.metrics = []
        self.save_path = save_path
        self.load_path = load_path
        self.batch_size = batch_size
        self.num_updates = 0
        self.gamma = gamma
        self.buffer = ReplayBuffer(capacity=max_steps // 2 if buffer_capacity is None else buffer_capacity)
        self.target_update_freq = target_update_freq
        self.learning_rate = learning_rate

        with tf.variable_scope('q_network'):
            self.q_network = QNetworkBuilder(self.input_dim, self.output_dim, (64,))
        with tf.variable_scope('target_network'):
            self.target_network = QNetworkBuilder(self.input_dim, self.output_dim, (64,))
        self.update_target_network = [old.assign(new) for (new, old) in
                                      zip(tf.trainable_variables('q_network'),
                                          tf.trainable_variables('target_network'))]
        if self.load_path is not None:
            self.load()

        self.add_summaries(log_dir)

    def add_summaries(self, log_dir):
        tf.summary.scalar('Loss', self.q_network.loss)
        tf.summary.scalar('Mean Estimated Value', tf.reduce_mean(self.q_network.output_pred))
        # Merge all the summaries and write them out to log_dir
        self.merged = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter(log_dir, self.sess.graph)

    def learn(self):
        """Learns via Deep-Q-Networks (DQN)"""
        obs = self.env.reset()
        mean_reward = None
        total_reward = 0
        ep = 0
        ep_len = 0
        rand_actions = 0
        for t in range(self.max_steps):
            # exponential epsilon decay from https://jaromiru.com/2016/10/03/lets-make-a-dqn-implementation/
            eps = self.min_eps + (self.max_eps - self.min_eps) * np.exp(
                self.eps_decay_rate * t)
            if self.render:
                self.env.render()

            # Take exploratory action with probability epsilon
            if np.random.uniform() < eps:
                action = self.env.action_space.sample()
                rand_actions += 1
            else:
                action = self.act(obs)

            # Execute action in emulator and observe reward and next state
            new_obs, reward, done, info = self.env.step(action)
            total_reward += reward

            # Store transition s_t, a_t, r_t, s_t+1 in replay buffer
            self.buffer.add((obs, action, reward, new_obs, done))

            # Perform learning step
            self.update()

            obs = new_obs
            ep_len += 1
            if done or ep_len >= self.max_episode_len:
                #         print("Episode Length:", ep_len)
                #         print(f"Episode {ep} Reward:{total_reward}")
                #         print(f"Random Action Percent: {rand_actions/ep_len}")
                ep += 1
                ep_len = 0
                rand_actions = 0
                self.rewards.append(total_reward)
                total_reward = 0
                obs = self.env.reset()

                if ep % self.print_freq == 0 and ep > 0:
                    new_mean_reward = np.mean(self.rewards[-self.print_freq:])

                    print(f"-------------------------------------------------------")
                    print(f"Mean {self.print_freq} Episode Reward: {new_mean_reward}")
                    print(f"Exploration fraction: {eps}")
                    print(f"Total Episodes: {ep}")
                    print(f"Total timesteps: {t}")
                    print(f"-------------------------------------------------------")

                    # Add reward summary
                    summary = tf.Summary()
                    summary.value.add(tag=f'Mean {self.print_freq} Episode Reward',
                                      simple_value=new_mean_reward)
                    summary.value.add(tag=f'Epsilon', simple_value=eps)
                    self.train_writer.add_summary(summary, self.num_updates)

                    # Model saving inspired by Open AI Baseline implementation
                    if (mean_reward is None or new_mean_reward >= mean_reward) and self.save_path is not None:
                        print(f"Saving model due to mean reward increase:{mean_reward} -> {new_mean_reward}")
                        print(f'Location: {self.save_path}')
                        # save_path = f"{self.save_path}_model"
                        self.save()
                        mean_reward = new_mean_reward

    def act(self, observation):
        """Takes an action given the observation.
        Args:
            observation: observation from the environment
        Returns:
            integer index of the selected action
        """
        pred = self.sess.run([self.q_network.output_pred],
                             feed_dict={self.q_network.input_ph: np.reshape(observation, (1, self.input_dim))})
        return np.argmax(pred)

    def update(self):
        """Applies gradients to the Q network computed from a minibatch of self.batch_size."""
        if self.batch_size <= self.buffer.size():
            self.num_updates += 1

            # Sync the target network with the current Q network parameters
            if self.num_updates % self.target_update_freq == 0:
                self.sess.run(self.update_target_network)
                print('Updated Target Network')

            # Sample random minibatch of transitions from the replay buffer
            sample = self.buffer.sample(self.batch_size)
            states, action, reward, next_states, done = sample

            # Calculate discounted predictions for the subsequent states using target network
            next_state_pred = self.gamma * self.sess.run(self.target_network.output_pred,
                                                         feed_dict={
                                                             self.target_network.input_ph: next_states}, )

            # Adjust the targets for non-terminal states
            reward = reward.reshape(len(reward), 1)
            targets = reward.astype(np.float64)  # float copy so Q values can be added without unsafe casting
            loc = np.argwhere(done != True).flatten()
            if len(loc) > 0:
                max_q = np.amax(next_state_pred, axis=1)
                targets[loc] += max_q[loc].reshape(-1, 1)

            # Train the Q network on the sampled minibatch
            _, loss = self.sess.run([self.q_network.opt, self.q_network.loss],
                                    feed_dict={self.q_network.input_ph: states,
                                               self.q_network.target_ph: targets.flatten(),
                                               self.q_network.action_indices_ph: action})

    def save(self):
        """Saves the Q network."""
        self.q_network.saver.save(self.sess, self.save_path)

    def load(self):
        """Loads the Q network."""
        self.q_network.saver.restore(self.sess, self.save_path)

    def plot_rewards(self, path=None):
        """Plots rewards per episode.
        Args:
            path: Location to save the rewards plot. If None, image will be displayed with plt.show()
        """
        plt.plot(self.rewards)
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        if path is None:
            plt.show()
        else:
            plt.savefig(path)
            plt.close('all')
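
Example #1 (and Example #5) relies on a ReplayBuffer class that is not shown on this page. Below is a minimal sketch matching the interface used there (a capacity argument, add() taking a whole transition tuple, size()/len(), and sample() returning per-field NumPy arrays); the class name comes from the examples, but the body is an assumption rather than the original implementation.

from collections import deque

import numpy as np


class ReplayBuffer:
    """Fixed-size FIFO store of (state, action, reward, next_state, done) tuples."""

    def __init__(self, capacity=None):
        # capacity=None gives an unbounded deque, mirroring the optional capacity above
        self.storage = deque(maxlen=capacity)

    def add(self, transition):
        # transition is a (state, action, reward, next_state, done) tuple
        self.storage.append(transition)

    def size(self):
        return len(self.storage)

    def __len__(self):
        return len(self.storage)

    def sample(self, batch_size):
        # Uniformly sample transitions and regroup them field by field
        idx = np.random.randint(len(self.storage), size=batch_size)
        batch = [self.storage[i] for i in idx]
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones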
Example #2
        plot_count_per_actions += episode_count_per_actions
        plot_episode_requested_agents += episode_episode_requested_agents
        plot_episode_count_requested_agent += episode_episode_count_requested_agent
        plot_episode_rewards.append(episode_reward)
        episodes.append(episode)

    episode_batch = episodes[0]
    episodes.pop(0)
    for episode in episodes:
        for key in episode_batch.keys():
            episode_batch[key] = np.concatenate(
                (episode_batch[key], episode[key]), axis=0)

    buffer.store_episode(episode_batch)
    for train_step in range(args.train_steps):
        mini_batch = buffer.sample(min(buffer.current_size, args.batch_size))
        agents.train(mini_batch, train_steps)
        train_steps += 1

    figure, axes = plt.subplots(nrows=2, ncols=2)

    # plt.rcParams["figure.figsize"] = (50, 50)
    plt.rcParams['lines.linewidth'] = 4

    index1 = ["Action 0", "Action 1", "Action 2"]
    axes[0, 0].bar(x=index1, height=plot_count_per_actions)
    axes[0, 0].set_title('Cumulative count over action space')

    # index2 = ["1 Agents", "2 Agents", "3 Agents", "4 Agents"]
    index2 = [f'{i+1} Agents' for i in range(N_AGENTS)]
    axes[0, 1].bar(x=index2, height=plot_episode_count_requested_agent)
Example #3
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tensors
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # get targets from the (frozen) target network
        self.qnetwork_target.eval()
        with torch.no_grad():
            Q_targets_next = torch.max(self.qnetwork_target(next_states), dim=1, keepdim=True)[0]

        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # get current Q estimates for the taken actions
        self.qnetwork_local.train()
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # clear gradients
        self.optimizer.zero_grad()

        # update weights local network
        loss.backward()

        # take one SGD step
        self.optimizer.step()
        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
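
The Agent class above assumes a PyTorch QNetwork, a device, and hyperparameter constants (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR, UPDATE_EVERY) defined elsewhere in its module. A minimal sketch follows; only the names and call signatures are taken from the example, while the concrete values and layer sizes are typical choices, not the original ones.

import torch
import torch.nn as nn
import torch.nn.functional as F

BUFFER_SIZE = int(1e5)  # replay buffer size (assumed value)
BATCH_SIZE = 64         # minibatch size (assumed value)
GAMMA = 0.99            # discount factor (assumed value)
TAU = 1e-3              # soft-update interpolation factor (assumed value)
LR = 5e-4               # learning rate (assumed value)
UPDATE_EVERY = 4        # steps between learning updates (assumed value)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class QNetwork(nn.Module):
    """Simple fully connected state -> action-value model."""

    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)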
Example #4
class NeuralNetworkAgent(Agent):
    def __init__(self,
                 api,
                 network_class,
                 sess,
                 save_path,
                 history_size=15,
                 restore_path=None,
                 verbose=False,
                 train=False,
                 test=False):
        super(NeuralNetworkAgent, self).__init__(api, verbose=verbose)

        # currently 7500 w/ 1000

        # Network
        self.network = network_class(sess,
                                     save_path,
                                     restore_path=restore_path,
                                     hist_size=history_size)
        self.replay_buffer = ReplayBuffer(max_size=2500)
        self.train = train
        self.history_size = history_size

        # Internal
        self.launched = False
        self.placed_move = False
        self.ctr = 0
        self.restart_game = 1
        self.game_restarted = True
        self.show_board = False
        self.last_move = -2
        self.start_state = np.zeros((20, 10, 1))
        self.possible_moves = [-1, 0, 6, 7]
        self.training_begun = False if not test else True
        self.epsilon = 1. if not test else 0
        self.decay = 0.999
        self.test = test

        self.prev_states = [self.start_state] * self.history_size

    def _controller_listener(self):
        piece_id = self.api.peekCPU(0x0042)
        game_state = self.api.peekCPU(0x0048)

        if piece_id != 19 and game_state == 1:
            # Train
            if self.train and self.replay_buffer.size() > 250 and not self.test:
                batch = self.replay_buffer.sample(batch_sz=250)
                self.network.train(batch)
                self.training_begun = True

                self.epsilon *= self.decay
                if self.epsilon < 0.010:
                    self.epsilon = 0.010

        if not self.placed_move:  # and (random_move >= 0 or self.restart_game > 0):
            # os.system('clear')
            print('--------------')
            is_random = False
            move = None
            if np.random.random() < self.epsilon or not self.training_begun:
                move = np.random.choice(self.possible_moves)
                is_random = True
            else:
                tensor = np.dstack([self.grid] + self.prev_states)
                pred = self.network.predict(tensor)[0]
                move = self.possible_moves[pred]

            if self.restart_game > 0:
                self.api.writeGamepad(0, 3, True)
                self.restart_game -= 1
                move = -2
            else:
                if move >= 0:
                    self.api.writeGamepad(0, move, True)
            self.placed_move = True
            self.show_board = True

            if self.last_move != -2 and piece_id != 19:
                print('Random:', is_random)
                S = self.grid.copy()
                self._update_board(self.api.peekCPU(0x0042))
                board = self._simulate_piece_drop(self.api.peekCPU(0x0042))
                n_empty = self._count_empty(self.grid)
                n_holes = self._count_holes(self.grid)
                height = self._count_height(board)
                levelness = self._determine_levelness(board)
                A = self.last_move
                # R  = self._count_total() + self._get_score() - n_empty
                #R = (-50 * height) + (-20 * n_holes) + (self._get_score())
                if height <= 2:
                    R = 1000
                else:
                    R = -200 * height
                R += -20 * n_holes + 10 * levelness  # 10 * self._get_score()
                SP = self.grid.copy()

                self.prev_states.insert(0, S)

                print(np.dstack(self.prev_states).shape)

                self.replay_buffer.add(
                    np.dstack(self.prev_states), self.possible_moves.index(A),
                    R, np.dstack([SP] + self.prev_states[:self.history_size]))

                self.prev_states = self.prev_states[:self.history_size]

                print(self.epsilon)
                self._print_transition(S, A, board, R)

            self.last_move = move
        else:
            self.placed_move = False

    def _frame_render_finished(self):
        """
        Renders the board and the current piece
        TODO: do this lazily, so we aren't calling read too often O_o
        """

        # To make things easier, we're going to modify the next piece drop
        # Always drop a certain type of block (currently square).
        self.api.writeCPU(0x00bf, 0x0a)

        piece_id = self.api.peekCPU(0x0042)
        game_state = self.api.peekCPU(0x0048)

        # Restart the game
        if piece_id == 19 and (game_state == 10 or game_state == 0):
            self.prev_states = [self.start_state] * self.history_size
            self.game_restarted = True
            self.restart_game = 1
            return

        # Probably a line clear... Skip
        if piece_id == 19 and game_state != 1:
            return

    def _piece_update(self, access_type, address, value):
        """
        Can be used to control the piece being dropped
        """
        if self.api.readCPU(0x0048) == 1:
            return 0x0a
        return value

    def agent_name(self):
        return 'NeuralNetworkAgent'
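
One detail worth calling out in the agent above: epsilon is annealed multiplicatively (epsilon *= 0.999 once per training call, clamped at 0.010), so reaching the exploration floor takes several thousand training calls. A quick check of that schedule:

import math

decay, floor = 0.999, 0.010
steps_to_floor = math.log(floor) / math.log(decay)
print(round(steps_to_floor))  # ~4603 training calls until epsilon hits the 0.010 floor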
Example #5
class DQN:
    def __init__(
        self,
        env,
        learning_rate=1e-3,
        seed=1234,
        gamma=0.99,
        max_eps=1.0,
        min_eps=0.1,
        render=False,
        print_freq=1,
        load_path=None,
        save_path=None,
        batch_size=32,
        log_dir='logs/train',
        max_steps=100000,
        buffer_capacity=None,
        max_episode_len=None,
        eps_decay_rate=-1e-4,
        target_update_freq=1000,
    ):
        tf.random.set_seed(seed)
        np.random.seed(seed)
        self.gamma = gamma
        self.render = render
        self.batch_size = batch_size
        self.print_freq = print_freq
        self.q_lr = learning_rate
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.eps_decay_rate = eps_decay_rate
        self.buffer = ReplayBuffer(buffer_capacity)
        self.max_steps = max_steps
        self.target_update = target_update_freq
        self.model = QNetwork(env.action_space.n, name='q_network')
        self.target = QNetwork(env.action_space.n, name='target_network')
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
        self.summary_writer = tf.summary.create_file_writer(log_dir)
        self.env = env
        self.max_episode_len = max_episode_len if max_episode_len else self.env.spec.max_episode_steps
        self.rewards = []
        self.save_path = save_path

        if load_path is not None:
            self.model.load_weights(load_path)

    def act(self, state):
        return np.argmax(self.model(state))

    @tf.function
    def train_step(self, states, indices, targets):
        """
        Performs a single step of gradient descent on the Q network

        Args:
            states: numpy array of states with shape (batch size, state dim)
            indices: list indices of the selected actions
            targets: targets for computing the MSE loss

        """
        with tf.GradientTape() as tape:
            action_values = tf.gather_nd(self.model(states), indices)
            mse_loss = tf.keras.losses.MeanSquaredError()(action_values,
                                                          targets)

        gradients = tape.gradient(mse_loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradients, self.model.trainable_variables))

        # Log training information
        with self.summary_writer.as_default():
            tf.summary.scalar('MSE Loss',
                              mse_loss,
                              step=self.optimizer.iterations)
            tf.summary.scalar('Estimated Q Value',
                              tf.reduce_mean(action_values),
                              step=self.optimizer.iterations)

    def update(self):
        """
        Computes the target for the MSE loss and calls the tf.function for gradient descent
        """
        if len(self.buffer) >= self.batch_size:
            # Sample random minibatch of N transitions
            states, actions, rewards, next_states, dones = self.buffer.sample(
                self.batch_size)

            # Adjust the targets for non-terminal states
            next_state_pred = self.target(next_states)
            targets = rewards + self.gamma * next_state_pred.numpy().max(
                axis=1) * (1 - dones)
            batch_range = tf.range(start=0, limit=actions.shape[0])
            indices = tf.stack((batch_range, actions), axis=1)

            # update critic by minimizing the MSE loss
            self.train_step(states, indices, targets)

    def learn(self):
        """Learns via Deep-Q-Networks (DQN)"""
        obs = self.env.reset()
        total_reward = 0
        ep = 0
        ep_len = 0
        rand_actions = 0
        mean_reward = None
        for t in range(self.max_steps):

            if t % self.target_update == 0:
                copy_weights(self.model.variables, self.target.variables)

            # exponential epsilon decay from https://jaromiru.com/2016/10/03/lets-make-a-dqn-implementation/
            eps = self.min_eps + (self.max_eps - self.min_eps) * np.exp(
                self.eps_decay_rate * t)
            if self.render:
                self.env.render()

            # Take exploratory action with probability epsilon
            if np.random.uniform() < eps:
                action = self.env.action_space.sample()
                rand_actions += 1
            else:
                action = self.act(np.expand_dims(obs, axis=0))

            # Execute action in emulator and observe reward and next state
            new_obs, reward, done, info = self.env.step(action)
            total_reward += reward

            # Store transition s_t, a_t, r_t, s_t+1 in replay buffer
            self.buffer.add((obs, action, reward, new_obs, done))

            # Perform learning step
            self.update()

            obs = new_obs
            ep_len += 1
            if done or ep_len >= self.max_episode_len:
                with self.summary_writer.as_default():
                    ep += 1
                    self.rewards.append(total_reward)
                    total_reward = 0
                    obs = self.env.reset()

                    if ep % self.print_freq == 0 and ep > 0:
                        new_mean_reward = np.mean(
                            self.rewards[-self.print_freq:])

                        print(
                            f"-------------------------------------------------------"
                        )
                        print(
                            f"Mean {self.print_freq} Episode Reward: {new_mean_reward}"
                        )
                        print(f"Exploration fraction: {rand_actions / ep_len}")
                        print(f"Total Episodes: {ep}")
                        print(f"Total timesteps: {t}")
                        print(
                            f"-------------------------------------------------------"
                        )

                        tf.summary.scalar(
                            f'Mean {self.print_freq} Episode Reward',
                            new_mean_reward,
                            step=t)
                        tf.summary.scalar(f'Epsilon', eps, step=t)

                        # Model saving inspired by Open AI Baseline implementation
                        if (mean_reward is None or new_mean_reward >=
                                mean_reward) and self.save_path is not None:
                            print(
                                f"Saving model due to mean reward increase:{mean_reward} -> {new_mean_reward}"
                            )
                            print(f'Location: {self.save_path}')
                            mean_reward = new_mean_reward
                            self.model.save_weights(self.save_path)

                    ep_len = 0
                    rand_actions = 0
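
Example #5 uses a copy_weights helper and a tf.keras QNetwork subclass that are not listed here. A minimal sketch consistent with how they are called above follows; both bodies are assumptions rather than the original code.

import tensorflow as tf


def copy_weights(source_variables, target_variables):
    # Hard-copy each Q network variable into the corresponding target variable
    for source, target in zip(source_variables, target_variables):
        target.assign(source)


class QNetwork(tf.keras.Model):
    """Small fully connected action-value model."""

    def __init__(self, num_actions, hidden_units=64, name='q_network'):
        super().__init__(name=name)
        self.hidden = tf.keras.layers.Dense(hidden_units, activation='relu')
        self.out = tf.keras.layers.Dense(num_actions)

    def call(self, states):
        return self.out(self.hidden(states))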
Example #6
def train(conf,
          env,
          model,
          num_episodes=500,
          batch_size=100,
          buffer_size=10000):
    conf.buffer_size = buffer_size
    conf.batch_size = batch_size

    replay_buffer = ReplayBuffer(size=buffer_size)
    discount_rate = conf.discount_rate
    eps = conf.initial_eps
    decay_factor = conf.decay_factor
    for episode in range(num_episodes):
        print("Episode {}".format(episode))
        observation = env.reset()
        eps *= decay_factor
        done = False
        total_food = 0
        step = 0
        while not done:
            model_input = np.array([observation])
            prediction = model.predict(model_input)
            if np.random.random() < eps:
                action = np.random.randint(0, 4)
                was_random = True
            else:
                action = np.argmax(prediction)
                was_random = False

            debugger.print_step_before_move(step, observation, prediction,
                                            action, was_random)

            debugger.render_env_until_key_press(env)

            new_observation, reward, done, _ = env.step(action)

            replay_buffer.add(observation, action, reward, new_observation,
                              float(done))

            # target_action_score = reward + (0 if done else discount_rate * np.max(model.predict(
            #     np.array([new_observation]))))

            # label = prediction
            # label[0][action] = target_action_score
            # model.fit(model_input, label, epochs=1,
            #           verbose=0)

            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            labels = model.predict(obses_t)
            targets = discount_rate * np.max(model.predict(obses_tp1), axis=1)
            # print('targets', targets)
            # print('rewards', rewards)
            for i in range(len(dones)):
                if dones[i]:
                    targets[i] = 0
                targets[i] += rewards[i]
                labels[i][actions[i]] = targets[i]
            model.fit(obses_t, labels, epochs=1, verbose=0)

            weights, batch_idxes = np.ones_like(rewards), None  # unused in this training loop

            # debugger.print_step_after_move(reward, target_action_score,
            #                       label, model.predict(model_input))

            if reward > 0:
                total_food += 1
            step += 1

            observation = new_observation
        wandb.log({
            'episode': episode,
            'total_food': total_food,
            'eps': eps,
            'lifetime': step
        })
        print('Score: {}'.format(total_food))
        print()
    env.close()
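
The train() function above expects a compiled Keras model with one output per action (the action space here has four discrete actions), so that model.predict returns Q values and model.fit regresses them with an MSE loss. A hedged sketch of such a model; the layer sizes and optimizer are assumptions, and conf, debugger and wandb come from the surrounding project.

import tensorflow as tf


def build_model(obs_dim, num_actions=4):
    # One Q-value output per action, regressed against the bootstrapped targets with MSE
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(obs_dim,)),
        tf.keras.layers.Dense(num_actions, activation='linear'),
    ])
    model.compile(optimizer='adam', loss='mse')
    return model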