Python ReplayMemory.sample примеры использования

Язык программирования: Python

Пространство имен/Пакет: model.replay_memory

Класс/Тип: ReplayMemory

Метод/Функция: sample

Примеров на hotexamples.com: 3

Найдено 3 примера использования Python ReplayMemory.sample. Это лучшие примеры кода на Python для model.replay_memory.ReplayMemory.sample, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить их качество.

Основные методы

Показать Скрыть

ReplayMemory(5)

push(3)

sample(3)

add(2)

Основные методы

ReplayMemory (5)

push (3)

sample (3)

add (2)

Пример #1

Показать файл

class Agent(BaseAgent):
    '''Deep Trading Agent based on Deep Q Learning.

    Builds a prediction network (``self.q``) and a target network
    (``self.t_q``), both ``DeepSense`` models, and trains the prediction
    network on batches drawn from a ``ReplayMemory``.

    Several attributes read here (``max_step``, ``learn_start``,
    ``test_step``, ``train_frequency``, ``target_q_update_step``,
    ``ep_start``/``ep_end``/``ep_end_t``, ``min_reward``/``max_reward``,
    ``env_name`` and the ``learning_rate*`` values) are never assigned in this
    class; presumably ``BaseAgent`` provides them from ``config`` -- TODO
    confirm.

    TODO:
        1. play
    '''

    def __init__(self, sess, logger, config, env):
        '''Create the agent and build its TensorFlow graph.

        Args:
            sess: TensorFlow session used to run the training and summary ops.
            logger: logger shared with the history, replay memory and networks.
            config: configuration object; also indexed by TENSORBOARD_LOG_DIR.
            env: environment providing ``new_random_episode``, ``act`` and
                ``action_size``.
        '''
        super(Agent, self).__init__(config, logger)
        self.sess = sess
        self.logger = logger
        self.config = config
        params = DeepSenseParams(config)

        self.env = env
        self.history = History(logger, config)
        self.replay_memory = ReplayMemory(logger, config)

        # Persisted step counter so that training can resume where it stopped.
        with tf.variable_scope(STEPS):
            self.step_op = tf.Variable(0, trainable=False, name=STEP)
            self.step_input = tf.placeholder('int32', None, name=STEP_INPUT)
            self.step_assign_op = self.step_op.assign(self.step_input)

        self.build_dqn(params)

    @property
    def summary_writer(self):
        '''Summary file writer created in ``build_dqn``.'''
        return self._summary_writer

    def train(self):
        '''Run the act/observe loop from the stored step up to ``max_step``.

        Every ``test_step`` steps (once ``learn_start`` has been reached) the
        running statistics are logged, the model is saved when the average
        episode reward is within 90% of the best seen so far, summaries are
        written, and the statistics are reset.
        '''
        start_step = self.step_op.eval()

        num_episodes, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        max_avg_ep_reward = 0
        ep_rewards, actions = [], []

        self.env.new_random_episode(self.history)

        for self.step in tqdm(range(start_step, self.max_step),
                              ncols=70,
                              initial=start_step):
            if self.step == self.learn_start:
                # Drop everything collected before learning begins.
                num_episodes, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_rewards, actions = [], []

            # 1. predict
            action = self.predict(self.history.get())
            # 2. act
            screen, reward, terminal = self.env.act(action)
            # 3. observe
            self.observe(screen, reward, action, terminal)

            if terminal:
                self.env.new_random_episode(self.history)
                num_episodes += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.
            else:
                ep_reward += reward

            actions.append(action)
            total_reward += reward

            is_report_step = (
                self.step >= self.learn_start
                and self.step % self.test_step == self.test_step - 1)
            if not is_report_step:
                continue

            avg_reward = total_reward / self.test_step
            # No mini-batch may have run in this window; avoid dividing by 0.
            if self.update_count:
                avg_loss = self.total_loss / self.update_count
                avg_q = self.total_q / self.update_count
            else:
                avg_loss, avg_q = 0., 0.

            if ep_rewards:
                max_ep_reward = np.max(ep_rewards)
                min_ep_reward = np.min(ep_rewards)
                avg_ep_reward = np.mean(ep_rewards)
            else:
                # No episode finished in this window.
                max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0

            message = 'avg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d' \
                % (avg_reward, avg_loss, avg_q, avg_ep_reward, max_ep_reward, min_ep_reward, num_episodes)
            print_and_log_message(message, self.logger)

            if max_avg_ep_reward * 0.9 <= avg_ep_reward:
                self.step_assign_op.eval({self.step_input: self.step + 1})
                self.save_model(self.step + 1)

                max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward)

            # NOTE(review): 180 is a hard-coded threshold whose rationale is
            # not visible in this class.
            if self.step > 180:
                self.inject_summary(
                    {
                        'average.reward': avg_reward,
                        'average.loss': avg_loss,
                        'average.q': avg_q,
                        'episode.max reward': max_ep_reward,
                        'episode.min reward': min_ep_reward,
                        'episode.avg reward': avg_ep_reward,
                        'episode.num of game': num_episodes,
                        'episode.rewards': ep_rewards,
                        'episode.actions': actions,
                        'training.learning_rate':
                        self.learning_rate_op.eval(
                            {self.learning_rate_step: self.step}),
                    }, self.step)

            # Start a fresh statistics window.
            num_episodes = 0
            total_reward = 0.
            self.total_loss = 0.
            self.total_q = 0.
            self.update_count = 0
            ep_reward = 0.
            ep_rewards = []
            actions = []

    def predict(self, s_t, test_ep=None):
        '''Choose an action for state ``s_t`` with an epsilon-greedy policy.

        Args:
            s_t: current state, fed to the prediction network as a batch of
                one.
            test_ep: exploration probability to use instead of the schedule.
                ``None`` selects the schedule; an explicit ``0`` is honoured
                and yields a purely greedy choice.

        Returns:
            The selected action index.
        '''
        if test_ep is not None:
            ep = test_ep
        else:
            # Linear decay from ep_start to ep_end over ep_end_t steps,
            # counted from learn_start.
            steps_learning = max(0., self.step - self.learn_start)
            ep = self.ep_end + max(
                0., (self.ep_start - self.ep_end) *
                (self.ep_end_t - steps_learning) / self.ep_end_t)

        if random.random() < ep:
            action = random.randrange(self.env.action_size)
        else:
            action = self.q.action.eval({self.s_t: [s_t]})[0]

        return action

    def observe(self, screen, reward, action, terminal):
        '''Record a transition and trigger learning/target updates when due.

        Args:
            screen: observation returned by the environment.
            reward: raw reward; clipped to ``[min_reward, max_reward]`` before
                being stored.
            action: action that was taken.
            terminal: whether the episode ended with this transition.
        '''
        # clip reward in the range min to max
        reward = max(self.min_reward, min(self.max_reward, reward))

        self.history.add(screen)
        self.replay_memory.add(screen, reward, action, terminal)

        if self.step > self.learn_start:
            if self.step % self.train_frequency == 0:
                self.q_learning_mini_batch()

            if self.step % self.target_q_update_step == self.target_q_update_step - 1:
                self.update_target_network()

    def q_learning_mini_batch(self):
        '''Run one optimisation step on a batch sampled from replay memory.

        Does nothing until the replay memory holds at least
        ``history_length`` entries.
        '''
        if self.replay_memory.count < self.replay_memory.history_length:
            return

        s_t, action, reward, s_t_plus_1, terminal = self.replay_memory.sample()

        # Bootstrap from the target network's largest Q-value. ``.action`` is
        # the index of the best action (see ``predict``), not its value, so
        # the maximum is taken over ``.values`` instead.
        q_t_plus_1 = self.t_q.values.eval({self.t_s_t: s_t_plus_1})
        max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)
        terminal = np.array(terminal) + 0.
        # NOTE(review): no discount factor is applied to the bootstrap term --
        # confirm this is intentional.
        target_q = reward + (1 - terminal) * max_q_t_plus_1

        _, q_t, loss, avg_q_summary = self.sess.run(
            [
                self.optimizer, self.q.values, self.loss,
                self.q.avg_q_summary
            ], {
                self.target_q: target_q,
                self.action: action,
                self.s_t: s_t,
                self.learning_rate_step: self.step,
            })

        self.summary_writer.add_summary(avg_q_summary, self.step)
        self.total_loss += loss
        self.total_q += q_t.mean()
        self.update_count += 1

    def build_dqn(self, params):
        '''Build networks, loss, optimiser and summary ops, then initialise.

        Args:
            params: ``DeepSenseParams`` passed to both ``DeepSense`` networks.
        '''
        state_shape = [
            None,
            self.replay_memory.history_length,
            self.replay_memory.num_channels,
        ]

        with tf.variable_scope(PREDICTION):
            self.s_t = tf.placeholder(dtype=tf.float32, shape=state_shape)
        self.q = DeepSense(params,
                           self.logger,
                           self.sess,
                           self.config,
                           name=Q_NETWORK)
        self.q.build_model(self.s_t)

        with tf.variable_scope(TARGET):
            self.t_s_t = tf.placeholder(dtype=tf.float32, shape=state_shape)
        self.t_q = DeepSense(params,
                             self.logger,
                             self.sess,
                             self.config,
                             name=T_Q_NETWORK)
        self.t_q.build_model(self.t_s_t, train=False)

        # One placeholder + assign op per weight, used to copy the prediction
        # network into the target network.
        with tf.variable_scope(UPDATE_TARGET_NETWORK):
            self.q_weights_placeholders = {}
            self.t_weights_assign_ops = {}

            for name in self.q.weights.keys():
                self.q_weights_placeholders[name] = tf.placeholder(
                    tf.float32, self.q.weights[name].get_shape().as_list())
            for name in self.q.weights.keys():
                self.t_weights_assign_ops[name] = self.t_q.weights[
                    name].assign(self.q_weights_placeholders[name])

        with tf.variable_scope(TRAINING):
            self.target_q = tf.placeholder(tf.float32, [None], name=TARGET_Q)
            self.action = tf.placeholder(tf.int64, [None], name=ACTION)

            # Select the Q-value of the action actually taken.
            action_one_hot = tf.one_hot(self.action,
                                        self.env.action_size,
                                        1.0,
                                        0.0,
                                        name=ACTION_ONE_HOT)
            q_acted = tf.reduce_sum(self.q.values * action_one_hot,
                                    reduction_indices=1,
                                    name=Q_ACTED)

            with tf.variable_scope(LOSS):
                self.delta = self.target_q - q_acted

                self.global_step = tf.Variable(0, trainable=False)

                self.loss = tf.reduce_mean(clipped_error(self.delta),
                                           name=LOSS)

            with tf.variable_scope(OPTIMIZER):
                self.learning_rate_step = tf.placeholder(
                    tf.int64, None, name=LEARNING_RATE_STEP)
                # Staircase exponential decay, floored at the minimum rate.
                self.learning_rate_op = tf.maximum(
                    self.learning_rate_minimum,
                    tf.train.exponential_decay(self.learning_rate,
                                               self.learning_rate_step,
                                               self.learning_rate_decay_step,
                                               self.learning_rate_decay,
                                               staircase=True))

                self.optimizer = tf.train.RMSPropOptimizer(
                    self.learning_rate_op, momentum=0.95,
                    epsilon=0.01).minimize(self.loss)

        with tf.variable_scope(SUMMARY):
            scalar_summary_tags = ['average.reward', 'average.loss', 'average.q', \
                'episode.max reward', 'episode.min reward', 'episode.avg reward', \
                'episode.num of game', 'training.learning_rate']

            self.summary_placeholders = {}
            self.summary_ops = {}

            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = \
                    tf.placeholder('float32', None, name=tag.replace(' ', '_'))
                self.summary_ops[tag] = \
                    tf.summary.scalar(
                        name="{}-{}".format(self.env_name, tag),
                        tensor=self.summary_placeholders[tag]
                    )

            histogram_summary_tags = ['episode.rewards', 'episode.actions']
            for tag in histogram_summary_tags:
                self.summary_placeholders[tag] = \
                    tf.placeholder('float32', None, name=tag.replace(' ', '_'))
                # Both arguments positional: a positional argument may not
                # follow a keyword argument.
                self.summary_ops[tag] = \
                    tf.summary.histogram(
                        tag,
                        self.summary_placeholders[tag]
                    )

        self._summary_writer = tf.summary.FileWriter(
            self.config[TENSORBOARD_LOG_DIR])
        self._summary_writer.add_graph(self.sess.graph)

        tf.global_variables_initializer().run()
        self._saver = tf.train.Saver(
            list(self.q.weights.values()) + [self.step_op],
            max_to_keep=30)

        self.load_model()
        self.update_target_network()

    def update_target_network(self):
        '''Copy every prediction-network weight into the target network.'''
        for name in self.q.weights.keys():
            self.t_weights_assign_ops[name].eval({
                self.q_weights_placeholders[name]:
                self.q.weights[name].eval()
            })

    def inject_summary(self, tag_dict, step):
        '''Evaluate the summary ops for ``tag_dict`` and write them out.

        Args:
            tag_dict: mapping from summary tag to the value to record; every
                tag must have been registered in ``build_dqn``.
            step: global step the summaries are recorded against.
        '''
        summary_str_lists = self.sess.run(
            [self.summary_ops[tag] for tag in tag_dict.keys()], {
                self.summary_placeholders[tag]: value
                for tag, value in tag_dict.items()
            })
        for summary_str in summary_str_lists:
            self.summary_writer.add_summary(summary_str, step)

Пример #2

Показать файл

class DQN:
    """Deep Q-Network learner with a policy network and a target network.

    Transitions are read from ``self.memory`` (a ``ReplayMemory``) and are
    expected to unpack into ``Transition`` records exposing ``state``,
    ``action``, ``reward`` and ``next_state``.
    """

    def __init__(self, config, network, loss, optimizer):
        """Set up the networks, the replay memory and the optimiser.

        Args:
            config: mapping read for the keys 'REPLAY', 'lr', 'EPS_START',
                'EPS_END', 'EPS_DECAY', 'ACTION_SPACE', 'BATCH_SIZE' and
                'GAMMA'.
            network: module used as the policy network; an independent copy
                of it becomes the target network.
            loss: callable ``loss(prediction, target)`` returning a tensor.
            optimizer: factory called as ``optimizer(parameters, lr)``.
        """
        # Local import keeps this block self-contained.
        import copy

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.memory = ReplayMemory(config['REPLAY'])
        self.policy_net = network.to(self.device)
        # ``Module.to`` returns the module itself, so the target network must
        # be an explicit copy; otherwise both names alias the same weights.
        self.target_net = copy.deepcopy(self.policy_net)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.loss = loss
        self.optimizer = optimizer(self.policy_net.parameters(), config['lr'])
        self.steps_done = 0
        self.config = config

    def update(self):
        """Copy the policy network's weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def select_action(self, state):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        The exploration threshold decays exponentially from EPS_START to
        EPS_END with ``steps_done``, which is incremented on every call.

        Returns:
            A ``1 x 1`` tensor holding the chosen action index.
        """
        EPS_START, EPS_END, EPS_DECAY, n_actions = self.config[
            'EPS_START'], self.config['EPS_END'], self.config[
                'EPS_DECAY'], self.config['ACTION_SPACE']
        sample = random.random()
        eps_threshold = EPS_END + (EPS_START - EPS_END) * \
            math.exp(-1. * self.steps_done / EPS_DECAY)
        self.steps_done += 1
        if sample > eps_threshold:
            with torch.no_grad():
                # t.max(1) will return largest column value of each row.
                # second column on max result is index of where max element was
                # found, so we pick action with the larger expected reward.
                return self.policy_net(state).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(n_actions)]],
                                device=self.device,
                                dtype=torch.long)

    def optimize_model(self):
        """Run one optimisation step on a batch sampled from replay memory.

        Returns ``None`` without doing anything while the memory holds fewer
        than BATCH_SIZE transitions.
        """
        BATCH_SIZE = self.config['BATCH_SIZE']
        if len(self.memory) < BATCH_SIZE:
            return
        transitions = self.memory.sample(BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states
        # (a final state would've been the one after which simulation ended)
        non_final_mask = torch.tensor(
            tuple(s is not None for s in batch.next_state),
            device=self.device,
            dtype=torch.bool)
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(BATCH_SIZE, device=self.device)
        # torch.cat rejects an empty list, so skip the target-network pass
        # when every sampled transition ended its episode.
        if non_final_mask.any():
            non_final_next_states = torch.cat(
                [s for s in batch.next_state if s is not None])
            with torch.no_grad():
                next_state_values[non_final_mask] = self.target_net(
                    non_final_next_states).max(1)[0]
        # Compute the expected Q values
        GAMMA = self.config['GAMMA']
        expected_state_action_values = (next_state_values *
                                        GAMMA) + reward_batch

        # Compute the loss between predicted and expected Q values
        loss = self.loss(state_action_values,
                         expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        # Clamp each gradient element to [-1, 1]; parameters that received no
        # gradient are skipped instead of raising on ``None``.
        torch.nn.utils.clip_grad_value_(self.policy_net.parameters(), 1)
        self.optimizer.step()

Пример #3

Показать файл

class Agent(BaseAgent):
    '''Deep Trading Agent based on Deep Q Learning.

    Builds a prediction network (``self.q``) and a target network
    (``self.t_q``), both ``DeepSense`` models, and trains the prediction
    network on batches drawn from a ``ReplayMemory``.

    Several attributes read here (``max_step``, ``learn_start``,
    ``train_frequency``, ``target_q_update_step``,
    ``ep_start``/``ep_end``/``ep_end_t``, ``min_reward``/``max_reward`` and
    the ``learning_rate*`` values) are never assigned in this class;
    presumably ``BaseAgent`` provides them from ``config`` -- TODO confirm.

    TODO:
        1. add summary ops
        2. timing and logging
        3. model saving
        4. increment self.step
    '''

    def __init__(self, sess, logger, config, env):
        '''Create the agent and build its TensorFlow graph.

        Args:
            sess: TensorFlow session used to run the training ops.
            logger: logger shared with the history, replay memory and networks.
            config: configuration object passed to the collaborators.
            env: environment providing ``new_random_episode``, ``act`` and
                ``action_size``.
        '''
        super(Agent, self).__init__(config)
        self.sess = sess
        self.logger = logger
        self.config = config
        params = DeepSenseParams(config)

        self.env = env
        self.history = History(logger, config)
        self.replay_memory = ReplayMemory(logger, config)

        # Step counter kept as a graph variable with an explicit assign op.
        with tf.variable_scope(STEPS):
            self.step_op = tf.Variable(0, trainable=False, name=STEP)
            self.step_input = tf.placeholder('int32', None, name=STEP_INPUT)
            self.step_assign_op = self.step_op.assign(self.step_input)

        self.build_dqn(params)

    def train(self):
        '''Run the act/observe loop from the stored step up to ``max_step``.

        Running statistics are accumulated but not yet reported or saved
        (see the class TODO list).
        '''
        start_step = self.step_op.eval()

        num_episodes, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        max_avg_ep_reward = 0
        ep_rewards, actions = [], []

        self.env.new_random_episode(self.history)

        for self.step in range(start_step, self.max_step):
            if self.step == self.learn_start:
                # Drop everything collected before learning begins.
                num_episodes, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_rewards, actions = [], []

            # 1. predict
            action = self.predict(self.history.get())
            # 2. act
            screen, reward, terminal = self.env.act(action)
            # 3. observe
            self.observe(screen, reward, action, terminal)

            if terminal:
                self.env.new_random_episode(self.history)
                num_episodes += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.
            else:
                ep_reward += reward

            actions.append(action)
            total_reward += reward

    def predict(self, s_t, test_ep=None):
        '''Choose an action for state ``s_t`` with an epsilon-greedy policy.

        Args:
            s_t: current state, fed to the prediction network as a batch of
                one.
            test_ep: exploration probability to use instead of the schedule.
                ``None`` selects the schedule; an explicit ``0`` is honoured
                and yields a purely greedy choice.

        Returns:
            The selected action index.
        '''
        if test_ep is not None:
            ep = test_ep
        else:
            # Linear decay from ep_start to ep_end over ep_end_t steps,
            # counted from learn_start.
            steps_learning = max(0., self.step - self.learn_start)
            ep = self.ep_end + max(
                0., (self.ep_start - self.ep_end) *
                (self.ep_end_t - steps_learning) / self.ep_end_t)

        if random.random() < ep:
            action = random.randrange(self.env.action_size)
        else:
            action = self.q.action.eval({self.s_t: [s_t]})[0]

        return action

    def observe(self, screen, reward, action, terminal):
        '''Record a transition and trigger learning/target updates when due.

        Args:
            screen: observation returned by the environment.
            reward: raw reward; clipped to ``[min_reward, max_reward]`` before
                being stored.
            action: action that was taken.
            terminal: whether the episode ended with this transition.
        '''
        # clip reward in the range min to max
        reward = max(self.min_reward, min(self.max_reward, reward))

        self.history.add(screen)
        self.replay_memory.add(screen, reward, action, terminal)

        if self.step > self.learn_start:
            if self.step % self.train_frequency == 0:
                self.q_learning_mini_batch()

            if self.step % self.target_q_update_step == self.target_q_update_step - 1:
                self.update_target_network()

    def q_learning_mini_batch(self):
        '''Run one optimisation step on a batch sampled from replay memory.

        Does nothing until the replay memory holds at least
        ``history_length`` entries.
        '''
        if self.replay_memory.count < self.replay_memory.history_length:
            return

        s_t, action, reward, s_t_plus_1, terminal = self.replay_memory.sample()

        # Bootstrap from the target network's largest Q-value. ``.action`` is
        # the index of the best action (see ``predict``), not its value, so
        # the maximum is taken over ``.values`` instead.
        q_t_plus_1 = self.t_q.values.eval({self.t_s_t: s_t_plus_1})
        max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)
        terminal = np.array(terminal) + 0.
        # NOTE(review): no discount factor is applied to the bootstrap term --
        # confirm this is intentional.
        target_q = reward + (1 - terminal) * max_q_t_plus_1

        _, q_t, loss = self.sess.run(
            [self.optimizer, self.q.values, self.loss], {
                self.target_q: target_q,
                self.action: action,
                self.s_t: s_t,
                self.learning_rate_step: self.step,
            })

        self.total_loss += loss
        self.total_q += q_t.mean()
        self.update_count += 1

    def build_dqn(self, params):
        '''Build the networks, the loss and the optimiser.

        Args:
            params: ``DeepSenseParams`` passed to both ``DeepSense`` networks.
        '''
        state_shape = [
            None,
            self.replay_memory.history_length,
            self.replay_memory.num_channels,
        ]

        with tf.variable_scope(PREDICTION):
            self.s_t = tf.placeholder(dtype=tf.float32, shape=state_shape)
        self.q = DeepSense(params,
                           self.logger,
                           self.sess,
                           self.config,
                           name=Q_NETWORK)
        self.q.build_model(self.s_t)

        with tf.variable_scope(TARGET):
            self.t_s_t = tf.placeholder(dtype=tf.float32, shape=state_shape)
        self.t_q = DeepSense(params,
                             self.logger,
                             self.sess,
                             self.config,
                             name=T_Q_NETWORK)
        self.t_q.build_model(self.t_s_t, train=False)

        # One placeholder + assign op per weight, used to copy the prediction
        # network into the target network.
        with tf.variable_scope(UPDATE_TARGET_NETWORK):
            self.q_weights_placeholders = {}
            self.t_weights_assign_ops = {}

            for name in self.q.weights.keys():
                self.q_weights_placeholders[name] = tf.placeholder(
                    tf.float32, self.q.weights[name].get_shape().as_list())
            for name in self.q.weights.keys():
                self.t_weights_assign_ops[name] = self.t_q.weights[
                    name].assign(self.q_weights_placeholders[name])

        with tf.variable_scope(TRAINING):
            self.target_q = tf.placeholder(tf.float32, [None], name=TARGET_Q)
            self.action = tf.placeholder(tf.int64, [None], name=ACTION)

            # Select the Q-value of the action actually taken.
            action_one_hot = tf.one_hot(self.action,
                                        self.env.action_size,
                                        1.0,
                                        0.0,
                                        name=ACTION_ONE_HOT)
            q_acted = tf.reduce_sum(self.q.values * action_one_hot,
                                    reduction_indices=1,
                                    name=Q_ACTED)

            with tf.variable_scope(LOSS):
                self.delta = self.target_q - q_acted

                self.global_step = tf.Variable(0, trainable=False)

                self.loss = tf.reduce_mean(clipped_error(self.delta),
                                           name=LOSS)
            with tf.variable_scope(OPTIMIZER):
                self.learning_rate_step = tf.placeholder(
                    tf.int64, None, name=LEARNING_RATE_STEP)
                # Staircase exponential decay, floored at the minimum rate.
                self.learning_rate_op = tf.maximum(
                    self.learning_rate_minimum,
                    tf.train.exponential_decay(self.learning_rate,
                                               self.learning_rate_step,
                                               self.learning_rate_decay_step,
                                               self.learning_rate_decay,
                                               staircase=True))

                self.optimizer = tf.train.RMSPropOptimizer(
                    self.learning_rate_op, momentum=0.95,
                    epsilon=0.01).minimize(self.loss)

        # TODO: variables are not initialised here and the target network is
        # not yet synchronised with the prediction network (via
        # update_target_network); both must happen before training.

    def update_target_network(self):
        '''Copy every prediction-network weight into the target network.'''
        for name in self.q.weights.keys():
            self.t_weights_assign_ops[name].eval({
                self.q_weights_placeholders[name]:
                self.q.weights[name].eval()
            })