Python ReplayMemory.append примеры использования

Язык программирования: Python

Пространство имен/Пакет: parl.utils

Класс/Тип: ReplayMemory

Метод/Функция: append

Примеров на hotexamples.com: 5

Python ReplayMemory.append - 5 примеров найдено. Это лучшие примеры Python кода для parl.utils.ReplayMemory.append, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

ReplayMemory(26)

size(7)

append(5)

sample_batch(3)

make_index(1)

sample_batch_by_index(1)

Пример #1

Показать файл

def run_episode(env: Env,
                agent: parl.Agent,
                rpm: ReplayMemory,
                return_time: bool = False):
    """Run one training episode, storing transitions and training the agent.

    On every step an action is chosen: uniformly random in [-1, 1] with
    probability ``param_dict["EPSILON"]``, otherwise the agent's prediction
    perturbed by Gaussian noise (std 1.0) and clipped to [-1, 1]. The action
    is passed through ``action_mapping`` together with
    ``env.action_space.low[0]`` / ``env.action_space.high[0]``, the
    environment is stepped, and the transition is appended to ``rpm`` with
    the reward multiplied by ``param_dict["REWARD_SCALE"]``. Once ``rpm``
    holds more than ``param_dict["MEMORY_WARMUP_SIZE"]`` transitions, the
    agent learns from one sampled batch per step.

    Args:
        env: Environment providing ``reset()``, ``step(action)`` and
            ``action_space.low`` / ``action_space.high``.
        agent: Agent providing ``predict(batch_obs)`` and ``learn(...)``.
        rpm: Replay memory providing ``append``, ``size`` and
            ``sample_batch``.
        return_time: When true, timing statistics are returned as well.

    Returns:
        ``(total_reward, steps)``, or ``(total_reward, steps, time_info)``
        when ``return_time`` is true. ``total_reward`` is the sum of the
        unscaled rewards; ``time_info`` is a dict with the keys
        ``"run time"``, ``"total sample time"`` and ``"total learn time"``.
    """
    if return_time:
        start_tp = time()
        total_sample_time = 0.
        total_learn_time = 0.

    total_reward, steps = 0., 0
    obs = env.reset()
    while True:
        steps += 1
        ls_tp = time()
        if np.random.random() < param_dict["EPSILON"]:
            action = np.random.uniform(-1., 1., size=(2, ))
        else:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype("float32"))
            action = np.squeeze(action)
            # Add Gaussian noise, then clip back into [-1, 1].
            action = np.clip(np.random.normal(action, 1.0), -1., 1.)
        if return_time:
            total_sample_time += time() - ls_tp

        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])

        next_obs, reward, done, _ = env.step(action)

        rpm.append(obs, action, param_dict["REWARD_SCALE"] * reward, next_obs,
                   done)

        # Only start learning once the warm-up amount of data is collected.
        if rpm.size() > param_dict["MEMORY_WARMUP_SIZE"]:
            batch_obs, batch_action, batch_reward, batch_next_obs, \
                batch_terminal = rpm.sample_batch(param_dict["BATCH_SIZE"])
            ls_tp = time()
            agent.learn(batch_obs, batch_action, batch_reward,
                        batch_next_obs, batch_terminal)
            if return_time:
                total_learn_time += time() - ls_tp

        # Advance the observation and accumulate the reward on EVERY step,
        # including warm-up; otherwise the stored transitions would all start
        # from the initial observation until learning begins.
        obs = next_obs
        total_reward += reward

        if done:
            break

    if return_time:
        run_time = time() - start_tp
        time_info = {
            "run time": run_time,
            "total sample time": total_sample_time,
            "total learn time": total_learn_time
        }
        return total_reward, steps, time_info
    else:
        return total_reward, steps

Пример #2

Показать файл

Файл: ddpg_controller.py Проект: 666DZY666/PaddleSlim

class DDPG(RLBaseController):
    """Controller that proposes tokens with a PARL DDPG agent.

    Args:
        range_tables: Array-like of per-action ranges. It is stored after
            subtracting 1 element-wise; its length is used as the action
            dimension.
        use_gpu: Run on ``fluid.CUDAPlace(0)`` when true, otherwise on
            ``fluid.CPUPlace()``.
        **kwargs: Optional settings, each falling back to a default when the
            key is absent: ``obs_dim`` (None), ``model``
            (``default_ddpg_model``), ``actor_lr`` (1e-4), ``critic_lr``
            (1e-3), ``gamma`` (0.99), ``tau`` (0.001), ``memory_size`` (10),
            ``reward_scale`` (0.1), ``controller_batch_size`` (1) and
            ``actions_noise`` (``default_noise``).
    """

    def __init__(self, range_tables, use_gpu=False, **kwargs):
        self.use_gpu = use_gpu
        self.range_tables = range_tables - np.asarray(1)
        self.act_dim = len(self.range_tables)
        self.obs_dim = kwargs.get('obs_dim')
        self.model = kwargs.get('model', default_ddpg_model)
        self.actor_lr = kwargs.get('actor_lr', 1e-4)
        self.critic_lr = kwargs.get('critic_lr', 1e-3)
        self.gamma = kwargs.get('gamma', 0.99)
        self.tau = kwargs.get('tau', 0.001)
        self.memory_size = kwargs.get('memory_size', 10)
        self.reward_scale = kwargs.get('reward_scale', 0.1)
        self.batch_size = kwargs.get('controller_batch_size', 1)
        self.actions_noise = kwargs.get('actions_noise', default_noise)
        # Mean absolute noise applied by the latest next_tokens() call.
        self.action_dist = 0.0
        self.place = fluid.CUDAPlace(0) if self.use_gpu else fluid.CPUPlace()

        model = self.model(self.act_dim)

        # A falsy ``actions_noise`` disables exploration noise; otherwise the
        # supplied factory is called to build the noise object.
        if self.actions_noise:
            self.actions_noise = self.actions_noise()

        algorithm = parl.algorithms.DDPG(model,
                                         gamma=self.gamma,
                                         tau=self.tau,
                                         actor_lr=self.actor_lr,
                                         critic_lr=self.critic_lr)
        self.agent = DDPGAgent(algorithm, self.obs_dim, self.act_dim)
        self.rpm = ReplayMemory(self.memory_size, self.obs_dim, self.act_dim)

        self.pred_program = self.agent.pred_program
        self.learn_program = self.agent.learn_program
        self.param_dict = self.get_params(self.learn_program)

    def next_tokens(self, obs, params_dict, is_inference=False):
        """Predict the next actions for ``obs`` and map them to tokens.

        Loads ``params_dict`` into the prediction program, predicts on the
        observation (expanded to a batch of one), and, unless noise is
        disabled or ``is_inference`` is true, resamples the actions from a
        normal distribution centred on the prediction with scale
        ``self.actions_noise.stdev_curr``, clipped to [-1, 1]. The mean
        absolute perturbation is kept in ``self.action_dist``.

        Returns:
            The result of ``action_mapping`` applied to the (possibly noisy)
            actions and ``self.range_tables``.
        """
        batch_obs = np.expand_dims(obs, axis=0)
        self.set_params(self.pred_program, params_dict, self.place)
        actions = self.agent.predict(batch_obs.astype('float32'))
        # Add exploration noise to the action outside of inference.
        if self.actions_noise and is_inference == False:
            actions_noise = np.clip(
                np.random.normal(actions, scale=self.actions_noise.stdev_curr),
                -1.0, 1.0)
            self.action_dist = np.mean(np.abs(actions_noise - actions))
        else:
            actions_noise = actions
        actions_noise = action_mapping(actions_noise, self.range_tables)
        return actions_noise

    def _update_noise(self, actions_dist):
        """Forward the latest action perturbation to the noise object."""
        self.actions_noise.update(actions_dist)

    def update(self, rewards, params_dict, obs, actions, obs_next, terminal):
        """Store one transition, train the agent and return its parameters.

        The transition is appended to the replay memory with the reward
        multiplied by ``self.reward_scale``. When the memory reports more
        entries than ``self.memory_size``, a batch of ``self.batch_size`` is
        sampled and used for learning; otherwise the agent learns from the
        transition passed in.

        Returns:
            The parameters read back from the learn program after learning.
        """
        self.set_params(self.learn_program, params_dict, self.place)
        self.rpm.append(obs, actions, self.reward_scale * rewards, obs_next,
                        terminal)
        if self.actions_noise:
            self._update_noise(self.action_dist)
        # NOTE(review): if ReplayMemory caps size() at its max_size, this
        # condition may never hold and sampling never happens -- confirm.
        if self.rpm.size() > self.memory_size:
            # Must go through self.rpm: there is no module-level ``rpm``.
            obs, actions, rewards, obs_next, terminal = self.rpm.sample_batch(
                self.batch_size)
        self.agent.learn(obs, actions, rewards, obs_next, terminal)
        params_dict = self.get_params(self.learn_program)
        return params_dict

Пример #3

Показать файл

class RLBenchmarkDispatcher(DispatcherBase):
  """RL benchmark dispatcher for the elevator system."""

  def load_settings(self):
    """Build the replay memory, model, DQN algorithm, agent and counters."""
    mansion = self._mansion
    self._obs_dim = obs_dim(mansion)
    self._act_dim = act_dim(mansion)
    self._ele_num = mansion._elevator_number
    self._max_floor = mansion._floor_number
    self._global_step = 0
    # Memory and model are re-created once per elevator; only the last
    # instances stay referenced.
    for _ in range(mansion._elevator_number):
      self._rpm = ReplayMemory(MEMORY_SIZE, self._obs_dim, 1)
      self._model = RLDispatcherModel(self._act_dim)
    dqn_settings = {
        'action_dim': self._act_dim,
        'lr': 5.0e-4,
        'gamma': 0.998
    }
    self._algorithm = DQN(self._model, dqn_settings)
    self._agent = ElevatorAgent(self._algorithm, self._obs_dim, self._act_dim)
    self._warm_up_size = 2000
    self._statistic_freq = 1000
    self._loss_queue = deque()

  def feedback(self, state, action, r):
    """Record the previous transition per elevator and run one learn step.

    The previous observation/action/reward are stored against the current
    observation once the global step exceeds the warm-up size; learning
    starts once the replay memory holds more than the warm-up size.
    """
    self._global_step += 1
    current_obs = mansion_state_preprocessing(state)
    action_indices = [
        action_to_action_idx(single_action, self._act_dim)
        for single_action in action
    ]
    if self._global_step > self._warm_up_size:
      for ele in range(self._ele_num):
        self._rpm.append(
            self._last_observation_array[ele],
            self._last_action[ele],
            self._last_reward,
            deepcopy(current_obs[ele]), False)
    # Remember this step so the next call can complete the transition.
    self._last_observation_array = deepcopy(current_obs)
    self._last_action = deepcopy(action_indices)
    self._last_reward = r

    if self._rpm.size() <= self._warm_up_size:
      return

    obs_b, act_b, rew_b, next_obs_b, term_b = \
        self._rpm.sample_batch(BATCH_SIZE)
    loss = self._agent.learn(obs_b, act_b, rew_b, next_obs_b, term_b)
    # Keep at most ``_statistic_freq`` most recent losses.
    self._loss_queue.appendleft(loss)
    if len(self._loss_queue) > self._statistic_freq:
      self._loss_queue.pop()
    if self._global_step % self._statistic_freq == 0:
      mean_loss = sum(self._loss_queue)/float(len(self._loss_queue))
      self._mansion._config.log_notice(
          "Temporal Difference Error(Average) %f", mean_loss)

  def policy(self, state):
    """Return one action per elevator, chosen epsilon-greedily."""
    self._exploration_ratio = 500000.0 / (500000.0 + self._global_step) + 0.02
    q_values = self._agent.predict(mansion_state_preprocessing(state))
    chosen = []
    for ele in range(self._ele_num):
      explore = random.random() < self._exploration_ratio
      picked = (random.randint(1, self._max_floor)
                if explore else np.argmax(q_values[ele]))
      chosen.append(action_idx_to_action(int(picked), self._act_dim))
    return chosen

Пример #4

Показать файл

Файл: agent.py Проект: starinsun/multiagent-particle-envs

class MAAgent(parl.Agent):
    """One agent of a multi-agent setup, owning its own replay memory.

    Args:
        algorithm: Algorithm object handed to ``parl.Agent``.
        agent_index: Index of this agent in the per-agent lists (int).
        obs_dim_n: List of observation dimensions, one per agent.
        act_dim_n: List of action dimensions, one per agent.
        batch_size: Number of transitions sampled per learn step (int).
        speedup: When true, the programs are passed through ``parl.compile``.
    """

    def __init__(self,
                 algorithm,
                 agent_index=None,
                 obs_dim_n=None,
                 act_dim_n=None,
                 batch_size=None,
                 speedup=False):
        required_types = (
            (agent_index, int),
            (obs_dim_n, list),
            (act_dim_n, list),
            (batch_size, int),
            (speedup, bool),
        )
        for value, expected in required_types:
            assert isinstance(value, expected)

        self.agent_index = agent_index
        self.obs_dim_n = obs_dim_n
        self.act_dim_n = act_dim_n
        self.n = len(act_dim_n)
        self.batch_size = batch_size
        self.speedup = speedup
        self.global_train_step = 0

        self.memory_size = int(1e6)
        # batch_size * args.max_episode_len
        self.min_memory_size = batch_size * 25
        self.rpm = ReplayMemory(max_size=self.memory_size,
                                obs_dim=obs_dim_n[agent_index],
                                act_dim=act_dim_n[agent_index])

        super(MAAgent, self).__init__(algorithm)

        # Attention: at the very beginning the target model is synced fully.
        self.alg.sync_target(decay=0)

    def _declare_own_obs(self):
        """Declare the 'obs' input sized for this agent's observation."""
        return layers.data(name='obs',
                           shape=[self.obs_dim_n[self.agent_index]],
                           dtype='float32')

    def _declare_joint_inputs(self):
        """Declare 'obs<i>' then 'act<i>' inputs for every agent."""
        all_obs = [
            layers.data(name='obs' + str(idx),
                        shape=[self.obs_dim_n[idx]],
                        dtype='float32') for idx in range(self.n)
        ]
        all_act = [
            layers.data(name='act' + str(idx),
                        shape=[self.act_dim_n[idx]],
                        dtype='float32') for idx in range(self.n)
        ]
        return all_obs, all_act

    def build_program(self):
        """Create the predict, learn, next-Q and next-action programs."""
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()
        self.next_q_program = fluid.Program()
        self.next_a_program = fluid.Program()

        with fluid.program_guard(self.pred_program):
            own_obs = self._declare_own_obs()
            self.pred_act = self.alg.predict(own_obs)

        with fluid.program_guard(self.learn_program):
            joint_obs, joint_act = self._declare_joint_inputs()
            q_target = layers.data(name='target_q', shape=[], dtype='float32')
            self.critic_cost = self.alg.learn(joint_obs, joint_act, q_target)

        with fluid.program_guard(self.next_q_program):
            joint_obs, joint_act = self._declare_joint_inputs()
            self.next_Q = self.alg.Q_next(joint_obs, joint_act)

        with fluid.program_guard(self.next_a_program):
            own_obs = self._declare_own_obs()
            self.next_action = self.alg.predict_next(own_obs)

        if not self.speedup:
            return
        self.pred_program = parl.compile(self.pred_program)
        self.learn_program = parl.compile(self.learn_program,
                                          self.critic_cost)
        self.next_q_program = parl.compile(self.next_q_program)
        self.next_a_program = parl.compile(self.next_a_program)

    def predict(self, obs):
        """Run the predict program on a single observation."""
        single_batch = np.expand_dims(obs, axis=0).astype('float32')
        fetched = self.fluid_executor.run(self.pred_program,
                                          feed={'obs': single_batch},
                                          fetch_list=[self.pred_act])
        return fetched[0][0]

    def learn(self, agents):
        """Run one critic update using transitions from all ``agents``.

        Returns 0.0 without learning unless the step counter is a multiple
        of 100 and this agent's memory exceeds ``min_memory_size``; otherwise
        returns the fetched critic cost.
        """
        self.global_train_step += 1

        # Parameters are only updated every 100 steps, and only once enough
        # experience has been collected.
        if (self.global_train_step % 100 != 0
                or self.rpm.size() <= self.min_memory_size):
            return 0.0

        sample_index = self.rpm.make_index(self.batch_size)
        agent_ids = range(self.n)

        # The same indices are used for every agent's memory.
        batch_obs_n, batch_act_n, batch_obs_new_n = [], [], []
        for idx in agent_ids:
            obs_b, act_b, _, next_obs_b, _ = \
                agents[idx].rpm.sample_batch_by_index(sample_index)
            batch_obs_n.append(obs_b)
            batch_act_n.append(act_b)
            batch_obs_new_n.append(next_obs_b)
        _, _, batch_rew, _, batch_isOver = \
            self.rpm.sample_batch_by_index(sample_index)

        # Next actions of every agent on the sampled next observations.
        target_act_next_n = []
        for idx in agent_ids:
            peer = agents[idx]
            next_act = peer.fluid_executor.run(
                peer.next_a_program,
                feed={'obs': batch_obs_new_n[idx]},
                fetch_list=[peer.next_action])[0]
            target_act_next_n.append(next_act)

        next_feed = {
            'obs' + str(idx): batch_obs_new_n[idx]
            for idx in agent_ids
        }
        next_feed.update({
            'act' + str(idx): target_act_next_n[idx]
            for idx in agent_ids
        })
        target_q_next = self.fluid_executor.run(self.next_q_program,
                                                feed=next_feed,
                                                fetch_list=[self.next_Q])[0]

        # Compute target q.
        target_q = 0.0
        target_q += (batch_rew + self.alg.gamma *
                     (1.0 - batch_isOver) * target_q_next)

        learn_feed = {
            'obs' + str(idx): batch_obs_n[idx]
            for idx in agent_ids
        }
        learn_feed.update({
            'act' + str(idx): batch_act_n[idx]
            for idx in agent_ids
        })
        learn_feed['target_q'] = target_q.astype('float32')
        critic_cost = self.fluid_executor.run(self.learn_program,
                                              feed=learn_feed,
                                              fetch_list=[self.critic_cost])[0]

        self.alg.sync_target()
        return critic_cost

    def add_experience(self, obs, act, reward, next_obs, terminal):
        """Append one transition to this agent's replay memory."""
        self.rpm.append(obs, act, reward, next_obs, terminal)

Пример #5

Показать файл

def main():
    """Train an SAC agent on parallel CARLA environments.

    Builds the parallel training environments and a local evaluation
    environment, creates the model/algorithm/agent for the framework chosen
    by ``args.framework``, then loops until ``args.train_total_steps``:
    collect one step from every environment into the replay memory, learn
    from a sampled batch once ``WARMUP_STEPS`` transitions are stored,
    periodically save the agent, and periodically evaluate it.

    Raises:
        ValueError: If ``args.framework`` is neither ``'torch'`` nor
            ``'paddle'``.
    """
    logger.info("-----------------Carla_SAC-------------------")
    logger.set_dir('./{}_train'.format(args.env))

    # Parallel environments for training
    train_envs_params = EnvConfig['train_envs_params']
    env_num = EnvConfig['env_num']
    env_list = ParallelEnv(args.env, args.xparl_addr, train_envs_params)

    # env for eval
    eval_env_params = EnvConfig['eval_env_params']
    eval_env = LocalEnv(args.env, eval_env_params)

    obs_dim = eval_env.obs_dim
    action_dim = eval_env.action_dim

    # Initialize model, algorithm, agent, replay_memory
    if args.framework == 'torch':
        CarlaModel, SAC, CarlaAgent = TorchModel, TorchSAC, TorchAgent
    elif args.framework == 'paddle':
        CarlaModel, SAC, CarlaAgent = PaddleModel, PaddleSAC, PaddleAgent
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            "Unsupported framework {!r}; expected 'torch' or 'paddle'".format(
                args.framework))
    model = CarlaModel(obs_dim, action_dim)
    algorithm = SAC(
        model,
        gamma=GAMMA,
        tau=TAU,
        alpha=ALPHA,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = CarlaAgent(algorithm)
    rpm = ReplayMemory(
        max_size=MEMORY_SIZE, obs_dim=obs_dim, act_dim=action_dim)

    total_steps = 0
    last_save_steps = 0
    test_flag = 0

    obs_list = env_list.reset()

    while total_steps < args.train_total_steps:
        # Train episode: random actions until the memory is warmed up,
        # agent-sampled actions afterwards.
        if rpm.size() < WARMUP_STEPS:
            action_list = [
                np.random.uniform(-1, 1, size=action_dim)
                for _ in range(env_num)
            ]
        else:
            action_list = [agent.sample(obs) for obs in obs_list]
        next_obs_list, reward_list, done_list, info_list = env_list.step(
            action_list)

        # Store data in replay memory
        for i in range(env_num):
            rpm.append(obs_list[i], action_list[i], reward_list[i],
                       next_obs_list[i], done_list[i])

        obs_list = env_list.get_obs()
        total_steps = env_list.total_steps
        # Train agent after collecting sufficient data
        if rpm.size() >= WARMUP_STEPS:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = rpm.sample_batch(
                BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs,
                        batch_terminal)

        # Save agent: only after 1e5 steps, and at most once per 1e4 steps.
        if total_steps > int(1e5) and total_steps > last_save_steps + int(1e4):
            agent.save('./{}_model/step_{}_model.ckpt'.format(
                args.framework, total_steps))
            last_save_steps = total_steps

        # Evaluate episode
        if (total_steps + 1) // args.test_every_steps >= test_flag:
            # Advance the flag past the current step count so that a jump of
            # several intervals triggers a single evaluation.
            while (total_steps + 1) // args.test_every_steps >= test_flag:
                test_flag += 1
            avg_reward = run_evaluate_episodes(agent, eval_env, EVAL_EPISODES)
            tensorboard.add_scalar('eval/episode_reward', avg_reward,
                                   total_steps)
            logger.info(
                'Total steps {}, Evaluation over {} episodes, Average reward: {}'
                .format(total_steps, EVAL_EPISODES, avg_reward))