Example #1
    def run_game(self):

        config = self.config
        n = config.runs_per_agent
        prev_best_reward = -1000

        for run in range(n):

            # potentially, we can change the goals as the agent picks up more skills
            env = eval(config.environment)
            test_env = eval(config.environment)
            cLoss, aLoss = [], []

            # 0. instantiate an agent from the configured agent class
            agent = self.agentCls(**self.agentArgs)
            obs_dim, act_dim = agent.obs_dim, agent.act_dim

            # 1. instantiate a memory pool and warm up
            rpm = ReplayMemory(config.memory_size, obs_dim, act_dim)

            # 2. set up logging file
            save_dir = config.log_path + "{}_{}".format(self.name, run + 1)
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)

            # 3. start training
            test_flag, total_steps = 0, 0
            train_rewards, test_means, test_stds = [], [], []
            pbar = tqdm(total=config.train_total_steps)
            while total_steps < config.train_total_steps:

                para = [
                    config.reward_scale, config.warmup_size, config.batch_size,
                    config.expl_noise
                ]
                train_reward, steps, costC, costA = run_train_episode(
                    env, agent, rpm, *para)

                total_steps += steps
                train_rewards.append(train_reward)
                cLoss.append(costC)
                aLoss.append(costA)

                pbar.set_description('Steps: {} Reward: {}'.format(
                    total_steps, train_reward))
                pbar.update(steps)

                # 4. start testing
                if total_steps // config.test_every_steps >= test_flag:
                    while total_steps // config.test_every_steps >= test_flag:
                        test_flag += 1
                    r_mean, r_std = run_evaluate_episode(test_env, agent)
                    logger.info('Steps {}, Evaluate reward: {}'.format(
                        total_steps, r_mean))
                    test_means.append(r_mean)
                    test_stds.append(r_std)
                    if config.save_model and r_mean > prev_best_reward:
                        prev_best_reward = r_mean
                        ckpt = save_dir + '/Steps_{}_reward_{}.ckpt'.format(
                            total_steps, int(r_mean))
                        agent.save(ckpt, program=agent.pred_program)
                    np.savez(save_dir + '/record.npz',
                             train=train_rewards,
                             mean=test_means,
                             std=test_stds,
                             closs=cLoss,
                             aloss=aLoss)
            pbar.close()
            if config.visual_result:
                plot_reward(train_rewards)
                plot_reward(test_means, test_stds)
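
The loop above relies on two helpers, run_train_episode and run_evaluate_episode, that are not part of this example. Below is a minimal sketch that matches their call sites (same arguments and return values); the agent and replay-memory method names (agent.predict, agent.learn, rpm.append, rpm.sample_batch) are assumptions based on common PARL usage, not code from the example.

import numpy as np


def run_train_episode(env, agent, rpm, reward_scale, warmup_size, batch_size,
                      expl_noise):
    """Hypothetical rollout helper returning (reward, steps, critic_cost, actor_cost)."""
    obs = env.reset()
    total_reward, steps = 0.0, 0
    critic_costs, actor_costs = [], []
    done = False
    while not done:
        steps += 1
        # Deterministic action plus Gaussian exploration noise.
        action = agent.predict(obs.astype('float32'))
        action = np.clip(np.random.normal(action, expl_noise), -1.0, 1.0)
        next_obs, reward, done, _ = env.step(action)
        rpm.append(obs, action, reward_scale * reward, next_obs, done)
        # Learn only after the replay memory has warmed up.
        if rpm.size() > warmup_size:
            critic_cost, actor_cost = agent.learn(*rpm.sample_batch(batch_size))
            critic_costs.append(critic_cost)
            actor_costs.append(actor_cost)
        obs = next_obs
        total_reward += reward
    return (total_reward, steps,
            np.mean(critic_costs) if critic_costs else 0.0,
            np.mean(actor_costs) if actor_costs else 0.0)


def run_evaluate_episode(env, agent, eval_episodes=5):
    """Hypothetical evaluation helper returning (mean, std) over a few rollouts."""
    rewards = []
    for _ in range(eval_episodes):
        obs, done, episode_reward = env.reset(), False, 0.0
        while not done:
            action = agent.predict(obs.astype('float32'))
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
        rewards.append(episode_reward)
    return np.mean(rewards), np.std(rewards)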
Example #2
class MAAgent(parl.Agent):
    def __init__(self,
                 algorithm,
                 agent_index=None,
                 obs_dim_n=None,
                 act_dim_n=None,
                 batch_size=None,
                 speedup=False):
        assert isinstance(agent_index, int)
        assert isinstance(obs_dim_n, list)
        assert isinstance(act_dim_n, list)
        assert isinstance(batch_size, int)
        assert isinstance(speedup, bool)
        self.agent_index = agent_index
        self.obs_dim_n = obs_dim_n
        self.act_dim_n = act_dim_n
        self.batch_size = batch_size
        self.speedup = speedup
        self.n = len(act_dim_n)

        self.memory_size = int(1e6)
        self.min_memory_size = batch_size * 25  # batch_size * args.max_episode_len
        self.rpm = ReplayMemory(max_size=self.memory_size,
                                obs_dim=self.obs_dim_n[agent_index],
                                act_dim=self.act_dim_n[agent_index])
        self.global_train_step = 0

        super(MAAgent, self).__init__(algorithm)

        # Note: fully synchronize the target model at initialization.
        self.alg.sync_target(decay=0)

    def build_program(self):
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()
        self.next_q_program = fluid.Program()
        self.next_a_program = fluid.Program()

        with fluid.program_guard(self.pred_program):
            obs = layers.data(name='obs',
                              shape=[self.obs_dim_n[self.agent_index]],
                              dtype='float32')
            self.pred_act = self.alg.predict(obs)

        with fluid.program_guard(self.learn_program):
            obs_n = [
                layers.data(name='obs' + str(i),
                            shape=[self.obs_dim_n[i]],
                            dtype='float32') for i in range(self.n)
            ]
            act_n = [
                layers.data(name='act' + str(i),
                            shape=[self.act_dim_n[i]],
                            dtype='float32') for i in range(self.n)
            ]
            target_q = layers.data(name='target_q', shape=[], dtype='float32')
            self.critic_cost = self.alg.learn(obs_n, act_n, target_q)

        with fluid.program_guard(self.next_q_program):
            obs_n = [
                layers.data(name='obs' + str(i),
                            shape=[self.obs_dim_n[i]],
                            dtype='float32') for i in range(self.n)
            ]
            act_n = [
                layers.data(name='act' + str(i),
                            shape=[self.act_dim_n[i]],
                            dtype='float32') for i in range(self.n)
            ]
            self.next_Q = self.alg.Q_next(obs_n, act_n)

        with fluid.program_guard(self.next_a_program):
            obs = layers.data(name='obs',
                              shape=[self.obs_dim_n[self.agent_index]],
                              dtype='float32')
            self.next_action = self.alg.predict_next(obs)

        if self.speedup:
            self.pred_program = parl.compile(self.pred_program)
            self.learn_program = parl.compile(self.learn_program,
                                              self.critic_cost)
            self.next_q_program = parl.compile(self.next_q_program)
            self.next_a_program = parl.compile(self.next_a_program)

    def predict(self, obs):
        obs = np.expand_dims(obs, axis=0)
        obs = obs.astype('float32')
        act = self.fluid_executor.run(self.pred_program,
                                      feed={'obs': obs},
                                      fetch_list=[self.pred_act])[0]
        return act[0]

    def learn(self, agents):
        self.global_train_step += 1

        # only update parameter every 100 steps
        if self.global_train_step % 100 != 0:
            return 0.0

        if self.rpm.size() <= self.min_memory_size:
            return 0.0

        batch_obs_n = []
        batch_act_n = []
        batch_obs_new_n = []

        rpm_sample_index = self.rpm.make_index(self.batch_size)
        for i in range(self.n):
            batch_obs, batch_act, _, batch_obs_new, _ \
                = agents[i].rpm.sample_batch_by_index(rpm_sample_index)
            batch_obs_n.append(batch_obs)
            batch_act_n.append(batch_act)
            batch_obs_new_n.append(batch_obs_new)
        _, _, batch_rew, _, batch_isOver \
                = self.rpm.sample_batch_by_index(rpm_sample_index)

        # compute target q
        target_q = 0.0
        target_act_next_n = []
        for i in range(self.n):
            feed = {'obs': batch_obs_new_n[i]}
            target_act_next = agents[i].fluid_executor.run(
                agents[i].next_a_program,
                feed=feed,
                fetch_list=[agents[i].next_action])[0]
            target_act_next_n.append(target_act_next)
        feed_obs = {'obs' + str(i): batch_obs_new_n[i] for i in range(self.n)}
        feed_act = {
            'act' + str(i): target_act_next_n[i]
            for i in range(self.n)
        }
        feed = feed_obs.copy()
        feed.update(feed_act)  # merge two dict
        target_q_next = self.fluid_executor.run(self.next_q_program,
                                                feed=feed,
                                                fetch_list=[self.next_Q])[0]
        target_q += (batch_rew + self.alg.gamma *
                     (1.0 - batch_isOver) * target_q_next)

        feed_obs = {'obs' + str(i): batch_obs_n[i] for i in range(self.n)}
        feed_act = {'act' + str(i): batch_act_n[i] for i in range(self.n)}
        target_q = target_q.astype('float32')
        feed = feed_obs.copy()
        feed.update(feed_act)
        feed['target_q'] = target_q
        critic_cost = self.fluid_executor.run(self.learn_program,
                                              feed=feed,
                                              fetch_list=[self.critic_cost])[0]

        self.alg.sync_target()
        return critic_cost

    def add_experience(self, obs, act, reward, next_obs, terminal):
        self.rpm.append(obs, act, reward, next_obs, terminal)
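
A rough sketch of how MAAgent instances could be wired into a MADDPG-style loop, assuming one agent per entity in a multi-agent environment. MAModel, the MADDPG constructor arguments, and the environment attributes (env.n, env.obs_shape_n, env.act_shape_n) are assumptions, not part of the example above.

import parl

# Hypothetical wiring; MAModel, hyperparameters and env attributes are assumptions.
agents = []
for i in range(env.n):
    model = MAModel(env.obs_shape_n[i], env.act_shape_n[i])
    algorithm = parl.algorithms.MADDPG(model,
                                       agent_index=i,
                                       act_space=env.action_space,
                                       gamma=0.95,
                                       tau=0.01,
                                       lr=0.01)
    agents.append(MAAgent(algorithm,
                          agent_index=i,
                          obs_dim_n=env.obs_shape_n,
                          act_dim_n=env.act_shape_n,
                          batch_size=1024))

obs_n = env.reset()
action_n = [agent.predict(obs) for agent, obs in zip(agents, obs_n)]
next_obs_n, reward_n, done_n, _ = env.step(action_n)
for i, agent in enumerate(agents):
    agent.add_experience(obs_n[i], action_n[i], reward_n[i],
                         next_obs_n[i], done_n[i])
    # Each agent's centralized critic reads the other agents' replay memories.
    agent.learn(agents)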
Example #3
class DDPG(RLBaseController):
    def __init__(self, range_tables, use_gpu=False, **kwargs):
        self.use_gpu = use_gpu
        self.range_tables = range_tables - np.asarray(1)
        self.act_dim = len(self.range_tables)
        self.obs_dim = kwargs.get('obs_dim')
        self.model = kwargs.get('model', default_ddpg_model)
        self.actor_lr = kwargs.get('actor_lr', 1e-4)
        self.critic_lr = kwargs.get('critic_lr', 1e-3)
        self.gamma = kwargs.get('gamma', 0.99)
        self.tau = kwargs.get('tau', 0.001)
        self.memory_size = kwargs.get('memory_size', 10)
        self.reward_scale = kwargs.get('reward_scale', 0.1)
        self.batch_size = kwargs.get('controller_batch_size', 1)
        self.actions_noise = kwargs.get('actions_noise', default_noise)
        self.action_dist = 0.0
        self.place = paddle.CUDAPlace(0) if self.use_gpu else paddle.CPUPlace()

        model = self.model(self.act_dim)

        if self.actions_noise:
            self.actions_noise = self.actions_noise()

        algorithm = parl.algorithms.DDPG(model,
                                         gamma=self.gamma,
                                         tau=self.tau,
                                         actor_lr=self.actor_lr,
                                         critic_lr=self.critic_lr)
        self.agent = DDPGAgent(algorithm, self.obs_dim, self.act_dim)
        self.rpm = ReplayMemory(self.memory_size, self.obs_dim, self.act_dim)

        self.pred_program = self.agent.pred_program
        self.learn_program = self.agent.learn_program
        self.param_dict = self.get_params(self.learn_program)

    def next_tokens(self, obs, params_dict, is_inference=False):
        batch_obs = np.expand_dims(obs, axis=0)
        self.set_params(self.pred_program, params_dict, self.place)
        actions = self.agent.predict(batch_obs.astype('float32'))
        ### add noise to action
        if self.actions_noise and not is_inference:
            actions_noise = np.clip(
                np.random.normal(actions, scale=self.actions_noise.stdev_curr),
                -1.0, 1.0)
            self.action_dist = np.mean(np.abs(actions_noise - actions))
        else:
            actions_noise = actions
        actions_noise = action_mapping(actions_noise, self.range_tables)
        return actions_noise

    def _update_noise(self, actions_dist):
        self.actions_noise.update(actions_dist)

    def update(self, rewards, params_dict, obs, actions, obs_next, terminal):
        self.set_params(self.learn_program, params_dict, self.place)
        self.rpm.append(obs, actions, self.reward_scale * rewards, obs_next,
                        terminal)
        if self.actions_noise:
            self._update_noise(self.action_dist)
        if self.rpm.size() > self.memory_size:
            obs, actions, rewards, obs_next, terminal = self.rpm.sample_batch(
                self.batch_size)
        self.agent.learn(obs, actions, rewards, obs_next, terminal)
        params_dict = self.get_params(self.learn_program)
        return params_dict
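
A sketch of how this controller might be driven in a search loop: propose tokens with next_tokens, score the resulting candidate, then feed the reward back through update. The get_reward helper, the range-table values, and the observation encoding below are hypothetical.

import numpy as np

# Hypothetical search loop; get_reward() and the observation encoding are assumptions.
range_tables = np.array([10, 10, 10])
controller = DDPG(range_tables, obs_dim=len(range_tables))
params = controller.param_dict

obs = np.zeros(len(range_tables), dtype='float32')    # e.g. the previously sampled tokens
for step in range(100):
    tokens = controller.next_tokens(obs, params)      # propose a candidate
    reward = get_reward(tokens)                       # train/evaluate the candidate (assumed)
    next_obs = np.array(tokens, dtype='float32')
    # One-step transition; the controller samples from its replay memory internally.
    params = controller.update(reward, params, obs, tokens, next_obs, False)
    obs = next_obs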
Example #4
File: PPGD.py Project: QFaceblue/quadrotor
    act_dim = env.action_space.shape[0] + 1

    # Build the agent with the PARL framework
    model = QuadrotorModel(act_dim)
    algorithm = DDPG(model,
                     gamma=GAMMA,
                     tau=TAU,
                     actor_lr=ACTOR_LR,
                     critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm, obs_dim, act_dim)

    # # Load a saved model
    # ckpt = 'model_dir/steps_680596_8423.2103163893.ckpt'
    # agent.restore(ckpt)
    # PARL also ships a built-in ReplayMemory for DDPG; it can be imported directly from parl.utils
    rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)

    # Start training
    # test_flag = 0
    # total_steps = 0
    # while total_steps < TRAIN_TOTAL_STEPS:
    #     train_reward, steps = run_episode(env, agent, rpm)
    #     total_steps += steps
    #     #logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))  # print the training reward

    #     if total_steps // TEST_EVERY_STEPS >= test_flag:  # evaluate the model every fixed number of steps
    #         while total_steps // TEST_EVERY_STEPS >= test_flag:
    #             test_flag += 1

    #         evaluate_reward = evaluate(env, agent)
    #         logger.info('Steps {}, Test reward: {}'.format(
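
As the comment above notes, PARL ships a ReplayMemory in parl.utils; a minimal standalone usage sketch follows (the dimensions are placeholders, and append/sample_batch use the same positional argument order as the examples above).

import numpy as np
from parl.utils import ReplayMemory

# Placeholder dimensions for illustration only.
rpm = ReplayMemory(max_size=int(1e4), obs_dim=4, act_dim=2)
rpm.append(np.zeros(4, dtype='float32'),   # obs
           np.zeros(2, dtype='float32'),   # act
           0.0,                            # reward
           np.zeros(4, dtype='float32'),   # next_obs
           False)                          # terminal
obs, act, reward, next_obs, terminal = rpm.sample_batch(1)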
Example #5
class RLBenchmarkDispatcher(DispatcherBase):
  '''
  An RL benchmark dispatcher for the elevator system.
  '''

  def load_settings(self):
    self._obs_dim = obs_dim(self._mansion)
    self._act_dim = act_dim(self._mansion)
    self._ele_num = self._mansion._elevator_number
    self._max_floor = self._mansion._floor_number
    self._global_step = 0
    for i in range(self._mansion._elevator_number):
      self._rpm = ReplayMemory(MEMORY_SIZE, self._obs_dim, 1)
      self._model = RLDispatcherModel(self._act_dim)
    hyperparas = {
        'action_dim': self._act_dim,
        'lr': 5.0e-4,
        'gamma': 0.998
    }
    #print ("action dimension:", self._obs_dim, self._act_dim)
    self._algorithm = DQN(self._model, hyperparas)
    self._agent = ElevatorAgent(self._algorithm, self._obs_dim, self._act_dim)
    self._warm_up_size = 2000
    self._statistic_freq = 1000
    self._loss_queue = deque()

  def feedback(self, state, action, r):
    self._global_step += 1
    observation_array = mansion_state_preprocessing(state)
    new_actions = list()
    for ele_act in action:
      new_actions.append(action_to_action_idx(ele_act, self._act_dim))
    if(self._global_step > self._warm_up_size):
      for i in range(self._ele_num):
        self._rpm.append(
            self._last_observation_array[i], 
            self._last_action[i], 
            self._last_reward, 
            deepcopy(observation_array[i]), False)
    self._last_observation_array = deepcopy(observation_array)
    self._last_action = deepcopy(new_actions)
    self._last_reward = r

    if self._rpm.size() > self._warm_up_size:
      batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = \
          self._rpm.sample_batch(BATCH_SIZE)
      cost = self._agent.learn(batch_obs, batch_action, 
          batch_reward, batch_next_obs, batch_terminal)
      self._loss_queue.appendleft(cost)
      if(len(self._loss_queue) > self._statistic_freq):
        self._loss_queue.pop()
      if(self._global_step % self._statistic_freq == 0):
        self._mansion._config.log_notice(
            "Temporal Difference Error (Average) %f",
            sum(self._loss_queue) / float(len(self._loss_queue)))

  def policy(self, state):
    self._exploration_ratio = 500000.0 / (500000.0 + self._global_step) + 0.02
    observation_array = mansion_state_preprocessing(state)
    q_values = self._agent.predict(observation_array)
    ret_actions = list()
    for i in range(self._ele_num):
      if(random.random() < self._exploration_ratio):
        action = random.randint(1, self._max_floor)
      else:
        action = np.argmax(q_values[i])
      ret_actions.append(action_idx_to_action(int(action), self._act_dim))
    return ret_actions
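
For reference, the exploration schedule used in policy decays smoothly with the global step; the snippet below simply evaluates the formula above at a few step counts.

# Evaluate the exploration schedule from policy() at a few global-step values.
for step in (0, 500000, 2000000, 10000000):
    eps = 500000.0 / (500000.0 + step) + 0.02
    print(step, round(eps, 3))
# 0 -> 1.02, 500000 -> 0.52, 2000000 -> 0.22, 10000000 -> 0.068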
Example #6
def main():
    logger.info("-----------------Carla_SAC-------------------")
    logger.set_dir('./{}_train'.format(args.env))

    # Parallel environments for training
    train_envs_params = EnvConfig['train_envs_params']
    env_num = EnvConfig['env_num']
    env_list = ParallelEnv(args.env, args.xparl_addr, train_envs_params)

    # env for eval
    eval_env_params = EnvConfig['eval_env_params']
    eval_env = LocalEnv(args.env, eval_env_params)

    obs_dim = eval_env.obs_dim
    action_dim = eval_env.action_dim

    # Initialize model, algorithm, agent, replay_memory
    if args.framework == 'torch':
        CarlaModel, SAC, CarlaAgent = TorchModel, TorchSAC, TorchAgent
    elif args.framework == 'paddle':
        CarlaModel, SAC, CarlaAgent = PaddleModel, PaddleSAC, PaddleAgent
    model = CarlaModel(obs_dim, action_dim)
    algorithm = SAC(
        model,
        gamma=GAMMA,
        tau=TAU,
        alpha=ALPHA,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = CarlaAgent(algorithm)
    rpm = ReplayMemory(
        max_size=MEMORY_SIZE, obs_dim=obs_dim, act_dim=action_dim)

    total_steps = 0
    last_save_steps = 0
    test_flag = 0

    obs_list = env_list.reset()

    while total_steps < args.train_total_steps:
        # Train episode
        if rpm.size() < WARMUP_STEPS:
            action_list = [
                np.random.uniform(-1, 1, size=action_dim)
                for _ in range(env_num)
            ]
        else:
            action_list = [agent.sample(obs) for obs in obs_list]
        next_obs_list, reward_list, done_list, info_list = env_list.step(
            action_list)

        # Store data in replay memory
        for i in range(env_num):
            rpm.append(obs_list[i], action_list[i], reward_list[i],
                       next_obs_list[i], done_list[i])

        obs_list = env_list.get_obs()
        total_steps = env_list.total_steps
        # Train agent after collecting sufficient data
        if rpm.size() >= WARMUP_STEPS:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = rpm.sample_batch(
                BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs,
                        batch_terminal)

        # Save agent
        if total_steps > int(1e5) and total_steps > last_save_steps + int(1e4):
            agent.save('./{}_model/step_{}_model.ckpt'.format(
                args.framework, total_steps))
            last_save_steps = total_steps

        # Evaluate episode
        if (total_steps + 1) // args.test_every_steps >= test_flag:
            while (total_steps + 1) // args.test_every_steps >= test_flag:
                test_flag += 1
            avg_reward = run_evaluate_episodes(agent, eval_env, EVAL_EPISODES)
            tensorboard.add_scalar('eval/episode_reward', avg_reward,
                                   total_steps)
            logger.info(
                'Total steps {}, Evaluation over {} episodes, Average reward: {}'
                .format(total_steps, EVAL_EPISODES, avg_reward))
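
run_evaluate_episodes is not defined in this example; a minimal sketch consistent with its call site, averaging the return of several deterministic rollouts, could look like this (the agent and env APIs are assumed from the surrounding code).

import numpy as np


def run_evaluate_episodes(agent, env, eval_episodes):
    """Hypothetical evaluation helper: deterministic rollouts, averaged return."""
    episode_rewards = []
    for _ in range(eval_episodes):
        obs = env.reset()
        done, episode_reward = False, 0.0
        while not done:
            action = agent.predict(obs)   # no exploration noise during evaluation
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
        episode_rewards.append(episode_reward)
    return np.mean(episode_rewards)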
Example #7
obs_dim = 7
goal_dim = 3

act_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

model = RLBenchModel(act_dim, max_action)
algorithm = parl.algorithms.TD3(model,
                                max_action=max_action,
                                gamma=GAMMA,
                                tau=TAU,
                                actor_lr=ACTOR_LR,
                                critic_lr=CRITIC_LR)

agent = RLBenchAgent(algorithm, obs_dim + goal_dim, act_dim)
rpm = ReplayMemory(MEMORY_SIZE, obs_dim + goal_dim, act_dim)

test_flag = 0
store_flag = 0
total_episodes = 16000
while total_episodes < args.train_total_episodes:
    train_reward = run_train_episode(env, agent, rpm)
    total_episodes += 1
    logger.logging_string('Episodes: {} Reward: {}'.format(
        total_episodes, train_reward))
    #tensorboard.add_scalar('train/episode_reward', train_reward,total_episodes)

    if total_episodes // args.test_every_episodes >= test_flag:
        while total_episodes // args.test_every_episodes >= test_flag:
            test_flag += 1
        evaluate_reward = run_evaluate_episode(env, agent, render=False)
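
The agent and replay memory above are sized for obs_dim + goal_dim, i.e. the policy input is the observation concatenated with the goal. A sketch of run_train_episode under that assumption; how the env exposes the goal and the WARMUP_SIZE/BATCH_SIZE constants are hypothetical.

import numpy as np

WARMUP_SIZE, BATCH_SIZE = 1000, 256   # placeholder hyperparameters


def run_train_episode(env, agent, rpm):
    """Hypothetical goal-conditioned rollout: the policy sees [obs, goal]."""
    obs, goal = env.reset()                          # assumed to return (obs, goal)
    done, episode_reward = False, 0.0
    while not done:
        inp = np.concatenate([obs, goal]).astype('float32')
        action = agent.predict(inp)
        next_obs, reward, done, _ = env.step(action)
        rpm.append(inp, action, reward,
                   np.concatenate([next_obs, goal]).astype('float32'), done)
        if rpm.size() > WARMUP_SIZE:
            agent.learn(*rpm.sample_batch(BATCH_SIZE))
        obs = next_obs
        episode_reward += reward
    return episode_reward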
Example #8
def main(train=True):
    if train:
        # Load the data
        df = pd.read_csv('wudigushi/DATA/AAPL.csv')
        df = df.sort_values('Date')
        # Create the environment
        env = StockTradingEnv(df)
        env.reset()
        act_dim = env.action_space.shape[0]
        obs_dim = env.observation_space.shape[0]
        
        print(act_dim)
        print(obs_dim)
        
        model = WudigupiaoModel(act_dim)
        algorithm = DDPG(
            model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
        agent = MyStockAgent(algorithm, obs_dim, act_dim)

        rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)

        test_flag = 0
        total_steps = 0
        while total_steps < TRAIN_TOTAL_STEPS:
            train_reward, steps = run_episode(env, agent, rpm)
            total_steps += steps
            # logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))

            if total_steps // TEST_EVERY_STEPS >= test_flag:
                # print('s1')
                while total_steps // TEST_EVERY_STEPS >= test_flag:
                    test_flag += 1

                evaluate_reward = evaluate(env, agent)
                logger.info('Steps {}, Test reward: {}'.format(
                    total_steps, evaluate_reward))

                ckpt = 'wudigushi/ckpt/steps_{}.ckpt'.format(total_steps)
                agent.save(ckpt)
    else:
        ckpt = 'wudigushi/ckpt/steps_980117.ckpt'  # Set ckpt to the checkpoint saved from your best evaluation during training
        df = pd.read_csv('wudigushi/DATA/AAPL.csv')
        df = df.sort_values('Date')
        # Create the environment
        env = StockTradingEnv(df)
        env.reset()
        act_dim = env.action_space.shape[0]
        obs_dim = env.observation_space.shape[0]
        # obs_dim = 36
        print(act_dim)
        print(obs_dim)
        
        model = WudigupiaoModel(act_dim)
        algorithm = DDPG(
            model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
        agent = MyStockAgent(algorithm, obs_dim, act_dim)
        agent.restore(ckpt)
        evaluate_reward = evaluate(env, agent)
        logger.info('Evaluate reward: {}'.format(evaluate_reward))  # print the evaluation reward
Example #9
        env_reward.append(d_r)
    total_reward.append(np.mean(total_reward))
    env_reward.append(np.mean(env_reward))
    return total_reward, env_reward


env = make_env("Quadrotor", task="velocity_control")
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
logger.info('obs_dim:{} , act_dim:{}'.format(obs_dim, act_dim))

model = QModel(act_dim)
alg = DDPG(model, gamma=Gamma, tau=Tau, actor_lr=A_lr, critic_lr=C_lr)
agent = QAgent(alg, obs_dim, act_dim)

rpm = ReplayMemory(int(Max_Size), obs_dim, act_dim)

# if os.path.exists('./M2_-1094_Over.ckpt'):
#     agent.restore('./M2_-1094_Over.ckpt')
# else:
#     exit(1)

test_flag = 0
total_step = 0
reward_max = -1094
while True:
    step, reward = run_episode(env, agent, rpm)
    total_step += step
    logger.info('Step:{} , Train Reward:{}'.format(total_step, reward))

    if total_step // Test_Round == test_flag: