def run_game(self):
    config = self.config
    n = config.runs_per_agent
    prev_best_reward = -1000
    for run in range(n):
        # potentially, we can change the goals as the agent picks up more skills
        env = eval(config.environment)
        test_env = eval(config.environment)
        cLoss, aLoss = [], []

        # 0. instantiate an agent instance of this class
        agent = self.agentCls(**self.agentArgs)
        obs_dim, act_dim = agent.obs_dim, agent.act_dim

        # 1. instantiate a memory pool and warm up
        rpm = ReplayMemory(config.memory_size, obs_dim, act_dim)

        # 2. set up the logging directory
        save_dir = config.log_path + "{}_{}".format(self.name, run + 1)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # 3. start training
        test_flag, total_steps = 0, 0
        train_rewards, test_means, test_stds = [], [], []
        pbar = tqdm(total=config.train_total_steps)
        while total_steps < config.train_total_steps:
            para = [
                config.reward_scale, config.warmup_size, config.batch_size,
                config.expl_noise
            ]
            train_reward, steps, costC, costA = run_train_episode(
                env, agent, rpm, *para)
            total_steps += steps
            train_rewards.append(train_reward)
            cLoss.append(costC)
            aLoss.append(costA)
            pbar.set_description('Steps: {} Reward: {}'.format(
                total_steps, train_reward))
            pbar.update(steps)

            # 4. start testing
            if total_steps // config.test_every_steps >= test_flag:
                while total_steps // config.test_every_steps >= test_flag:
                    test_flag += 1
                r_mean, r_std = run_evaluate_episode(test_env, agent)
                logger.info('Steps {}, Evaluate reward: {}'.format(
                    total_steps, r_mean))
                test_means.append(r_mean)
                test_stds.append(r_std)
                if config.save_model and r_mean > prev_best_reward:
                    prev_best_reward = r_mean
                    ckpt = save_dir + '/Steps_{}_reward_{}.ckpt'.format(
                        total_steps, int(r_mean))
                    agent.save(ckpt, program=agent.pred_program)
        pbar.close()

        np.savez(save_dir + '/record.npz',
                 train=train_rewards,
                 mean=test_means,
                 std=test_stds,
                 closs=cLoss,
                 aloss=aLoss)
        if config.visual_result:
            plot_reward(train_rewards)
            plot_reward(test_means, test_stds)
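# run_train_episode and run_evaluate_episode are referenced above but not shown
# in this snippet. The following is a minimal sketch of what they might look
# like for a DDPG-style agent, assuming numpy is imported as np, env follows the
# gym reset/step convention, agent.learn returns (critic_cost, actor_cost), and
# rpm is the PARL ReplayMemory (append / size / sample_batch).
def run_train_episode(env, agent, rpm, reward_scale, warmup_size, batch_size,
                      expl_noise):
    obs = env.reset()
    total_reward, steps, done = 0, 0, False
    critic_costs, actor_costs = [], []
    while not done:
        steps += 1
        # act with exploration noise, clipped to the normalized action range
        action = agent.predict(obs.astype('float32'))
        action = np.clip(np.random.normal(action, expl_noise), -1.0, 1.0)
        next_obs, reward, done, _ = env.step(action)
        rpm.append(obs, action, reward_scale * reward, next_obs, done)
        if rpm.size() > warmup_size:
            batch = rpm.sample_batch(batch_size)
            costC, costA = agent.learn(*batch)
            critic_costs.append(costC)
            actor_costs.append(costA)
        obs = next_obs
        total_reward += reward
    mean_c = np.mean(critic_costs) if critic_costs else 0.0
    mean_a = np.mean(actor_costs) if actor_costs else 0.0
    return total_reward, steps, mean_c, mean_a


def run_evaluate_episode(env, agent, eval_episodes=5):
    # evaluation runs the deterministic policy and reports mean/std of returns
    rewards = []
    for _ in range(eval_episodes):
        obs, episode_reward, done = env.reset(), 0, False
        while not done:
            action = agent.predict(obs.astype('float32'))
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
        rewards.append(episode_reward)
    return np.mean(rewards), np.std(rewards)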
class MAAgent(parl.Agent):
    def __init__(self,
                 algorithm,
                 agent_index=None,
                 obs_dim_n=None,
                 act_dim_n=None,
                 batch_size=None,
                 speedup=False):
        assert isinstance(agent_index, int)
        assert isinstance(obs_dim_n, list)
        assert isinstance(act_dim_n, list)
        assert isinstance(batch_size, int)
        assert isinstance(speedup, bool)
        self.agent_index = agent_index
        self.obs_dim_n = obs_dim_n
        self.act_dim_n = act_dim_n
        self.batch_size = batch_size
        self.speedup = speedup
        self.n = len(act_dim_n)

        self.memory_size = int(1e6)
        self.min_memory_size = batch_size * 25  # batch_size * args.max_episode_len
        self.rpm = ReplayMemory(
            max_size=self.memory_size,
            obs_dim=self.obs_dim_n[agent_index],
            act_dim=self.act_dim_n[agent_index])
        self.global_train_step = 0

        super(MAAgent, self).__init__(algorithm)

        # Attention: in the beginning, sync the target model completely.
        self.alg.sync_target(decay=0)

    def build_program(self):
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()
        self.next_q_program = fluid.Program()
        self.next_a_program = fluid.Program()

        with fluid.program_guard(self.pred_program):
            obs = layers.data(
                name='obs',
                shape=[self.obs_dim_n[self.agent_index]],
                dtype='float32')
            self.pred_act = self.alg.predict(obs)

        with fluid.program_guard(self.learn_program):
            obs_n = [
                layers.data(
                    name='obs' + str(i),
                    shape=[self.obs_dim_n[i]],
                    dtype='float32') for i in range(self.n)
            ]
            act_n = [
                layers.data(
                    name='act' + str(i),
                    shape=[self.act_dim_n[i]],
                    dtype='float32') for i in range(self.n)
            ]
            target_q = layers.data(name='target_q', shape=[], dtype='float32')
            self.critic_cost = self.alg.learn(obs_n, act_n, target_q)

        with fluid.program_guard(self.next_q_program):
            obs_n = [
                layers.data(
                    name='obs' + str(i),
                    shape=[self.obs_dim_n[i]],
                    dtype='float32') for i in range(self.n)
            ]
            act_n = [
                layers.data(
                    name='act' + str(i),
                    shape=[self.act_dim_n[i]],
                    dtype='float32') for i in range(self.n)
            ]
            self.next_Q = self.alg.Q_next(obs_n, act_n)

        with fluid.program_guard(self.next_a_program):
            obs = layers.data(
                name='obs',
                shape=[self.obs_dim_n[self.agent_index]],
                dtype='float32')
            self.next_action = self.alg.predict_next(obs)

        if self.speedup:
            self.pred_program = parl.compile(self.pred_program)
            self.learn_program = parl.compile(self.learn_program,
                                              self.critic_cost)
            self.next_q_program = parl.compile(self.next_q_program)
            self.next_a_program = parl.compile(self.next_a_program)

    def predict(self, obs):
        obs = np.expand_dims(obs, axis=0)
        obs = obs.astype('float32')
        act = self.fluid_executor.run(
            self.pred_program, feed={'obs': obs},
            fetch_list=[self.pred_act])[0]
        return act[0]

    def learn(self, agents):
        self.global_train_step += 1

        # only update parameters every 100 steps
        if self.global_train_step % 100 != 0:
            return 0.0
        if self.rpm.size() <= self.min_memory_size:
            return 0.0

        batch_obs_n = []
        batch_act_n = []
        batch_obs_new_n = []
        rpm_sample_index = self.rpm.make_index(self.batch_size)
        for i in range(self.n):
            batch_obs, batch_act, _, batch_obs_new, _ = \
                agents[i].rpm.sample_batch_by_index(rpm_sample_index)
            batch_obs_n.append(batch_obs)
            batch_act_n.append(batch_act)
            batch_obs_new_n.append(batch_obs_new)
        _, _, batch_rew, _, batch_isOver = \
            self.rpm.sample_batch_by_index(rpm_sample_index)

        # compute the target Q value
        target_q = 0.0
        target_act_next_n = []
        for i in range(self.n):
            feed = {'obs': batch_obs_new_n[i]}
            target_act_next = agents[i].fluid_executor.run(
                agents[i].next_a_program,
                feed=feed,
                fetch_list=[agents[i].next_action])[0]
            target_act_next_n.append(target_act_next)
        feed_obs = {'obs' + str(i): batch_obs_new_n[i] for i in range(self.n)}
        feed_act = {
            'act' + str(i): target_act_next_n[i]
            for i in range(self.n)
        }
        feed = feed_obs.copy()
        feed.update(feed_act)  # merge the two dicts
        target_q_next = self.fluid_executor.run(
            self.next_q_program, feed=feed, fetch_list=[self.next_Q])[0]
        target_q += (
            batch_rew + self.alg.gamma * (1.0 - batch_isOver) * target_q_next)

        feed_obs = {'obs' + str(i): batch_obs_n[i] for i in range(self.n)}
        feed_act = {'act' + str(i): batch_act_n[i] for i in range(self.n)}
        target_q = target_q.astype('float32')
        feed = feed_obs.copy()
        feed.update(feed_act)
        feed['target_q'] = target_q
        critic_cost = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=[self.critic_cost])[0]

        self.alg.sync_target()
        return critic_cost

    def add_experience(self, obs, act, reward, next_obs, terminal):
        self.rpm.append(obs, act, reward, next_obs, terminal)
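# A usage sketch (not part of the original file): how a list of MAAgent
# instances is typically built and driven for MADDPG. Each agent keeps its own
# ReplayMemory; add_experience() stores its local view of the joint transition,
# and learn(agents) samples the same indices from every agent's buffer to build
# the joint (obs_n, act_n) batch needed by the centralized critic. MAModel and
# the MADDPG constructor arguments below are assumptions, and the env is
# assumed to be a multi-agent particle-style env (env.n, per-agent spaces).
def build_agents(env, batch_size, gamma=0.95, tau=0.01, critic_lr=0.01,
                 actor_lr=0.01):
    obs_dim_n = [env.observation_space[i].shape[0] for i in range(env.n)]
    act_dim_n = [env.action_space[i].n for i in range(env.n)]
    agents = []
    for i in range(env.n):
        algorithm = parl.algorithms.MADDPG(
            MAModel(act_dim_n[i]),
            agent_index=i,
            act_space=env.action_space,
            gamma=gamma,
            tau=tau,
            critic_lr=critic_lr,
            actor_lr=actor_lr)
        agents.append(
            MAAgent(
                algorithm,
                agent_index=i,
                obs_dim_n=obs_dim_n,
                act_dim_n=act_dim_n,
                batch_size=batch_size))
    return agents

# Inside the environment loop, each step then looks roughly like:
#     for i, agent in enumerate(agents):
#         agent.add_experience(obs_n[i], action_n[i], reward_n[i],
#                              next_obs_n[i], done_n[i])
#     for agent in agents:
#         critic_cost = agent.learn(agents)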
class DDPG(RLBaseController):
    def __init__(self, range_tables, use_gpu=False, **kwargs):
        self.use_gpu = use_gpu
        self.range_tables = range_tables - np.asarray(1)
        self.act_dim = len(self.range_tables)
        self.obs_dim = kwargs.get('obs_dim')
        self.model = kwargs.get('model', default_ddpg_model)
        self.actor_lr = kwargs.get('actor_lr', 1e-4)
        self.critic_lr = kwargs.get('critic_lr', 1e-3)
        self.gamma = kwargs.get('gamma', 0.99)
        self.tau = kwargs.get('tau', 0.001)
        self.memory_size = kwargs.get('memory_size', 10)
        self.reward_scale = kwargs.get('reward_scale', 0.1)
        self.batch_size = kwargs.get('controller_batch_size', 1)
        self.actions_noise = kwargs.get('actions_noise', default_noise)
        self.action_dist = 0.0
        self.place = paddle.CUDAPlace(0) if self.use_gpu else paddle.CPUPlace()

        model = self.model(self.act_dim)
        if self.actions_noise:
            self.actions_noise = self.actions_noise()
        algorithm = parl.algorithms.DDPG(
            model,
            gamma=self.gamma,
            tau=self.tau,
            actor_lr=self.actor_lr,
            critic_lr=self.critic_lr)
        self.agent = DDPGAgent(algorithm, self.obs_dim, self.act_dim)
        self.rpm = ReplayMemory(self.memory_size, self.obs_dim, self.act_dim)

        self.pred_program = self.agent.pred_program
        self.learn_program = self.agent.learn_program
        self.param_dict = self.get_params(self.learn_program)

    def next_tokens(self, obs, params_dict, is_inference=False):
        batch_obs = np.expand_dims(obs, axis=0)
        self.set_params(self.pred_program, params_dict, self.place)
        actions = self.agent.predict(batch_obs.astype('float32'))
        # add exploration noise to the predicted actions (training only)
        if self.actions_noise and not is_inference:
            actions_noise = np.clip(
                np.random.normal(actions, scale=self.actions_noise.stdev_curr),
                -1.0, 1.0)
            self.action_dist = np.mean(np.abs(actions_noise - actions))
        else:
            actions_noise = actions
        actions_noise = action_mapping(actions_noise, self.range_tables)
        return actions_noise

    def _update_noise(self, actions_dist):
        self.actions_noise.update(actions_dist)

    def update(self, rewards, params_dict, obs, actions, obs_next, terminal):
        self.set_params(self.learn_program, params_dict, self.place)
        self.rpm.append(obs, actions, self.reward_scale * rewards, obs_next,
                        terminal)
        if self.actions_noise:
            self._update_noise(self.action_dist)
        # train once the replay memory is full
        if self.rpm.size() >= self.memory_size:
            obs, actions, rewards, obs_next, terminal = self.rpm.sample_batch(
                self.batch_size)
            self.agent.learn(obs, actions, rewards, obs_next, terminal)
        params_dict = self.get_params(self.learn_program)
        return params_dict
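# action_mapping is called in next_tokens above but not defined in this snippet.
# A plausible sketch, assuming actions live in [-1, 1] and range_tables holds
# the maximum token index per position (range_tables was already reduced by 1
# in __init__); the rounding scheme here is an assumption:
def action_mapping(actions, range_tables):
    actions = np.atleast_1d(np.squeeze(np.asarray(actions)))
    tokens = []
    for action, max_idx in zip(actions, range_tables):
        # rescale [-1, 1] -> [0, max_idx] and round to the nearest token index
        idx = int(np.clip(np.round((action + 1.0) / 2.0 * max_idx), 0, max_idx))
        tokens.append(idx)
    return tokens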
act_dim = env.action_space.shape[0] + 1

# Build the agent with the PARL framework
model = QuadrotorModel(act_dim)
algorithm = DDPG(
    model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
agent = QuadrotorAgent(algorithm, obs_dim, act_dim)

# # Load a trained model
# ckpt = 'model_dir/steps_680596_8423.2103163893.ckpt'
# agent.restore(ckpt)

# PARL also ships a ReplayMemory for DDPG; it can be imported directly from parl.utils
rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)

# Start training
# test_flag = 0
# total_steps = 0
# while total_steps < TRAIN_TOTAL_STEPS:
#     train_reward, steps = run_episode(env, agent, rpm)
#     total_steps += steps
#     # logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))  # print the training reward
#     if total_steps // TEST_EVERY_STEPS >= test_flag:  # evaluate the model every TEST_EVERY_STEPS steps
#         while total_steps // TEST_EVERY_STEPS >= test_flag:
#             test_flag += 1
#             evaluate_reward = evaluate(env, agent)
#             logger.info('Steps {}, Test reward: {}'.format(
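# For reference, the ReplayMemory used throughout these snippets is the one
# shipped with PARL (importable from parl.utils, as noted above). The interface
# the examples rely on, with argument names inferred from the calls in this file:
#     from parl.utils import ReplayMemory
#     rpm = ReplayMemory(max_size, obs_dim, act_dim)
#     rpm.append(obs, act, reward, next_obs, terminal)   # store one transition
#     rpm.size()                                         # transitions stored so far
#     obs, act, rew, next_obs, done = rpm.sample_batch(batch_size)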
class RLBenchmarkDispatcher(DispatcherBase):
    '''
    An RL benchmark for the elevator system
    '''

    def load_settings(self):
        self._obs_dim = obs_dim(self._mansion)
        self._act_dim = act_dim(self._mansion)
        self._ele_num = self._mansion._elevator_number
        self._max_floor = self._mansion._floor_number
        self._global_step = 0
        # a single replay memory shared by all elevators
        self._rpm = ReplayMemory(MEMORY_SIZE, self._obs_dim, 1)
        self._model = RLDispatcherModel(self._act_dim)
        hyperparas = {
            'action_dim': self._act_dim,
            'lr': 5.0e-4,
            'gamma': 0.998
        }
        # print("action dimension:", self._obs_dim, self._act_dim)
        self._algorithm = DQN(self._model, hyperparas)
        self._agent = ElevatorAgent(self._algorithm, self._obs_dim,
                                    self._act_dim)
        self._warm_up_size = 2000
        self._statistic_freq = 1000
        self._loss_queue = deque()

    def feedback(self, state, action, r):
        self._global_step += 1
        observation_array = mansion_state_preprocessing(state)
        new_actions = list()
        for ele_act in action:
            new_actions.append(action_to_action_idx(ele_act, self._act_dim))
        if self._global_step > self._warm_up_size:
            for i in range(self._ele_num):
                self._rpm.append(self._last_observation_array[i],
                                 self._last_action[i], self._last_reward,
                                 deepcopy(observation_array[i]), False)
        self._last_observation_array = deepcopy(observation_array)
        self._last_action = deepcopy(new_actions)
        self._last_reward = r

        if self._rpm.size() > self._warm_up_size:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = \
                self._rpm.sample_batch(BATCH_SIZE)
            cost = self._agent.learn(batch_obs, batch_action, batch_reward,
                                     batch_next_obs, batch_terminal)
            self._loss_queue.appendleft(cost)
            if len(self._loss_queue) > self._statistic_freq:
                self._loss_queue.pop()
            if self._global_step % self._statistic_freq == 0:
                self._mansion._config.log_notice(
                    "Temporal Difference Error(Average) %f",
                    sum(self._loss_queue) / float(len(self._loss_queue)))

    def policy(self, state):
        self._exploration_ratio = 500000.0 / (
            500000.0 + self._global_step) + 0.02
        observation_array = mansion_state_preprocessing(state)
        q_values = self._agent.predict(observation_array)
        ret_actions = list()
        for i in range(self._ele_num):
            if random.random() < self._exploration_ratio:
                action = random.randint(1, self._max_floor)
            else:
                action = np.argmax(q_values[i])
            ret_actions.append(
                action_idx_to_action(int(action), self._act_dim))
        return ret_actions
def main():
    logger.info("-----------------Carla_SAC-------------------")
    logger.set_dir('./{}_train'.format(args.env))

    # Parallel environments for training
    train_envs_params = EnvConfig['train_envs_params']
    env_num = EnvConfig['env_num']
    env_list = ParallelEnv(args.env, args.xparl_addr, train_envs_params)

    # env for eval
    eval_env_params = EnvConfig['eval_env_params']
    eval_env = LocalEnv(args.env, eval_env_params)

    obs_dim = eval_env.obs_dim
    action_dim = eval_env.action_dim

    # Initialize model, algorithm, agent, replay_memory
    if args.framework == 'torch':
        CarlaModel, SAC, CarlaAgent = TorchModel, TorchSAC, TorchAgent
    elif args.framework == 'paddle':
        CarlaModel, SAC, CarlaAgent = PaddleModel, PaddleSAC, PaddleAgent
    model = CarlaModel(obs_dim, action_dim)
    algorithm = SAC(
        model,
        gamma=GAMMA,
        tau=TAU,
        alpha=ALPHA,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = CarlaAgent(algorithm)
    rpm = ReplayMemory(
        max_size=MEMORY_SIZE, obs_dim=obs_dim, act_dim=action_dim)

    total_steps = 0
    last_save_steps = 0
    test_flag = 0

    obs_list = env_list.reset()

    while total_steps < args.train_total_steps:
        # Train episode
        if rpm.size() < WARMUP_STEPS:
            action_list = [
                np.random.uniform(-1, 1, size=action_dim)
                for _ in range(env_num)
            ]
        else:
            action_list = [agent.sample(obs) for obs in obs_list]
        next_obs_list, reward_list, done_list, info_list = env_list.step(
            action_list)

        # Store data in replay memory
        for i in range(env_num):
            rpm.append(obs_list[i], action_list[i], reward_list[i],
                       next_obs_list[i], done_list[i])

        obs_list = env_list.get_obs()
        total_steps = env_list.total_steps

        # Train agent after collecting sufficient data
        if rpm.size() >= WARMUP_STEPS:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = rpm.sample_batch(
                BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs,
                        batch_terminal)

        # Save agent
        if total_steps > int(1e5) and total_steps > last_save_steps + int(1e4):
            agent.save('./{}_model/step_{}_model.ckpt'.format(
                args.framework, total_steps))
            last_save_steps = total_steps

        # Evaluate episode
        if (total_steps + 1) // args.test_every_steps >= test_flag:
            while (total_steps + 1) // args.test_every_steps >= test_flag:
                test_flag += 1
            avg_reward = run_evaluate_episodes(agent, eval_env, EVAL_EPISODES)
            tensorboard.add_scalar('eval/episode_reward', avg_reward,
                                   total_steps)
            logger.info(
                'Total steps {}, Evaluation over {} episodes, Average reward: {}'
                .format(total_steps, EVAL_EPISODES, avg_reward))
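# ParallelEnv is assumed above; in the PARL Carla example it wraps remote xparl
# actors. A minimal synchronous sketch of just the interface this training loop
# relies on (reset / step / get_obs / total_steps, with per-env auto-reset),
# assuming LocalEnv follows the gym convention; the class name and internals
# here are illustrative only:
class SimpleParallelEnv(object):
    def __init__(self, env_name, env_params_list):
        self.env_list = [LocalEnv(env_name, p) for p in env_params_list]
        self.total_steps = 0
        self._obs_list = [None] * len(self.env_list)
        self._done_list = [False] * len(self.env_list)

    def reset(self):
        self._obs_list = [env.reset() for env in self.env_list]
        self._done_list = [False] * len(self.env_list)
        return list(self._obs_list)

    def step(self, action_list):
        results = [
            env.step(act) for env, act in zip(self.env_list, action_list)
        ]
        next_obs_list, reward_list, done_list, info_list = map(
            list, zip(*results))
        self.total_steps += len(self.env_list)
        self._obs_list, self._done_list = next_obs_list, done_list
        return next_obs_list, reward_list, done_list, info_list

    def get_obs(self):
        # auto-reset any env whose episode finished; otherwise keep the last obs
        return [
            env.reset() if done else obs
            for env, obs, done in zip(self.env_list, self._obs_list,
                                      self._done_list)
        ]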
obs_dim = 7
goal_dim = 3
act_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

model = RLBenchModel(act_dim, max_action)
algorithm = parl.algorithms.TD3(
    model,
    max_action=max_action,
    gamma=GAMMA,
    tau=TAU,
    actor_lr=ACTOR_LR,
    critic_lr=CRITIC_LR)
agent = RLBenchAgent(algorithm, obs_dim + goal_dim, act_dim)

rpm = ReplayMemory(MEMORY_SIZE, obs_dim + goal_dim, act_dim)

test_flag = 0
store_flag = 0
total_episodes = 16000
while total_episodes < args.train_total_episodes:
    train_reward = run_train_episode(env, agent, rpm)
    total_episodes += 1
    logger.logging_string('Episodes: {} Reward: {}'.format(
        total_episodes, train_reward))
    # tensorboard.add_scalar('train/episode_reward', train_reward, total_episodes)

    if total_episodes // args.test_every_episodes >= test_flag:
        while total_episodes // args.test_every_episodes >= test_flag:
            test_flag += 1
        evaluate_reward = run_evaluate_episode(env, agent, render=False)
def main(train=True):
    if train:
        # Load the data
        df = pd.read_csv('wudigushi/DATA/AAPL.csv')
        df = df.sort_values('Date')

        # Create the environment
        env = StockTradingEnv(df)
        env.reset()

        act_dim = env.action_space.shape[0]
        obs_dim = env.observation_space.shape[0]
        print(act_dim)
        print(obs_dim)

        model = WudigupiaoModel(act_dim)
        algorithm = DDPG(
            model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR,
            critic_lr=CRITIC_LR)
        agent = MyStockAgent(algorithm, obs_dim, act_dim)

        rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)

        test_flag = 0
        total_steps = 0
        while total_steps < TRAIN_TOTAL_STEPS:
            train_reward, steps = run_episode(env, agent, rpm)
            total_steps += steps
            # logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))

            if total_steps // TEST_EVERY_STEPS >= test_flag:
                while total_steps // TEST_EVERY_STEPS >= test_flag:
                    test_flag += 1
                evaluate_reward = evaluate(env, agent)
                logger.info('Steps {}, Test reward: {}'.format(
                    total_steps, evaluate_reward))

                ckpt = 'wudigushi/ckpt/steps_{}.ckpt'.format(total_steps)
                agent.save(ckpt)
    else:
        # set ckpt to the best-performing checkpoint saved during training
        ckpt = 'wudigushi/ckpt/steps_980117.ckpt'

        df = pd.read_csv('wudigushi/DATA/AAPL.csv')
        df = df.sort_values('Date')

        # Create the environment
        env = StockTradingEnv(df)
        env.reset()

        act_dim = env.action_space.shape[0]
        obs_dim = env.observation_space.shape[0]
        # obs_dim = 36
        print(act_dim)
        print(obs_dim)

        model = WudigupiaoModel(act_dim)
        algorithm = DDPG(
            model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR,
            critic_lr=CRITIC_LR)
        agent = MyStockAgent(algorithm, obs_dim, act_dim)
        agent.restore(ckpt)

        evaluate_reward = evaluate(env, agent)
        logger.info('Evaluate reward: {}'.format(evaluate_reward))  # print the evaluation reward
        env_reward.append(d_r)
    total_reward.append(np.mean(total_reward))
    env_reward.append(np.mean(env_reward))
    return total_reward, env_reward


env = make_env("Quadrotor", task="velocity_control")
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
logger.info('obs_dim:{} , act_dim:{}'.format(obs_dim, act_dim))

model = QModel(act_dim)
alg = DDPG(model, gamma=Gamma, tau=Tau, actor_lr=A_lr, critic_lr=C_lr)
agent = QAgent(alg, obs_dim, act_dim)
rpm = ReplayMemory(int(Max_Size), obs_dim, act_dim)

# if os.path.exists('./M2_-1094_Over.ckpt'):
#     agent.restore('./M2_-1094_Over.ckpt')
# else:
#     exit(1)

test_flag = 0
total_step = 0
reward_max = -1094
while True:
    step, reward = run_episode(env, agent, rpm)
    total_step += step
    logger.info('Step:{} , Train Reward:{}'.format(total_step, reward))

    if total_step // Test_Round == test_flag: