def evaluate(env, agent, render=False):
    eval_reward = []
    for i in range(5):
        env.reset_game()
        obs = get_obs(env)
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            pred_action = agent.predict(batch_obs.astype('float32'))
            # sample an action from the predicted policy distribution
            pred_action = np.squeeze(pred_action)
            action_set = env.getActionSet()
            action_index = np.random.choice(range(3), p=pred_action)
            action = action_set[action_index]
            reward = env.act(action)
            next_obs = get_obs(env)
            done = env.game_over()
            obs = next_obs
            total_reward += reward
            steps += 1
            if render:
                env.getScreenRGB()
            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
def run_episode(env, agent, rpm):
    total_reward = 0
    env.reset_game()
    obs = get_obs(env)
    step = 0
    while True:
        step += 1
        action_index = agent.sample(obs)  # sample an action so every action has a chance to be explored
        action = env.getActionSet()[action_index]
        # act in the environment
        reward = env.act(action)
        next_obs = get_obs(env)
        done = env.game_over()
        rpm.append((obs, action_index, reward, next_obs, done))

        # train model
        if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0):
            (batch_obs, batch_action, batch_reward, batch_next_obs,
             batch_done) = rpm.sample(BATCH_SIZE)
            train_loss = agent.learn(batch_obs, batch_action, batch_reward,
                                     batch_next_obs, batch_done)  # s, a, r, s', done

        total_reward += reward
        obs = next_obs
        if done:
            break
    return total_reward
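# The hyperparameters referenced by the episode runners (LEARN_FREQ, MEMORY_SIZE,
# MEMORY_WARMUP_SIZE, BATCH_SIZE, LEARNING_RATE, GAMMA, REWARD_SCALE) are defined
# elsewhere in the project. The values below are only an illustrative sketch of
# typical PARL-style DQN/DDPG settings, not the original configuration.
LEARN_FREQ = 5             # run a learning step every 5 environment steps
MEMORY_SIZE = 200000       # replay memory capacity
MEMORY_WARMUP_SIZE = 200   # transitions collected before learning starts
BATCH_SIZE = 32            # mini-batch size sampled from the replay memory
LEARNING_RATE = 0.001      # optimizer learning rate
GAMMA = 0.99               # discount factor
REWARD_SCALE = 0.1         # reward scaling used by the DDPG-style episode runner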
def run_episode(env, agent, rpm):
    total_reward = 0
    env.reset_game()
    obs = get_obs(env)
    steps = 0
    while True:
        steps += 1
        batch_obs = np.expand_dims(obs, axis=0)
        pred_action = agent.predict(batch_obs.astype('float32'))
        # sample an action from the predicted policy distribution
        pred_action = np.squeeze(pred_action)
        action_set = env.getActionSet()
        action_index = np.random.choice(range(3), p=pred_action)
        action = action_set[action_index]
        reward = env.act(action)
        next_obs = get_obs(env)
        done = env.game_over()
        rpm.append(obs, action_index, REWARD_SCALE * reward, next_obs, done)

        if rpm.size() > MEMORY_WARMUP_SIZE:
            batch_obs, batch_action, batch_reward, batch_next_obs, \
                batch_terminal = rpm.sample_batch(BATCH_SIZE)
            critic_cost = agent.learn(batch_obs, batch_action, batch_reward,
                                      batch_next_obs, batch_terminal)

        obs = next_obs
        total_reward += reward
        if done:
            break
    return total_reward
def step(state_dic):
    """
    Parameters
    ----------
    state_dic : dict
        Current state of the agent coming from the environment.

    Returns
    -------
    action : int
        Index of the chosen angle action.
    """
    global net, a, action
    # re-evaluate the policy network only every 10 calls; otherwise repeat the last action
    if a == 10:
        state, done = get_obs(state_dic)
        state = sum(state, [])  # flatten the nested observation list
        state_v = torch.tensor(np.array([state], copy=False))
        q_vals = net(state_v).data.numpy()[0]
        action = np.argmax(q_vals)  # greedy action with the highest Q-value
        a = 0
    a += 1
    return action
def __init__(self, actor_id):
    self.env = suite.load(domain_name="walker", task_name="run")
    self.action_size = self.env.action_spec().shape[0]
    self.obs_size = get_obs(self.env.reset().observation).shape[1]
    self.actor_id = actor_id
    self.burn_in_length = 20  # 40-80
    self.learning_length = 40
    self.sequence_length = self.burn_in_length + self.learning_length
    self.n_step = 5
    self.sequence = []
    self.recurrent_state = []
    self.priority = []
    self.td_loss = deque(maxlen=self.learning_length)
    self.memory_sequence_size = 1000
    self.memory = ReplayMemory(
        memory_sequence_size=self.memory_sequence_size)
    self.memory_save_interval = 3
    self.gamma = 0.997
    self.actor_parameter_update_interval = 500
    self.model_path = './model_data/'
    self.actor = ActorNet(
        self.obs_size, self.action_size,
        cuda_id=self.actor_id % 2 + 1).cuda(self.actor_id % 2 + 1).eval()
    self.target_actor = deepcopy(self.actor)
    self.critic = CriticNet(
        self.obs_size, self.action_size,
        cuda_id=self.actor_id % 2 + 1).cuda(self.actor_id % 2 + 1).eval()
    self.target_critic = deepcopy(self.critic)
    self.load_model()
    self.epsilon = 1
    self.last_obs = None
def evaluate(env, agent, render=False):
    eval_reward = []
    for i in range(5):
        env.reset_game()
        obs = get_obs(env)
        episode_reward = 0
        while True:
            action_index = agent.predict(obs)  # choose the greedy (best) action
            action = env.getActionSet()[action_index]
            reward = env.act(action)
            obs = get_obs(env)
            episode_reward += reward
            if render:
                env.getScreenRGB()
            if env.game_over():
                break
        eval_reward.append(episode_reward)
    return np.mean(eval_reward)
def run_episode(env, agent):
    obs_list, action_list, reward_list = [], [], []
    env.reset_game()
    obs = get_obs(env)
    while True:
        obs_list.append(obs)
        action_index = agent.sample(obs)  # sample an action so every action has a chance to be explored
        action = env.getActionSet()[action_index]
        action_list.append(action_index)
        # act in the environment
        reward = env.act(action)
        next_obs = get_obs(env)
        done = env.game_over()
        obs = next_obs
        reward_list.append(reward)
        if done:
            break
    return obs_list, action_list, reward_list
def main():
    # create the environment
    game = Pong(width=200, height=200, MAX_SCORE=11)
    p = PLE(game, fps=30, display_screen=False, force_fps=False)
    p.reset_game()

    # build the agent with the PARL framework
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    obs = get_obs(p)
    obs_dim = 200 * 200

    rpm = ReplayMemory(MEMORY_SIZE)  # DQN experience replay memory

    model = Model(act_dim=act_dim)
    alg = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        alg,
        obs_dim=obs_dim,
        act_dim=act_dim,
        e_greed_decrement=1e-6,
        e_greed=0.1)  # e_greed: probability of taking a random action, for exploration

    # # load a saved model
    # if os.path.exists('./water_world_dqn.ckpt'):
    #     agent.restore('./water_world_dqn.ckpt')

    # warm up the replay memory so the first training batches are diverse enough
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(p, agent, rpm)

    max_episode = 200000

    # start training
    episode = 0
    best_reward = -float('inf')
    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
        # train part
        for i in range(0, 50):
            total_reward = run_episode(p, agent, rpm)
            episode += 1

        # test part
        eval_reward = evaluate(p, agent, render=False)  # render=True to watch the game
        if eval_reward > best_reward:
            best_reward = eval_reward
            agent.save('model_dir/dqn_pong_{}_reward_{}.ckpt'.format(
                episode, best_reward))
        logger.info('episode:{} e_greed:{} test_reward:{}'.format(
            episode, agent.e_greed, eval_reward))
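# The PLE-based snippets above rely on a get_obs(env) helper that is not shown here.
# The sketch below is a minimal, hedged guess at what it does for the 200x200 Pong
# screen (grayscale, flattened to obs_dim = 200*200 as used in main); the project's
# actual helper may preprocess frames differently.
def get_obs(env):
    # grab the current screen as a grayscale array and flatten it into a 1-D observation
    screen = env.getScreenGrayscale()  # numpy array of shape (200, 200)
    return screen.astype('float32').flatten() / 255.0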
def __init__(self, n_actors):
    self.env = suite.load(domain_name="walker", task_name="run")
    self.n_actions = self.env.action_spec().shape[0]
    self.obs_size = get_obs(self.env.reset().observation).shape[1]
    self.n_actors = n_actors
    self.burn_in_length = 20  # 40-80
    self.learning_length = 40
    self.sequence_length = self.burn_in_length + self.learning_length
    self.n_step = 5
    self.memory_sequence_size = 5000000
    self.batch_size = 32
    self.memory = LearnerReplayMemory(
        memory_sequence_size=self.memory_sequence_size,
        batch_size=self.batch_size)
    self.model_path = './model_data/'
    self.memory_path = './memory_data/'
    self.actor = ActorNet(self.obs_size, self.n_actions, 0).cuda()
    self.target_actor = deepcopy(self.actor).eval()
    self.critic = CriticNet(self.obs_size, self.n_actions, 0).cuda()
    self.target_critic = deepcopy(self.critic).eval()
    self.model_save_interval = 50  # 50
    self.memory_update_interval = 50  # 50
    self.target_update_inverval = 500  # 100
    self.gamma = 0.997
    self.actor_lr = 1e-4
    self.critic_lr = 1e-3
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
    self.actor_criterion = nn.MSELoss()
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.critic_lr)
    self.critic_criterion = nn.MSELoss()
    self.save_model()
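# The walker (dm_control) snippets assume a get_obs(observation) helper that turns the
# observation OrderedDict into a (1, obs_size) float32 array (note the .shape[1] and
# obs[0] usages). This is only a hedged sketch of that assumed helper, not the
# project's actual implementation.
def get_obs(observation):
    # concatenate every entry of the dm_control observation dict into one flat vector
    parts = [np.asarray(v, dtype=np.float32).ravel() for v in observation.values()]
    return np.concatenate(parts)[None, :]  # add a leading batch dimension -> (1, obs_size)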
def get_obs_states(env_id, system='cartpole_obs'):
    if system == 'cartpole_obs':

        def line_line_cc(x1, y1, x2, y2, x3, y3, x4, y4):
            # segment-segment intersection test
            uA = ((x4 - x3) * (y1 - y3) - (y4 - y3) * (x1 - x3)) / \
                 ((y4 - y3) * (x2 - x1) - (x4 - x3) * (y2 - y1))
            uB = ((x2 - x1) * (y1 - y3) - (y2 - y1) * (x1 - x3)) / \
                 ((y4 - y3) * (x2 - x1) - (x4 - x3) * (y2 - y1))
            if uA >= 0. and uA <= 1. and uB >= 0. and uB <= 1.:
                # intersection
                return True
            # collision free
            return False

        def IsInCollision(x, obc, obc_width=4.):
            I = 10
            L = 2.5
            M = 10
            m = 5
            g = 9.8
            H = 0.5

            STATE_X = 0
            STATE_V = 1
            STATE_THETA = 2
            STATE_W = 3
            CONTROL_A = 0

            MIN_X = -30
            MAX_X = 30
            MIN_V = -40
            MAX_V = 40
            MIN_W = -2
            MAX_W = 2

            if x[0] < MIN_X or x[0] > MAX_X:
                return True
            H = 0.5
            # pole endpoints in the world frame
            pole_x1 = x[0]
            pole_y1 = H
            pole_x2 = x[0] + L * np.sin(x[2])
            pole_y2 = H + L * np.cos(x[2])
            width = 4
            for i in range(len(obc)):
                for j in range(0, 8, 2):
                    x1 = obc[i][j]
                    y1 = obc[i][j + 1]
                    x2 = obc[i][(j + 2) % 8]
                    y2 = obc[i][(j + 3) % 8]
                    if line_line_cc(pole_x1, pole_y1, pole_x2, pole_y2,
                                    x1, y1, x2, y2):
                        return True
            return False

        _obs_list = get_obs(system, env_id)[env_id]  # .reshape(-1, 2)
        obs_list = []
        width = 4
        for i in range(len(_obs_list)):
            x = _obs_list[i][0]
            y = _obs_list[i][1]
            # expand each obstacle centre into the four corners of a square of side `width`
            obs = np.zeros(8)
            obs[0] = x - width / 2
            obs[1] = y + width / 2
            obs[2] = x + width / 2
            obs[3] = y + width / 2
            obs[4] = x + width / 2
            obs[5] = y - width / 2
            obs[6] = x - width / 2
            obs[7] = y - width / 2
            obs_list.append(obs)
        obs_i = np.array(obs_list)

        # sweep a coarse grid over cart position and pole angle, keeping the states in collision
        dx = 5
        dtheta = 0.5
        # feasible_points = []
        infeasible_points = []
        imin = 0
        imax = int(2 * 30. / dx)
        jmin = 0
        jmax = int(2 * np.pi / dtheta)
        for i in range(imin, imax):
            for j in range(jmin, jmax):
                x = np.array([dx * i - 30, 0., dtheta * j - np.pi, 0.])
                if IsInCollision(x, obs_i):
                    infeasible_points.append(x)
                # else:
                #     feasible_points.append(x)
        # feasible_points = np.array(feasible_points)
        infeasible_points = np.array(infeasible_points)
        # print('feasible points')
        # print(feasible_points)
        # print('infeasible points')
        # print(infeasible_points)
    elif system == 'acrobot_obs':
        return None
    else:
        raise NotImplementedError("unknown dynamics")
    return infeasible_points
def run(self):
    episode = 0
    step = 0
    reward_sum = 0
    while True:
        time_step = self.env.reset()
        obs = get_obs(time_step.observation)
        self.actor.reset_state()
        self.critic.reset_state()
        self.target_actor.reset_state()
        self.target_critic.reset_state()
        self.sequence = []
        self.recurrent_state = []
        self.priority = []
        self.td_loss.clear()
        last_obs = None
        episode_step = 0
        done = False
        if self.actor_id == 0 and episode != 0:
            print('episode:', episode, 'step:', step, 'reward:', reward_sum)
        episode += 1
        reward_sum = 0
        while not time_step.last():
            # get recurrent state
            actor_hx, actor_cx = self.actor.get_state()
            target_actor_hx, target_actor_cx = self.target_actor.get_state()
            critic_hx, critic_cx = self.critic.get_state()
            target_critic_hx, target_critic_cx = self.target_critic.get_state()

            action = self.actor(
                torch.from_numpy(obs).cuda(self.actor_id % 2 + 1))
            target_action = self.target_actor(
                torch.from_numpy(obs).cuda(self.actor_id % 2 + 1))
            _ = self.critic(
                torch.from_numpy(obs).cuda(self.actor_id % 2 + 1), action)
            _ = self.target_critic(
                torch.from_numpy(obs).cuda(self.actor_id % 2 + 1), target_action)

            action = action.detach().cpu().numpy()[0]
            action += np.random.normal(0, 0.3, (self.action_size))
            action = np.clip(action, -1, 1)

            reward = 0.
            sleep(0.01)
            for i in range(4):
                time_step = self.env.step(action)
                next_obs = get_obs(time_step.observation)
                reward += time_step.reward
                if time_step.last():
                    break
            reward_sum += reward
            step += 1
            episode_step += 1

            terminal = 1. if time_step.last() else 0.
            self.sequence.append((obs[0], action, [reward], [terminal]))
            obs = next_obs.copy()
            self.recurrent_state.append(
                [[actor_hx[0], actor_cx[0]],
                 [target_actor_hx[0], target_actor_cx[0]],
                 [critic_hx[0], critic_cx[0]],
                 [target_critic_hx[0], target_critic_cx[0]]])

            if step % self.actor_parameter_update_interval == 0:
                self.load_model()

        if len(self.sequence) >= self.sequence_length:
            self.sequence.extend([(np.zeros((self.obs_size), dtype=np.float32),
                                   np.zeros((self.action_size), dtype=np.float32),
                                   [0.], [1.])
                                  for i in range(self.n_step)])
            self.calc_nstep_reward()
            self.calc_priorities()
            self.memory.add(self.sequence, self.recurrent_state, self.priority)
        if len(self.memory.memory) > self.memory_save_interval:
            self.memory.save(self.actor_id)