class Agent:
    def __init__(self, env):
        self.num_actions = env.action_space.n
        target_dqn = DQNParamNoise(env)
        self.algo = DQNParamNoise(env, target_dqn=target_dqn)
        self.num_steps = 0
        self.episode_steps = 0
        self.env = env
        self.memory = Memory(maxlen=10000)
        self.rewards = deque(maxlen=100)
        self.epsilon = 0.005
        self.total_reward = 0

    def reset(self, learn=True):
        self.algo.episode_start()
        self.total_reward = 0

    def store_in_memory(self, state, action, reward, new_state, done):
        self.memory.append((state, action, reward, new_state, done))

    def learn_from_memory(self):
        self.algo.train(self.memory.get_batch(batch_size=32))

    def step(self, state, learn=True):
        action_probs = self.algo.eval(np.expand_dims(state, 0)).double()[0]
        # print(action_probs)
        action_probs = action_probs / action_probs.sum()
        if random.random() < self.epsilon:
            action = random.randint(0, self.num_actions - 1)
        else:
            action = action_probs.max(0)[1]
        new_state, reward, done, info = self.env.step(action)
        self.store_in_memory(state, action, reward, new_state, done)
        if self.num_steps >= 100 and learn:
            self.learn_from_memory()
        if done:
            print(
                f"[{self.num_steps}] reward {self.env.mean_reward}, steps {self.episode_steps}, speed {self.env.speed} f/s, epsilon {self.epsilon}"
            )
            self.episode_steps = 0
            self.algo.save()
        self.num_steps += 1
        self.episode_steps += 1
        self.total_reward += reward
        self.epsilon -= 1e-04
        self.epsilon = max(0.005, self.epsilon)
        return new_state, reward, done, info
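# The step() method above anneals epsilon linearly by 1e-4 per step down to a
# floor of 0.005 (here it already starts at the floor, presumably because
# exploration mainly comes from the parameter-noise DQN). A minimal standalone
# sketch of such a linear schedule; the helper name and default arguments are
# illustrative, not part of the class above.
def linear_epsilon(step, start=1.0, decay=1e-4, floor=0.005):
    """Exploration rate after `step` environment steps under linear decay."""
    return max(floor, start - decay * step)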
def train(env, model, base_path, batch_size=64, epsilon=0.01, update_every=4,
          update_target_every=1000, learning_starts=200, memory_size=500000,
          num_iterations=6250000):
    if not os.path.exists(base_path):
        os.makedirs(base_path)
    model_path = os.path.join(base_path, "model")
    begin_i = model.load(model_path)

    memory_buffer = Memory(memory_size)
    results_buffer = ResultsBuffer(base_path)

    # Collect `learning_starts` epsilon-greedy transitions to pre-fill the
    # replay buffer before any gradient update.
    state = env.reset()
    for i in range(learning_starts):
        action = np.random.randint(env.action_n) \
            if np.random.uniform() < epsilon else model.get_action(state)
        next_state, reward, done, info = env.step(action)
        memory_buffer.append((state, action, reward, next_state, done))
        state = next_state

    state = env.reset()
    start = time.time()
    for i in range(begin_i + 1, num_iterations):
        action = np.random.randint(env.action_n) \
            if np.random.uniform() < epsilon else model.get_action(state)
        next_state, reward, done, info = env.step(action)
        results_buffer.update_info(info)
        memory_buffer.append((state, action, reward, next_state, done))
        state = next_state

        if i > 0 and i % update_every == 0:
            summaries = model.update(*memory_buffer.sample(batch_size))
            results_buffer.update_info(summaries)

        if i > 0 and i % (update_every * update_target_every) == 0:
            model.update_target()
            model.save(model_path, i)
            t = time.time() - start
            print("Save model, global step:{}, delta_time:{}.".format(i, t))
            start = time.time()
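# Both the Agent above and this training loop rely on a replay memory that
# supports append() and batch sampling. A minimal sketch of such a buffer,
# assuming uniform sampling; this is not the project's Memory class.
import random
from collections import deque

class SimpleReplayBuffer:
    def __init__(self, maxlen):
        self._buffer = deque(maxlen=maxlen)

    def append(self, transition):
        # transition = (state, action, reward, next_state, done)
        self._buffer.append(transition)

    def sample(self, batch_size):
        # copy to a list so random.sample sees a plain sequence
        return random.sample(list(self._buffer), batch_size)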
class RandomAgent(Learner):
    def __init__(self, observation_space, action_space, memory_len=1000):
        self.action_space = action_space
        self.memory = Memory(memory_len, observation_space.shape)

    def handle_transition(self, s, a, r, sp, done):
        s = self._convert_to_torch(s)
        sp = self._convert_to_torch(sp)
        self.memory.append((s, a, r, sp, done))

    def exploration_strategy(self, s):
        return self.action_space.sample()

    def deterministic_strategy(self, s):
        return self.action_space.sample()
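# A random policy like this is commonly used as a warm-up actor to seed the
# replay memory before learning starts (the train() loop above has a similar
# learning_starts phase). A minimal usage sketch; `env` follows the classic
# Gym API and the loop structure is illustrative, not taken from this repo.
def warm_up(env, agent, n_steps=200):
    s = env.reset()
    for _ in range(n_steps):
        a = agent.exploration_strategy(s)
        sp, r, done, _ = env.step(a)
        agent.handle_transition(s, a, r, sp, done)
        s = env.reset() if done else sp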
class QAgent(): def __init__(self, env=None, config: str = None, seed: int = None, model: tf.keras.Model = None): assert env is not None, "A GYM environment must be provided" assert config is not None, "A config filename must be provided" assert model is not None, "A keras model must be provided" self.env = env self.config_file = config self.config = Config(self.config_file) self.model_name = f'train_{int(time.time())}' self.log_dir = "" self.model = model self.target_model = None self.tensorboard = None self.rng = np.random.default_rng(seed) self.memory = None self.last_step = 0 self.current_episode = 0 def compile(self, optimizer=None, loss=Huber()): # lr_schedule = ExponentialDecay( # initial_learning_rate=self.config.learning_rate, # decay_steps=self.config.lr_decay_steps, # decay_rate=self.config.lr_decay, # lr_min=self.config.lr_min) # optimizer = Adam(learning_rate=lr_schedule) if optimizer is None: optimizer = Adam(learning_rate=self.config.learning_rate) self.target_model = clone_model(self.model) self.target_model.compile(optimizer='sgd', loss='mse') self.model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy']) def adjust_lr(self, lr=None): assert lr is not None K.set_value(self.model.optimizer.learning_rate, lr) def load_model(self, filepath): self.model.load_weights(filepath) self.target_model.set_weights(self.model.get_weights()) def save(self, filepath=""): self.save_model(filepath) self.save_checkpoint(filepath) def save_model(self, filepath=""): filepath = os.path.join(filepath, "models") if not os.path.exists(filepath): os.makedirs(filepath) self.model.save_weights(os.path.join(filepath, self.model_name + ".h5"), overwrite=True) def save_checkpoint(self, filepath=""): filepath = os.path.join(filepath, "models") if not os.path.exists(filepath): os.makedirs(filepath) # save memory, current step data = { "memory": self.memory.json(), "last_step": self.last_step, "current_episode": self.current_episode, "model_name": self.model_name } with open(os.path.join(filepath, self.model_name + "_checkpoint.json"), "w") as jsonfile: json.dump(data, jsonfile) def load_checkpoint(self, filename): with open(filename, "r") as json_file: data = json.load(json_file) self.last_step = data['last_step'] self.current_episode = data['current_episode'] self.model_name = data['model_name'] if self.memory is None: self.config = Config(self.config_file) self.memory = Memory(max_len=self.config.max_queue_length) self.memory.load(data['memory']) def _encode_state(self, state): return state def _train_model(self, step): if self.memory.length < self.config.batch_size: return mini_batch = self.memory.sample(self.config.batch_size) current_states = self._encode_state(mini_batch.states) next_states = self._encode_state(mini_batch.new_states) # current Q values for each action q_values = self.model.predict_on_batch(current_states) # identify the best action to take and get the corresponding target Q value target_q_values = self.target_model.predict_on_batch(next_states) q_batch = np.max(target_q_values, axis=1).flatten() indices = (np.arange(self.config.batch_size), mini_batch.actions) q_values[indices] = mini_batch.rewards + ( 1 - mini_batch.done) * self.config.discount_factor * q_batch # As the model will predict `q_values`, only the Q value for the proper action (given by indices) # differ and count for the loss computation. 
self.tensorboard.on_step_begin() metrics = self.model.train_on_batch(current_states.astype(np.float32), q_values.astype(np.float32), return_dict=True) self.tensorboard.on_step_end(step=step, logs=metrics) def _get_epsilon(self, episode): epsilon = self.config.min_epsilon + \ (self.config.max_epsilon - self.config.min_epsilon) * np.exp(-self.config.decay_epsilon * episode) return epsilon def _remember(self, state, action, reward, new_state, done): self.memory.append(state, action, reward, new_state, done) def _get_action_for_state(self, state): state_decoded = self._encode_state(state) predicted = self.model.predict_on_batch(np.array([state_decoded])) action = np.argmax(predicted[0]) return action def _choose_action(self, state, epsilon): if self.rng.uniform() < epsilon: # Explore action = self.env.action_space.sample() else: # Exploit action = self._get_action_for_state(state) return action def fit(self): try: self.config = Config(self.config_file) if self.tensorboard is None: self.log_dir = os.path.join(self.config.log_dir, self.model_name) self.tensorboard = LogTensorBoard(log_dir=self.log_dir) self.tensorboard.set_model(self.model) if self.memory is None: self.memory = Memory(max_len=self.config.max_queue_length) state = self.env.reset() done = False epsilon = self._get_epsilon(self.current_episode) steps_in_episode = 0 reward_queue = deque(maxlen=10) reward_in_episode = 0 pbar = trange(self.last_step, self.config.train_steps, initial=self.last_step, total=self.config.train_steps) for step in pbar: steps_in_episode += 1 self.last_step = step # Greedy exploration strategy action = self._choose_action(state, epsilon) new_state, reward, done, info = self.env.step(action) self._remember(state, action, reward, new_state, done) reward_in_episode += reward if steps_in_episode == self.config.max_steps_per_episode: done = True # Train with the Bellman equation if step > self.config.warmup_steps: self._train_model(step) state = new_state if done: steps_in_episode = 0 state = self.env.reset() done = False self.current_episode += 1 reward_queue.append(reward_in_episode) reward_in_episode = 0 epsilon = self._get_epsilon(self.current_episode) pbar.set_postfix({"reward": np.mean(reward_queue)}) if step % self.config.target_model_update == 0: self.target_model.set_weights(self.model.get_weights()) self.last_step += 1 except KeyboardInterrupt: print("Training has been interrupted") def play(self, verbose: bool = False, sleep: float = 0.2, max_steps: int = 100): # Play an episode try: actions_str = [ "South", "North", "East", "West", "Pickup", "Dropoff" ] iteration = 0 state = self.env.reset( ) # reset environment to a new, random state self.env.render() if verbose: print(f"Iter: {iteration} - Action: *** - Reward ***") time.sleep(sleep) done = False while not done: action = self._get_action_for_state(state) iteration += 1 state, reward, done, info = self.env.step(action) clear_output(wait=True) self.env.render() if verbose: print( f"Iter: {iteration} - Action: {action}({actions_str[action]}) - Reward {reward}" ) time.sleep(sleep) if iteration == max_steps: print("cannot converge :(") break except KeyboardInterrupt: pass def evaluate(self, max_steps: int = 100): try: total_steps, total_penalties = 0, 0 episodes = 100 for episode in trange(episodes): state = self.env.reset( ) # reset environment to a new, random state nb_steps, penalties, reward = 0, 0, 0 done = False while not done: action = self._get_action_for_state(state) state, reward, done, info = self.env.step(action) if reward == -10: penalties += 
1 nb_steps += 1 if nb_steps == max_steps: done = True total_penalties += penalties total_steps += nb_steps print(f"Results after {episodes} episodes:") print(f"Average timesteps per episode: {total_steps / episodes}") print( f"Average penalties per episode: {total_penalties / episodes}") except KeyboardInterrupt: pass
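# The _train_model method above builds its regression targets with the one-step
# Bellman backup q(s, a) <- r + (1 - done) * gamma * max_a' Q_target(s', a').
# A minimal NumPy sketch of that target construction; array shapes and the
# helper name are assumptions, not part of the QAgent class.
import numpy as np

def bellman_targets(q_values, target_q_next, actions, rewards, dones, gamma=0.99):
    """Overwrite the taken-action entries of q_values with one-step TD targets."""
    q_values = q_values.copy()
    max_next = target_q_next.max(axis=1)      # max_a' Q_target(s', a')
    rows = np.arange(len(actions))
    q_values[rows, actions] = rewards + (1.0 - dones) * gamma * max_next
    return q_values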
class DDPG(object): def __init__(self): agent_args = Singleton_arger()['agent'] self.actor_lr = agent_args['actor_lr'] self.critic_lr = agent_args['critic_lr'] self.lr_decay = agent_args['lr_decay'] self.l2_critic = agent_args['l2_critic'] self.batch_size = agent_args['batch_size'] self.discount = agent_args['discount'] self.tau = agent_args['tau'] self.with_cuda = agent_args['with_cuda'] self.buffer_size = int(agent_args['buffer_size']) def setup(self, nb_pos, nb_laser, nb_actions): self.lr_coef = 1 model_args = Singleton_arger()['model'] actor = Actor(nb_pos, nb_laser, nb_actions, hidden1=model_args['hidden1'], hidden2=model_args['hidden2'], layer_norm=model_args['layer_norm']) critic = Critic(nb_pos, nb_laser, nb_actions, hidden1=model_args['hidden1'], hidden2=model_args['hidden2'], layer_norm=model_args['layer_norm']) self.nb_pos = nb_pos self.nb_laser = nb_laser self.actor = copy.deepcopy(actor) self.actor_target = copy.deepcopy(actor) self.critic = copy.deepcopy(critic) self.critic_target = copy.deepcopy(critic) self.memory = Memory(self.buffer_size, (nb_actions, ), (nb_pos + nb_laser, ), self.with_cuda) if self.with_cuda: for net in (self.actor, self.actor_target, self.critic, self.critic_target): if net is not None: net.cuda() p_groups = [{ 'params': [ param, ], 'weight_decay': self.l2_critic if ('weight' in name) and ('LN' not in name) else 0 } for name, param in self.critic.named_parameters()] self.critic_optim = Adam(params=p_groups, lr=self.critic_lr, weight_decay=self.l2_critic) self.actor_optim = Adam(self.actor.parameters(), lr=self.actor_lr) def reset_noise(self): pass def before_epoch(self): pass def before_cycle(self): pass def store_transition(self, s_t, a_t, r_t, s_t1, done_t): s_t = torch.tensor(s_t, dtype=torch.float32, requires_grad=False) if self.with_cuda: s_t = s_t.cuda() self.memory.append(s_t, a_t, r_t, s_t1, done_t) def update_critic(self, batch=None, pass_batch=False): # Sample batch if batch is None: batch = self.memory.sample(self.batch_size) assert batch is not None tensor_obs0 = batch['obs0'].split([self.nb_pos, self.nb_laser], dim=1) tensor_obs1 = batch['obs1'].split([self.nb_pos, self.nb_laser], dim=1) # Prepare for the target q batch with torch.no_grad(): next_q_values = self.critic_target([ tensor_obs1[0], tensor_obs1[1], self.actor_target(tensor_obs1), ]) target_q_batch = batch['rewards'] + self.discount * ( 1 - batch['terminals1']) * next_q_values # Critic update self.critic.zero_grad() q_batch = self.critic( [tensor_obs0[0], tensor_obs0[1], batch['actions']]) value_loss = nn.functional.mse_loss(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() if pass_batch: return value_loss.item(), batch else: return value_loss.item() def update_actor(self, batch=None, pass_batch=False): if batch is None: batch = self.memory.sample(self.batch_size) assert batch is not None tensor_obs0 = batch['obs0'].split([self.nb_pos, self.nb_laser], dim=1) # Actor update self.actor.zero_grad() policy_loss = -self.critic( [tensor_obs0[0], tensor_obs0[1], self.actor(tensor_obs0)]) policy_loss = policy_loss.mean() policy_loss.backward() self.actor_optim.step() if pass_batch: return policy_loss.item(), batch else: return policy_loss.item() def update_critic_target(self, soft_update=True): for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()): target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau \ if soft_update else param.data) def update_actor_target(self, soft_update=True): for target_param, 
param in zip(self.actor_target.parameters(), self.actor.parameters()): target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau \ if soft_update else param.data) def apply_lr_decay(self): if self.lr_decay > 0: self.lr_coef = self.lr_decay * self.lr_coef / (self.lr_coef + self.lr_decay) for (opt, base_lr) in ((self.actor_optim, self.actor_lr), (self.critic_optim, self.critic_lr)): for group in opt.param_groups: group['lr'] = base_lr * self.lr_coef def calc_last_error(self): # Sample batch batch = self.memory.sample_last(self.batch_size) tensor_obs0 = batch['obs0'].split([self.nb_pos, self.nb_laser], dim=1) tensor_obs1 = batch['obs1'].split([self.nb_pos, self.nb_laser], dim=1) # Prepare for the target q batch with torch.no_grad(): next_q_values = self.critic_target([ tensor_obs1[0], tensor_obs1[1], self.actor_target(tensor_obs1), ]) target_q_batch = batch['rewards'] + self.discount * ( 1 - batch['terminals1']) * next_q_values q_batch = self.critic_target( [tensor_obs0[0], tensor_obs0[1], batch['actions']]) value_loss = nn.functional.mse_loss(q_batch, target_q_batch) return value_loss.item() def select_action(self, s_t, apply_noise): s_t = torch.tensor(np.vstack(s_t), dtype=torch.float32, requires_grad=False).cuda() s_t = s_t.split([self.nb_pos, self.nb_laser], dim=1) with torch.no_grad(): action = self.actor(s_t).cpu().numpy() action = np.clip(action, -1., 1.) return action def load_weights(self, output): self.actor = torch.load('{}/actor.pkl'.format(output)) self.critic = torch.load('{}/critic.pkl'.format(output)) def save_model(self, output): torch.save(self.actor, '{}/actor.pkl'.format(output)) torch.save(self.critic, '{}/critic.pkl'.format(output)) def get_actor_buffer(self): actor_buffer = io.BytesIO() torch.save(self.actor, actor_buffer) return actor_buffer
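# update_critic_target / update_actor_target above implement Polyak averaging,
# theta_target <- (1 - tau) * theta_target + tau * theta. A standalone PyTorch
# sketch of the same rule over generic modules; it is not the class's method.
import torch

def soft_update(target_net, source_net, tau=0.001):
    """theta_target <- (1 - tau) * theta_target + tau * theta."""
    with torch.no_grad():
        for t, s in zip(target_net.parameters(), source_net.parameters()):
            t.mul_(1.0 - tau).add_(tau * s)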
class DDPG(object): def __init__(self, nb_actions, nb_states, layer_norm, obs_norm, actor_lr, critic_lr, SGLD_coef, noise_decay, lr_decay, batch_size, discount, tau, pool_size, parameters_noise, action_noise, SGLD_mode, pool_mode, with_cuda): self.nb_actions = nb_actions self.nb_states = nb_states self.layer_norm = layer_norm self.parameters_noise = parameters_noise self.action_noise = action_noise self.batch_size = batch_size self.discount = discount self.tau = tau self.pool_size = pool_size self.critic_lr = critic_lr self.actor_lr = actor_lr self.SGLD_coef = SGLD_coef self.noise_coef = 1 self.noise_decay = noise_decay self.lr_coef = 1 self.lr_decay = lr_decay self.SGLD_mode = SGLD_mode self.pool_mode = pool_mode self.with_cuda = with_cuda self.actor = Actor(nb_states=self.nb_states, nb_actions=self.nb_actions, layer_norm=self.layer_norm) self.actor_target = Actor(nb_states=self.nb_states, nb_actions=self.nb_actions, layer_norm=self.layer_norm) self.critic = Critic(nb_states, nb_actions, layer_norm=self.layer_norm) self.critic_target = Critic(nb_states, nb_actions, layer_norm=self.layer_norm) if self.with_cuda: self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() hard_update(self.actor_target, self.actor) hard_update(self.critic_target, self.critic) #self.actor_optim = SGD(self.actor.parameters(), lr=actor_lr, momentum=0.9,weight_decay = 0.01) self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr) #self.critic_optim = SGD(self.critic.parameters(), lr=critic_lr, momentum=0.9,weight_decay = 0.01) self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr) self.memory = Memory(int(1e6), (nb_actions, ), (nb_states, ), with_cuda) self.obs_norm = obs_norm if self.obs_norm: self.run_obs_norm = Run_Normalizer((nb_states, ), self.with_cuda) self.is_training = True if self.pool_size > 0: self.agent_pool = Agent_pool(self.pool_size) self.s_t = None self.a_t = None def store_transition(self, s_t, a_t, r_t, s_t1, done_t): if self.is_training: self.memory.append(s_t, a_t, r_t, s_t1, done_t) if self.obs_norm: self.run_obs_norm.observe(s_t) self.s_t = s_t1 def update(self): # Sample batch batch = self.memory.sample(self.batch_size) tensor_obs0 = batch['obs0'] tensor_obs1 = batch['obs1'] if self.obs_norm: tensor_obs0 = self.run_obs_norm.normalize(tensor_obs0) tensor_obs1 = self.run_obs_norm.normalize(tensor_obs1) # Prepare for the target q batch with torch.no_grad(): next_q_values = self.critic_target([ tensor_obs1, self.actor_target(tensor_obs1), ]) target_q_batch = batch['rewards'] + \ self.discount*(1-batch['terminals1'])*next_q_values # Critic update self.critic.zero_grad() q_batch = self.critic([tensor_obs0, batch['actions']]) value_loss = nn.functional.mse_loss(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() if (self.SGLD_mode == 2) or (self.SGLD_mode == 3): SGLD_update(self.critic, self.critic_lr * self.lr_coef, self.SGLD_coef) # Actor update self.actor.zero_grad() policy_loss = -self.critic([tensor_obs0, self.actor(tensor_obs0)]) policy_loss = policy_loss.mean() policy_loss.backward() self.actor_optim.step() if (self.SGLD_mode == 1) or (self.SGLD_mode == 3): SGLD_update(self.actor, self.actor_lr * self.lr_coef, self.SGLD_coef) # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) return value_loss.item(), policy_loss.item() def apply_lr_decay(self): if self.lr_decay > 0: self.lr_coef = self.lr_decay * self.lr_coef / (self.lr_coef + self.lr_decay) 
self.critic_optim.param_groups[0][ 'lr'] = self.critic_lr * self.lr_coef def apply_noise_decay(self): if self.noise_decay > 0: self.noise_coef = self.noise_decay * self.noise_coef / ( self.noise_coef + self.noise_decay) def select_action(self, random=False, s_t=None, if_noise=True): if random: action = np.random.uniform(-1., 1., self.nb_actions) else: if s_t is None: raise RuntimeError() s_t = torch.tensor(s_t, dtype=torch.float32, requires_grad=False) if self.with_cuda: s_t = s_t.cuda() if self.obs_norm: s_t = self.run_obs_norm.normalize(s_t) with torch.no_grad(): action = self.actor(s_t).cpu().numpy().squeeze(0) if if_noise & (self.action_noise is not None): action += self.is_training * max(self.noise_coef, 0) * self.action_noise() action = np.clip(action, -1., 1.) self.a_t = action return action def load_weights(self, output): self.actor = torch.load('{}/actor.pkl'.format(output)) self.critic = torch.load('{}/critic.pkl'.format(output)) if self.obs_norm: self.run_obs_norm = torch.load('{}/obs_norm.pkl'.format(output)) def save_model(self, output): torch.save(self.actor, '{}/actor.pkl'.format(output)) torch.save(self.critic, '{}/critic.pkl'.format(output)) if self.obs_norm: torch.save(self.run_obs_norm, '{}/obs_norm.pkl'.format(output)) def get_actor_buffer(self): buffer = io.BytesIO() torch.save(self.actor, buffer) return buffer def get_norm_param(self): return self.run_obs_norm.mean.cpu(), self.run_obs_norm.var.cpu() #TODO recode agent pool def append_actor(self): self.agent_pool.actor_append(self.actor.state_dict(), self.actor_target.state_dict()) def pick_actor(self): actor, actor_target = self.agent_pool.get_actor() self.actor.load_state_dict(actor) self.actor_target.load_state_dict(actor_target) def append_critic(self): self.agent_pool.critic_append(self.critic.state_dict(), self.critic_target.state_dict()) def pick_critic(self): critic, critic_target = self.agent_pool.get_critic() self.critic.load_state_dict(critic) self.critic_target.load_state_dict(critic_target) def append_actor_critic(self): self.agent_pool.actor_append(self.actor.state_dict(), self.actor_target.state_dict()) self.agent_pool.critic_append(self.critic.state_dict(), self.critic_target.state_dict()) def pick_actor_critic(self): actor, actor_target, critic, critic_target = self.agent_pool.get_agent( ) self.actor.load_state_dict(actor) self.actor_target.load_state_dict(actor_target) self.critic.load_state_dict(critic) self.critic_target.load_state_dict(critic_target) def append_agent(self): if self.pool_mode == 1: self.append_actor() elif self.pool_mode == 2: self.append_critic() elif self.pool_mode == 3: self.append_actor_critic() def pick_agent(self): if self.pool_mode == 1: self.pick_actor() elif self.pool_mode == 2: self.pick_critic() elif self.pool_mode == 3: self.pick_actor_critic() def reset(self, obs): self.s_t = obs if self.action_noise is not None: self.action_noise.reset()
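# apply_lr_decay and apply_noise_decay above use the recurrence
# coef <- decay * coef / (coef + decay), i.e. 1/coef grows by 1/decay per call,
# so the coefficient shrinks roughly like 1 / (1 + n/decay). A small standalone
# sketch of that schedule; the function name and defaults are illustrative.
def decayed_coef(n_calls, decay=10.0, coef=1.0):
    """Coefficient after n_calls applications of coef <- decay*coef/(coef+decay)."""
    for _ in range(n_calls):
        coef = decay * coef / (coef + decay)
    return coef

# e.g. with decay=10.0: decayed_coef(10) is roughly 0.5, decayed_coef(90) roughly 0.1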
class Agent: """ class implements agent """ def __init__(self, state_size, action_size, args): self.args = args with open( os.path.dirname( os.path.abspath(inspect.getfile(inspect.currentframe()))) + '/agent_args.json') as f: data = json.load(f) self.initial_epsilon = int( data[self.args.environment]["initial_epsilon"]) self.final_epsilon = float( data[self.args.environment]["final_epsilon"]) self.current_epsilon = self.initial_epsilon self.epsilon_decay = float( data[self.args.environment]["epsilon_decay"]) self.gamma = float(data[self.args.environment]["gamma"]) self.minibatch_size = int( data[self.args.environment]["minibatch_size"]) self.learning_rate = float( data[self.args.environment]["learning_rate"]) self.fraction_update = float( data[self.args.environment]["fraction_update"]) self.loss = data[self.args.environment]["loss"] self.memory_type = self.args.memory self.memory_size = int(data[self.args.environment]["memory_size"]) if self.memory_type == "basic": self.memory = deque(maxlen=self.memory_size) else: self.memory = Memory(self.memory_size) self.action_size = action_size self.state_size = state_size if self.args.mdl_blueprint and not self.args.dont_save: self.mdl_blueprint = True else: self.mdl_blueprint = False network = Network(state_size, action_size, self.learning_rate, self.loss, [True, self.mdl_blueprint]) self.net_units = None if data[self.args.environment]["net_units"] != "None": self.net_units = [ int(i) for i in data[self.args.environment]["net_units"] ] self.model_type = self.args.network if self.model_type == "2layer_bsc_mdl": self.model_net = network.make_2layer_mdl(self.net_units) self.target_net = network.make_2layer_mdl(self.net_units) elif self.model_type == "2layer_duel_mdl": self.model_net = network.make_2layer_duel_mdl(self.net_units) self.target_net = network.make_2layer_duel_mdl(self.net_units) elif self.model_type == "bsc_img_mdl": self.model_net = network.make_bsc_img_mdl() self.target_net = network.make_bsc_img_mdl() elif self.model_type == "duel_img_model": self.model_net = network.make_duel_img_mdl() self.target_net = network.make_duel_img_mdl() elif self.model_type == "1layer_ram_mdl": self.model_net = network.make_1layer_mdl(self.net_units) self.target_net = network.make_1layer_mdl(self.net_units) self.update_target_net() self.algorithm = self.args.algorithm self.algorithms = { "DQN": self.train_dqn, "DQN+TN": self.train_target_dqn, "DDQN": self.train_ddqn, } def update_target_net(self): """ method updates target network """ self.target_net.set_weights(self.model_net.get_weights()) print("[Target network was updated.]") def update_target_net_partially(self): """ method updates target network by parts """ weights_model = self.model_net.get_weights() weights_target = self.target_net.get_weights() for i in range(len(weights_target)): weights_target[i] = weights_model[ i] * self.fraction_update + weights_target[i] * ( 1 - self.fraction_update) self.target_net.set_weights(weights_target) print("[Target network was updated by parts.]") def get_error(self, state, action, reward, next_state, done): """ method returns difference between Q-value from primary and target network """ q_value = self.model_net.predict(np.array([state])) ns_model_pred = self.model_net.predict(np.array([next_state])) ns_target_pred = self.target_net.predict(np.array([next_state])) obs_error = q_value[0][action] if done == 1: q_value[0][action] = reward else: q_value[0][action] = reward + self.gamma * ns_target_pred[0][ np.argmax(ns_model_pred)] obs_error = abs(obs_error - 
q_value[0][action]) return obs_error def remember(self, state, action, reward, next_state, done, rand_agent): """ method saves observation (experience) to experience replay memory """ if self.memory_type == "basic": self.memory.append((state, action, reward, next_state, done)) else: if rand_agent: obs_error = abs(reward) else: obs_error = self.get_error(state, action, reward, next_state, done) self.memory.add_observation( (state, action, reward, next_state, done), obs_error) def clear_memory(self): """ method clears replay memory """ self.memory.clear() def decrease_epsilon(self): """ method decreases epsilon """ if self.current_epsilon > self.final_epsilon: if (self.current_epsilon - self.epsilon_decay) > self.final_epsilon: self.current_epsilon = self.current_epsilon - self.epsilon_decay else: self.current_epsilon = self.final_epsilon def get_action(self, task, state, non_normalized_state, epsilon): """ method returns action to take """ if not epsilon: q_value = self.model_net.predict(np.array([state])) else: if np.random.rand() <= self.current_epsilon: if task.name == "2048-v0": possible_actions = possible_moves(non_normalized_state) while True: rand_action = np.random.randint(0, self.action_size, size=1)[0] if possible_actions[rand_action] == 1: return rand_action else: return np.random.randint(0, self.action_size, size=1)[0] else: q_value = self.model_net.predict(np.array([state])) if task.name == "2048-v0": possible_actions = possible_moves(non_normalized_state) while True: chosen_action = np.argmax(q_value) if possible_actions[chosen_action] == 1: return chosen_action else: q_value[0][chosen_action] = -100 return np.argmax(q_value) def get_minibatch(self): """ method returns minibatch from diffrent memory types """ if self.memory_type == "basic": minibatch = random.sample(list(self.memory), self.minibatch_size) state = np.array([i[0] for i in minibatch]) action = [i[1] for i in minibatch] reward = [i[2] for i in minibatch] next_state = np.array([i[3] for i in minibatch]) done = [i[4] for i in minibatch] else: minibatch = self.memory.sample(self.minibatch_size) state = np.array([i[1][0] for i in minibatch]) action = [i[1][1] for i in minibatch] reward = [i[1][2] for i in minibatch] next_state = np.array([i[1][3] for i in minibatch]) done = [i[1][4] for i in minibatch] return state, action, reward, next_state, done def train(self): """ method trains agent with selected algorithm """ self.algorithms[self.algorithm]() def train_dqn(self): """ method trains agent using DQN """ if self.memory_type == "basic": if len(self.memory) >= self.minibatch_size: state, action, reward, next_state, done = self.get_minibatch() else: return else: if self.memory.length >= self.minibatch_size: state, action, reward, next_state, done = self.get_minibatch() else: return errors = np.zeros(self.minibatch_size) possible_actions_curr = [] if self.args.environment == "2048-v0": for i, item in enumerate(state): possible_actions_curr.append(possible_moves(item)) state = state / 16384.0 - 0.5 next_state = next_state / 16384.0 - 0.5 q_value = self.model_net.predict(np.array(state)) ns_model_pred = self.model_net.predict(np.array(next_state)) for i in range(0, self.minibatch_size): errors[i] = q_value[i][action[i]] if done[i] == 1: q_value[i][action[i]] = reward[i] else: q_value[i][action[i]] = reward[i] + self.gamma * np.max( ns_model_pred[i]) errors[i] = abs(errors[i] - q_value[i][action[i]]) for i, item in enumerate(possible_actions_curr): for e, elem in enumerate(item): if elem == 0: q_value[i][e] = -1 
self.model_net.fit(state, q_value, epochs=1, verbose=0) if self.memory_type == "dueling": self.memory.update_minibatch(minibatch, errors) def train_target_dqn(self): """ method trains agent using DQN with target network """ if self.memory_type == "basic": if len(self.memory) >= self.minibatch_size: state, action, reward, next_state, done = self.get_minibatch() else: return else: if self.memory.length >= self.minibatch_size: state, action, reward, next_state, done = self.get_minibatch() else: return errors = np.zeros(self.minibatch_size) possible_actions_curr = [] if self.args.environment == "2048-v0": for i, item in enumerate(state): possible_actions_curr.append(possible_moves(item)) state = state / 16384.0 - 0.5 next_state = next_state / 16384.0 - 0.5 q_value = self.model_net.predict(np.array(state)) ns_target_pred = self.target_net.predict(np.array(next_state)) for i in range(0, self.minibatch_size): errors[i] = q_value[i][action[i]] if done[i] == 1: q_value[i][action[i]] = reward[i] else: q_value[i][action[i]] = reward[i] + self.gamma * np.max( ns_target_pred[i]) errors[i] = abs(errors[i] - q_value[i][action[i]]) for i, item in enumerate(possible_actions_curr): for e, elem in enumerate(item): if elem == 0: q_value[i][e] = -1 self.model_net.fit(state, q_value, epochs=1, verbose=0) if self.memory_type == "dueling": self.memory.update_minibatch(minibatch, errors) def train_ddqn(self): """ method trains agent using DDQN """ if self.memory_type == "basic": if len(self.memory) >= self.minibatch_size: state, action, reward, next_state, done = self.get_minibatch() else: return else: if self.memory.length >= self.minibatch_size: state, action, reward, next_state, done = self.get_minibatch() else: return errors = np.zeros(self.minibatch_size) possible_actions_curr = [] if self.args.environment == "2048-v0": for i, item in enumerate(state): possible_actions_curr.append(possible_moves(item)) state = state / 16384.0 - 0.5 next_state = next_state / 16384.0 - 0.5 q_value = self.model_net.predict(state) ns_model_pred = self.model_net.predict(next_state) ns_target_pred = self.target_net.predict(next_state) for i in range(0, self.minibatch_size): errors[i] = q_value[i][action[i]] if done[i] == 1: q_value[i][action[i]] = reward[i] else: q_value[i][action[ i]] = reward[i] + self.gamma * ns_target_pred[i][np.argmax( ns_model_pred[i])] errors[i] = abs(errors[i] - q_value[i][action[i]]) for i, item in enumerate(possible_actions_curr): for e, elem in enumerate(item): if elem == 0: q_value[i][e] = -1 self.model_net.fit(state, q_value, epochs=1, verbose=0) if self.memory_type == "dueling": self.memory.update_minibatch(minibatch, errors) def load_model_weights(self, name): """ method loads weights to primary neural network """ self.model_net.load_weights(name) print("[Model has been loaded from \"{}\".]".format(name)) def save_model_weights(self, name): """ method saves weights of primary neural network """ self.model_net.save_weights("./model-{}".format(name)) print("[Model was saved to \"./model-{}\".]".format(name)) def load_target_weights(self, name): """ method loads weights to target neural network """ self.target_net.load_weights(name) print("[Target model has been loaded from \"{}\".]".format(name)) def save_target_weights(self, name): """ method saves weights of target neural network """ self.target_net.save_weights("./target-{}".format(name)) print("[Target model was saved to \"./target-{}\".]".format(name))
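# train_ddqn above uses the Double-DQN target: the online network picks the
# greedy next action and the target network evaluates it,
# q(s, a) <- r + (1 - done) * gamma * Q_target(s', argmax_a' Q_online(s', a')).
# A minimal NumPy sketch of that bootstrap term; shapes and the helper name are
# assumptions, not part of the Agent class.
import numpy as np

def ddqn_targets(q_online_next, q_target_next, rewards, dones, gamma=0.99):
    """r + (1 - done) * gamma * Q_target(s', argmax_a' Q_online(s', a'))."""
    best_actions = q_online_next.argmax(axis=1)
    rows = np.arange(len(rewards))
    bootstrap = q_target_next[rows, best_actions]
    return rewards + (1.0 - dones) * gamma * bootstrap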
class QLearning(Learner):
    def __init__(self, n_actions, opt=Adam, opt_args={}, loss=MSELoss, gamma=0.99,
                 do_target=True, memory_len=10000, name=None,
                 memory_shape=(4, 84, 84), initial_eps=0.1, final_eps=0.01,
                 decay_steps=int(1e6), memory_dtype=torch.uint8):
        self.n_actions = n_actions
        self._memory = Memory(memory_len, memory_shape, dtype=memory_dtype)
        self.Q = Sequential(Conv2d(4, 32, kernel_size=8, stride=4), LeakyReLU(),
                            Conv2d(32, 64, kernel_size=4, stride=2), LeakyReLU(),
                            Conv2d(64, 64, kernel_size=3, stride=1), LeakyReLU(),
                            Flatten(), Linear(3136, 512), LeakyReLU(),
                            Linear(512, self.n_actions))
        self._name = name
        self.gamma = gamma
        self.opt = opt(self.Q.parameters(), **opt_args)
        self._base_loss_fn = MSELoss()
        self._steps = 0
        self.eps = initial_eps
        # self.decay = (final_eps / initial_eps) ** (1 / decay_steps)
        # Linear Decay
        self.decay = (initial_eps - final_eps) / decay_steps

    def learn(self, batch_size=100, n_samples=100):
        if len(self._memory) < n_samples:
            return 'n/a'
        self.Q.train()
        X, y = self._build_dataset(n_samples)
        y_pred = self.Q(X)
        loss = self._base_loss_fn(y, y_pred)
        self.opt.zero_grad()
        loss.backward()
        clip_grad_value_(self.Q.parameters(), 1)
        self.opt.step()
        self.Q.eval()
        return loss.item()

    def _build_dataset(self, n):
        with torch.no_grad():
            s_s, a_s, r_s, sp_s, done_mask = self._memory.sample(n)
            vhat_sp_s = torch.max(self.Q(sp_s.float()), dim=1).values
            vhat_sp_s[done_mask] = 0
            targets = self.Q(s_s.float())
            for idx, t in enumerate(targets):
                t[int(a_s[idx].byte())] = r_s[idx] + self.gamma * vhat_sp_s[idx]
            X = s_s.float()
            y = targets
        return X, y

    def handle_transition(self, s, a, r, sp, done):
        s = self._convert_to_torch(s)
        sp = self._convert_to_torch(sp)
        self._memory.append(
            (s, torch.from_numpy(np.array([a]))[0], r, sp, done))
        if (self._steps % 4) == 0:
            self.learn(n_samples=1024)
        self._steps += 1

    def get_action_vals(self, s):
        s = self._convert_to_torch(s)
        return self.Q(s[None, :])

    def exploration_strategy(self, s):
        # self.eps *= self.decay
        self.eps -= self.decay
        if np.random.random() > self.eps:
            ps = np.zeros(self.n_actions)
            best_action = torch.argmax(self.Q(s[None, :]))
            try:
                ps[best_action] = 1.
            except:
                print(self._name)
                exit()
        else:
            ps = np.full(self.n_actions, 1 / self.n_actions)
        return ps

    def deterministic_strategy(self, s):
        s = self._convert_to_torch(s)
        eps = 0.05
        if np.random.random() > eps:
            ps = np.zeros(self.n_actions)
            best_action = torch.argmax(self.Q(s[None, :])).detach().numpy()
            ps[best_action] = 1.
        else:
            ps = np.full(self.n_actions, 1 / self.n_actions)
        return ps
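# _build_dataset above regresses Q(s, .) toward targets where only the taken
# action's entry is replaced by r + gamma * max_a' Q(s', a'), with the bootstrap
# zeroed on terminal transitions. A compact PyTorch sketch of that target
# construction; tensor shapes and the helper name are assumptions.
import torch

def q_learning_targets(q_s, q_sp, actions, rewards, done_mask, gamma=0.99):
    # q_s, q_sp: (batch, n_actions) Q-values for s and s'
    # actions: LongTensor of taken actions, rewards: FloatTensor,
    # done_mask: BoolTensor marking terminal transitions
    with torch.no_grad():
        bootstrap = q_sp.max(dim=1).values
        bootstrap[done_mask] = 0.0
        targets = q_s.clone()
        targets[torch.arange(len(actions)), actions] = rewards + gamma * bootstrap
    return targets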
class Agent(object): def __init__(self, observation_space_dims, action_space, discount_factor=.96, model_path='./model'): self.discount_factor = discount_factor self.model_path = model_path self.global_step = 0 self.history = History(log_path=model_path) self.max_reward = 1000 self.lock = th.Lock() self.lock_swap = th.Lock() self.action_shape = action_space.shape #(19,) self.observation_shape = (observation_space_dims, ) #(321,) self.inputdims = observation_space_dims self.memory = None self.block_training = False print("observation shape:", self.observation_shape) print("action shape: ", self.action_shape) self.is_continuous = True if isinstance(action_space, gym.spaces.Box) else False if self.is_continuous: low = action_space.low high = action_space.high num_of_actions = action_space.shape[0] self.action_bias = high / 2. + low / 2. self.action_multiplier = high - self.action_bias def clamp_action(actions): return np.clip(actions, a_max=action_space.high, a_min=action_space.low) self.clamp_action = clamp_action else: # not supported raise RuntimeError( 'This version of DDPG only supports continuous action space') self.outputdims = num_of_actions ids, ods = self.inputdims, self.outputdims #print('inputs:{}, outputs:{}'.format(ids, ods)) # start TF #tf.reset_default_graph() self.tf_graph = tf.Graph() self.sess = tf.Session(graph=self.tf_graph) # setup model with self.tf_graph.as_default(): self.nr_networks = hyper.nr_agents self.actor = [] self.critic = [] self.actor_target = [] self.critic_target = [] for i in range(self.nr_networks): self.actor.append( self.create_actor_network(ids, ods, 'actor_o' + str(i))) self.critic.append( self.create_critic_network(ids, ods, 'critic_o' + str(i))) self.actor_target.append( self.create_actor_network(ids, ods, 'actor_t' + str(i))) self.critic_target.append( self.create_critic_network(ids, ods, 'critic_t' + str(i))) # setup tf actions self.train, self.predict, self.sync_target, self.evaluate, self.swap_actors = self.train_step_gen( ) # setup model saving self.saver = tf.train.Saver(max_to_keep=10000) # init tf self.sess.run(tf.global_variables_initializer()) # sync model => model_target (on first run) for i in range(self.nr_networks): self.sync_target(i) def setup_memory(self): if self.memory == None: print("Creating memory buffer, hold on...") limit = hyper.memory_size self.memory = Memory(limit=limit, action_shape=self.action_shape, observation_shape=self.observation_shape) def create_actor_network(self, num_inputs, num_outputs, scope): def actor_model(state): with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): x = tf.layers.dense(state, 512, kernel_initializer=w_init(3.0, num_inputs), name='a1', reuse=tf.AUTO_REUSE) x = tf.nn.leaky_relu(x, alpha=0.35) x = tf.layers.dense(x, 256, kernel_initializer=w_init(3.0, 512), name='a2', reuse=tf.AUTO_REUSE) x = tf.nn.leaky_relu(x, alpha=0.35) x = tf.layers.dense(x, 256, kernel_initializer=w_init(3.0, 256), name='a3', reuse=tf.AUTO_REUSE) x = tf.contrib.layers.layer_norm(x, center=True, scale=True) x = tf.nn.leaky_relu(x, alpha=0.35) x = tf.layers.dense(x, 256, kernel_initializer=w_init(2.0, 256), name='a4', reuse=tf.AUTO_REUSE) x = tf.nn.relu(x) x = tf.layers.dense(x, 256, kernel_initializer=w_init(2.0, 256), name='a5', reuse=tf.AUTO_REUSE) x = tf.nn.relu(x) x = tf.layers.dense(x, 256, kernel_initializer=w_init(2.0, 256), name='a6', reuse=tf.AUTO_REUSE) x = tf.nn.relu(x) x = tf.layers.dense(x, num_outputs, kernel_initializer=w_init(0.5, 256), name='a9', reuse=tf.AUTO_REUSE) x = tf.nn.tanh(x) * 
self.action_multiplier + self.action_bias return x return actor_model def create_critic_network(self, num_inputs, num_outputs, scope): def critic_model(input): state = input[0] action = input[1] with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): x = tf.layers.dense(state, 256, kernel_initializer=w_init(3.0, num_inputs), name='c1s', reuse=tf.AUTO_REUSE) x = tf.nn.leaky_relu(x, alpha=0.35) x = tf.layers.dense(x, 256, kernel_initializer=w_init(3.0, 256), name='c2s', reuse=tf.AUTO_REUSE) x = tf.nn.leaky_relu(x, alpha=0.35) y = tf.layers.dense(action, 256, kernel_initializer=w_init( 3.0, num_outputs), name='c1a', reuse=tf.AUTO_REUSE) y = tf.nn.leaky_relu(y, alpha=0.35) x = tf.concat([x, y], axis=1) x = tf.layers.dense(x, 256, kernel_initializer=w_init(3.0, 256 + 256), name='c2', reuse=tf.AUTO_REUSE) x = tf.contrib.layers.layer_norm(x, center=True, scale=True) x = tf.nn.leaky_relu(x, alpha=0.35) x = tf.layers.dense(x, 256, kernel_initializer=w_init(3.0, 256), name='c3', reuse=tf.AUTO_REUSE) x = tf.nn.leaky_relu(x, alpha=0.35) x = tf.layers.dense(x, 256, kernel_initializer=w_init(3.0, 256), name='c4', reuse=tf.AUTO_REUSE) x = tf.nn.leaky_relu(x, alpha=0.35) x = tf.layers.dense(x, 1, kernel_initializer=w_init(1.0, 256), name='c9', reuse=tf.AUTO_REUSE) return x return critic_model def train_step_gen(self): s1 = tf.placeholder(tf.float32, shape=[None, self.inputdims]) a1 = tf.placeholder(tf.float32, shape=[None, self.outputdims]) r1 = tf.placeholder(tf.float32, shape=[None, 1]) isdone = tf.placeholder(tf.float32, shape=[None, 1]) s2 = tf.placeholder(tf.float32, shape=[None, self.inputdims]) tau = tf.Variable(1e-3, name='tau', trainable=False) self.train_ops = [] self.predict_ops = [] self.sync_target_ops = [] self.evaluate_ops = [] self.actor_vars = [] for i in range(self.nr_networks): scope = 'ac_' + str(i) with tf.variable_scope(scope): # 1. update the critic a2 = self.actor_target[i](s2) q2 = self.critic_target[i]([s2, a2]) q1_target = r1 + (1 - isdone) * (self.discount_factor + i * hyper.discount_step) * q2 q1_predict = self.critic[i]([s1, a1]) critic_loss = tf.reduce_mean((q1_target - q1_predict)**2) # 2. update the actor a1_predict = self.actor[i](s1) q1_predict2 = self.critic[i]([s1, a1_predict]) actor_loss = tf.reduce_mean(-q1_predict2) # 3. shift the weights (aka target network) aw = tf.trainable_variables(scope=scope + '/actor_o') cw = tf.trainable_variables(scope=scope + '/critic_o') atw = tf.trainable_variables(scope=scope + '/actor_t') ctw = tf.trainable_variables(scope=scope + '/critic_t') self.actor_vars.append([aw, atw]) one_m_tau = 1 - tau shift1 = [ tf.assign(atw[i], aw[i] * tau + atw[i] * (one_m_tau)) for i, _ in enumerate(aw) ] shift2 = [ tf.assign(ctw[i], cw[i] * tau + ctw[i] * (one_m_tau)) for i, _ in enumerate(cw) ] # 4. 
inference a_infer = self.actor[i](s1) q_infer = self.critic[i]([s1, a_infer]) # optimizer with tf.variable_scope('opt_a'): opt_actor = tf.train.AdamOptimizer( hyper.lr_actor) #, name='Adam' default astep = opt_actor.minimize(actor_loss, var_list=aw) with tf.variable_scope('opt_c'): opt_critic = tf.train.AdamOptimizer( hyper.lr_critic) #, name='Adam' cstep = opt_critic.minimize(critic_loss, var_list=cw) self.train_ops.append( [critic_loss, actor_loss, cstep, astep, shift1, shift2]) self.predict_ops.append([a_infer, q_infer]) self.sync_target_ops.append([shift1, shift2]) self.evaluate_ops.append([q1_predict]) # setup ops for swapping actors self.copy_ops = [] with tf.variable_scope('copy'): self.actor_backup = [] self.actor_t_backup = [] # Create variables of actor shape to hold a backup # I'm sure there's a better way to do this for _, av in enumerate(self.actor_vars[0][0]): self.actor_backup.append(tf.Variable(av)) for _, av in enumerate(self.actor_vars[0][1]): self.actor_t_backup.append(tf.Variable(av)) # copy the first one self.backup_cp = [ tf.assign(self.actor_backup[k], self.actor_vars[0][0][k]) for k, _ in enumerate(self.actor_vars[0][0]) ] self.backup_cp_t = [ tf.assign(self.actor_t_backup[k], self.actor_vars[0][1][k]) for k, _ in enumerate(self.actor_vars[0][1]) ] # copy actors to index-1 for i in range(self.nr_networks - 1): cp = [ tf.assign(self.actor_vars[i][0][k], self.actor_vars[i + 1][0][k]) for k, _ in enumerate(self.actor_vars[i][0]) ] cp_t = [ tf.assign(self.actor_vars[i][1][k], self.actor_vars[i + 1][1][k]) for k, _ in enumerate(self.actor_vars[i][1]) ] self.copy_ops.append([cp, cp_t]) # copy the backup to the last last_id = self.nr_networks - 1 self.last_cp = [ tf.assign(self.actor_vars[last_id][0][k], self.actor_backup[k]) for k, _ in enumerate(self.actor_vars[last_id][0]) ] self.last_cp_t = [ tf.assign(self.actor_vars[last_id][1][k], self.actor_t_backup[k]) for k, _ in enumerate(self.actor_vars[last_id][1]) ] def swap_actors(): with self.lock_swap: with self.lock: # could setup control_dependencies/groups, but this is not run very often self.sess.run([self.backup_cp, self.backup_cp_t], feed_dict={}) for _, cp in enumerate(self.copy_ops): self.sess.run(cp, feed_dict={}) self.sess.run([self.last_cp, self.last_cp_t], feed_dict={}) def train(memory, i): [s1d, a1d, r1d, isdoned, s2d] = memory res = self.sess.run(self.train_ops[i], feed_dict={ s1: s1d, a1: a1d, r1: r1d, isdone: isdoned, s2: s2d, tau: hyper.tau }) return res def predict(state, i): res = self.sess.run(self.predict_ops[i], feed_dict={s1: state}) return res def sync_target(i): self.sess.run(self.sync_target_ops[i], feed_dict={tau: 1.}) def evaluate(state, action, i): [qv] = self.sess.run(self.evaluate_ops[i], feed_dict={ s1: state, a1: action }) return qv return train, predict, sync_target, evaluate, swap_actors def test_swap_actors(self): for i in range(self.nr_networks): print(self.sess.run(self.actor_vars[i][0][0][0][0], feed_dict={})) def get_max_action(self, observation): obs_b = np.reshape(observation, (1, len(observation))) # get actions all_actions = [] for aci in range(self.nr_networks): [actions, _] = self.predict(obs_b, aci) # setup for batches, get first action = actions[0] all_actions.append(action) # create combinations for ai in range(self.nr_networks): for aj in range(self.nr_networks): if aj < ai: a1 = all_actions[ai] a2 = all_actions[aj] avg_action = (a1 + a2) * 0.5 all_actions.append(avg_action) # and more combinations for ai in range(self.nr_networks): for aj in range(self.nr_networks): for 
ak in range(self.nr_networks): if aj < ai and ak < aj: a1 = all_actions[ai] a2 = all_actions[aj] a3 = all_actions[ak] avg_action = (a1 + a2 + a3) * (1.0 / 3.0) all_actions.append(avg_action) # for sanity for ai in range(len(all_actions)): all_actions[ai] = self.clamp_action(all_actions[ai]) # make it a np array, so we can batch it all_actions = np.asarray(all_actions) # stack observation for batching all_obs = np.repeat(obs_b, len(all_actions), axis=0) # get qv from each critic and sum them for ci in range(self.nr_networks): if ci == 0: all_qv_b = self.evaluate(all_obs, all_actions, ci) else: all_qv_b += self.evaluate(all_obs, all_actions, ci) all_qv_b /= self.nr_networks # evaluate actions max_action = None max_qv = None for ai in range(len(all_qv_b)): qv_a = all_qv_b[ai] if max_qv == None or qv_a > max_qv: max_action = all_actions[ai] max_qv = qv_a return max_action def get_all_actions(self, observation): obs_b = np.reshape(observation, (1, len(observation))) # get actions all_actions = [] for aci in range(self.nr_networks): [actions, _] = self.predict(obs_b, aci) # setup for batches, get first action = actions[0] all_actions.append(action) return all_actions def get_action_qs(self, observation, all_actions): obs_b = np.reshape(observation, (1, len(observation))) # make it a np array, so we can batch it all_actions = np.asarray(all_actions) # stack observation for batching all_obs = np.repeat(obs_b, len(all_actions), axis=0) # get qv from each critic and sum them for ci in range(self.nr_networks): if ci == 0: all_qv_b = self.evaluate(all_obs, all_actions, ci) else: all_qv_b += self.evaluate(all_obs, all_actions, ci) all_qv_b /= self.nr_networks return all_qv_b def get_action(self, observation, i): obs = np.reshape(observation, (1, len(observation))) [actions, q] = self.predict(obs, i) actions, q = actions[0], q[0] return actions def train_batch(self, i): if self.block_training: return # only if enough samples in memory if self.memory.size() > hyper.batch_size * 128: # sample a minibatch [s1, a1, r1, isdone, s2] = self.memory.sample_batch(hyper.batch_size) # print(s1.shape,a1.shape,r1.shape,isdone.shape,s2.shape) self.train([s1, a1, r1, isdone, s2], i) def append_memory(self, s1, a1, r1, isdone, s2): self.memory.append(s1, a1, r1, isdone, s2) def run_episode(self, fenv, max_steps=-1, training=False, render=False, noise_level=0., ac_id=0): time_start = time.time() noise_source = None if noise_level > 0.0: noise_source = one_fsq_noise() # warm up noise source for _ in range(2000): noise_source.one((self.outputdims, ), noise_level) max_steps = max_steps if max_steps > 0 else 50000 steps = 0 total_reward = 0 try: # this might be a remote env observation = np.array(fenv.reset()) except Exception as e: print('Bad things during reset. 
Episode terminated.', e) traceback.print_exc() return while True and steps <= max_steps: steps += 1 observation_before_action = observation # s1 exploration_noise = 0.0 if noise_level > 0.0: exploration_noise = noise_source.one((self.outputdims, ), noise_level) # get action action = None with self.lock_swap: if training: action = self.get_action(observation_before_action, ac_id) else: action = self.get_max_action(observation_before_action) # add noise to our actions, since our policy is deterministic if noise_level > 0.0: exploration_noise *= self.action_multiplier action += exploration_noise action = self.clamp_action(action) # step try: # can't send receive np arrays over pyro action_out = [float(action[i]) for i in range(len(action))] observation, reward, done, _info = fenv.step(action_out) observation = np.array(observation) except Exception as e: print('Bad things during step. Episode terminated.', e) traceback.print_exc() return # d1 isdone = 1 if done else 0 total_reward += reward # train if training == True: # The code works without this lock, but depending on training speed there is too much noise on updates. # The model always trains and is more stable with lock here with self.lock: self.append_memory(observation_before_action, action, reward, isdone, observation) # s1,a1,r1,isdone,s2 for i in range(self.nr_networks): self.train_batch(i) else: if render: fenv.render() if done: break totaltime = time.time() - time_start if training == True: self.global_step += 1 print( self.global_step, ': Episode done in {} steps in {:.2f} sec, {:.4f} sec/step, got reward :{:.2f}' .format(steps, totaltime, totaltime / steps, total_reward)) self.history.append_train(total_reward, noise_level, steps) else: print( 'Test done in {} steps in {:.2f} sec, {:.4f} sec/step, got reward :{:.2f}' .format(steps, totaltime, totaltime / steps, total_reward)) self.history.append_test(total_reward, self.global_step, steps) if render == False: # background test if total_reward > self.max_reward: self.max_reward = total_reward self.save_weights("max_model") print("Saved new max model with score: ", total_reward) return total_reward def save_weights(self, model_name="model"): with self.lock_swap: with self.lock: self.saver.save(self.sess, self.model_path + "/" + model_name, global_step=self.global_step) print("Saved model at global episode:", self.global_step) def load_weights(self, model=""): print('Loading Model...') path = "" if model == "": checkpoint = tf.train.get_checkpoint_state(self.model_path) if checkpoint: path = checkpoint.model_checkpoint_path else: path = self.model_path + "/" + model try: self.saver.restore(self.sess, path) print("Loaded model from checkpoint:", path) return True except Exception as ex: print("No model checkpoint available!") return False
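# get_max_action above scores candidate actions (each actor's proposal plus
# pairwise and triple averages) with the mean of all critics and returns the
# highest-scoring one. A compact NumPy sketch of that selection step, assuming
# `candidates` has shape (n, action_dim) and `q_fns` is a list of callables
# mapping (obs_batch, action_batch) to Q-values; names are illustrative.
import numpy as np

def pick_best_action(candidates, q_fns, observation):
    """Score each candidate action with the mean critic value, return the best."""
    obs_batch = np.repeat(observation[None, :], len(candidates), axis=0)
    scores = np.mean([np.ravel(q(obs_batch, candidates)) for q in q_fns], axis=0)
    return candidates[int(np.argmax(scores))]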
class DDPG(object): def __init__(self, memory_capacity, batch_size, prioritiy, noise_target_action=False, alpha=0.2, use_n_step=False, n_step_return=5, is_training=True, LAMBDA_BC=100, policy_delay=1, use_TD3=False, experiment_name='none', Q_value_range=(-250, 5)): self.batch_size = batch_size self.is_prioritiy = prioritiy self.n_step_return = n_step_return self.use_n_step = use_n_step self.LAMBDA_BC = LAMBDA_BC self.use_TD3 = use_TD3 self.experiment_name = experiment_name self.Q_value_range = Q_value_range # 限制q的范围,防止过估计. self.demo_percent = [] # demo 在 sample中所占比例 if prioritiy: from priority_memory import PrioritizedMemory self.memory = PrioritizedMemory(capacity=memory_capacity, alpha=alpha) else: from memory import Memory self.memory = Memory(limit=memory_capacity, action_shape=(4, ), observation_shape=(224, 224, 3), full_state_shape=(15, )) self.pointer = 0 # memory 计数器 self.sess = tf.InteractiveSession() # 创建一个默认会话 self.lambda_1_step = 0.5 # 1_step_return_loss的权重 self.lambda_n_step = 0.5 # n_step_return_loss的权重 self.beta = 0.6 self.act_limit = np.array([0.05, 0.05, 0.05, np.radians(90)]) # actor 比 critic 更新频率小 self.policy_delay_iterate = 0 self.policy_delay = policy_delay # 定义 placeholders self.observe_Input = tf.placeholder(tf.float32, [None, 15], name='observe_Input') self.observe_Input_ = tf.placeholder(tf.float32, [None, 15], name='observe_Input_') self.f_s = tf.placeholder(tf.float32, [None, 15], name='full_state_Input') self.f_s_ = tf.placeholder(tf.float32, [None, 15], name='fill_state_Input_') self.R = tf.placeholder(tf.float32, [None, 1], 'r') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights') self.n_step_steps = tf.placeholder(tf.float32, shape=(None, 1), name='n_step_reached') self.q_demo = tf.placeholder(tf.float32, [None, 1], name='Q_of_actions_from_memory') self.come_from_demo = tf.placeholder(tf.float32, [None, 1], name='Demo_index') self.action_memory = tf.placeholder(tf.float32, [None, 4], name='actions_from_memory') with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=(15, )) with tf.variable_scope('state_rms'): self.state_rms = RunningMeanStd(shape=(15, )) with tf.name_scope('obs_preprocess'): self.normalized_observe_Input = tf.clip_by_value( normalize(self.observe_Input, self.obs_rms), -10., 10.) self.normalized_observe_Input_ = tf.clip_by_value( normalize(self.observe_Input_, self.obs_rms), -10., 10.) with tf.name_scope('state_preprocess'): self.normalized_f_s0 = normalize(self.f_s, self.state_rms) self.normalized_f_s1 = normalize(self.f_s_, self.state_rms) with tf.variable_scope('Actor'): self.action = self.build_actor(self.normalized_observe_Input, scope='eval', trainable=True, is_training=is_training) self.action_ = self.build_actor(self.normalized_observe_Input_, scope='target', trainable=False, is_training=False) # Target policy smoothing, by adding clipped noise to target actions if noise_target_action: epsilon = tf.random_normal(tf.shape(self.action_), stddev=0.007) epsilon = tf.clip_by_value(epsilon, -0.01, 0.01) a2 = self.action_ + epsilon noised_action_ = tf.clip_by_value(a2, -self.act_limit, self.act_limit) else: noised_action_ = self.action_ with tf.variable_scope('Critic'): # Q值都要被clip 防止过估计. 
self.q_1 = tf.clip_by_value( self.build_critic(self.normalized_f_s0, self.action, scope='eval_1', trainable=True, is_training=is_training), self.Q_value_range[0], self.Q_value_range[1]) q_1_ = self.build_critic(self.normalized_f_s1, noised_action_, scope='target_1', trainable=False, is_training=False) if self.use_TD3: q_2 = tf.clip_by_value( self.build_critic(self.normalized_f_s0, self.action, scope='eval_2', trainable=True, is_training=is_training), self.Q_value_range[0], self.Q_value_range[1]) q_2_ = self.build_critic(self.normalized_f_s1, noised_action_, scope='target_2', trainable=False, is_training=False) # Collect networks parameters. It would make it more easily to manage them. self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval') self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target') self.ce1_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_1') self.ct1_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_1') if self.use_TD3: self.ce2_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_2') self.ct2_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_2') with tf.variable_scope('Soft_Update'): self.soft_replace_a = [ tf.assign(t, (1 - TAU) * t + TAU * e) for t, e in zip(self.at_params, self.ae_params) ] self.soft_replace_c = [ tf.assign(t, (1 - TAU) * t + TAU * e) for t, e in zip(self.ct1_params, self.ce1_params) ] if self.use_TD3: self.soft_replace_c += [ tf.assign(t, (1 - TAU) * t + TAU * e) for t, e in zip(self.ct2_params, self.ce2_params) ] # critic 的误差 为 (one-step-td 误差 + n-step-td 误差 + critic_online 的L2惩罚) # TD3: critic一共有4个, 算两套 critic的误差, 秀儿. with tf.variable_scope('Critic_Lose'): if self.use_TD3: min_q_ = tf.minimum(q_1_, q_2_) else: min_q_ = q_1_ self.q_target = self.R + (1. - self.terminals1) * GAMMA * min_q_ if self.use_n_step: self.n_step_target_q = self.R + ( 1. 
- self.terminals1) * tf.pow(GAMMA, self.n_step_steps) * min_q_ cliped_n_step_target_q = tf.clip_by_value( self.n_step_target_q, self.Q_value_range[0], self.Q_value_range[1]) cliped_q_target = tf.clip_by_value(self.q_target, self.Q_value_range[0], self.Q_value_range[1]) self.td_error_1 = tf.abs(cliped_q_target - self.q_1) if self.use_TD3: self.td_error_2 = tf.abs(cliped_q_target - q_2) if self.use_n_step: self.nstep_td_error_1 = tf.abs(cliped_n_step_target_q - self.q_1) if self.use_TD3: self.nstep_td_error_2 = tf.abs(cliped_n_step_target_q - q_2) L2_regular_1 = tf.contrib.layers.apply_regularization( tf.contrib.layers.l2_regularizer(0.001), weights_list=self.ce1_params) if self.use_TD3: L2_regular_2 = tf.contrib.layers.apply_regularization( tf.contrib.layers.l2_regularizer(0.001), weights_list=self.ce2_params) one_step_losse_1 = tf.reduce_mean( tf.multiply(self.ISWeights, tf.square( self.td_error_1))) * self.lambda_1_step if self.use_TD3: one_step_losse_2 = tf.reduce_mean( tf.multiply(self.ISWeights, tf.square( self.td_error_2))) * self.lambda_1_step if self.use_n_step: n_step_td_losses_1 = tf.reduce_mean( tf.multiply(self.ISWeights, tf.square( self.nstep_td_error_1))) * self.lambda_n_step c_loss_1 = one_step_losse_1 + n_step_td_losses_1 + L2_regular_1 if self.use_TD3: n_step_td_losses_2 = tf.reduce_mean( tf.multiply( self.ISWeights, tf.square( self.nstep_td_error_2))) * self.lambda_n_step c_loss_2 = one_step_losse_2 + n_step_td_losses_2 + L2_regular_2 else: c_loss_1 = one_step_losse_1 + L2_regular_1 if self.use_TD3: c_loss_2 = one_step_losse_2 + L2_regular_2 # actor 的 loss 为 最大化q(s,a) 最小化行为克隆误差. # (只有demo的transition 且 demo的action 比 actor生成的action q_1(s,a)高的时候 才会有克隆误差) with tf.variable_scope('Actor_lose'): Is_worse_than_demo = self.q_1 < self.q_demo Is_worse_than_demo = tf.cast(Is_worse_than_demo, tf.float32) worse_than_demo = tf.cast(tf.reduce_sum(Is_worse_than_demo), tf.int8) # 算action误差 我用的是平方和, 也有人用均方误差 reduce_mean. 其实都可以. # 我的action本来都是很小的数. 
            action_diffs = Is_worse_than_demo * tf.reduce_sum(
                self.come_from_demo * tf.square(self.action - self.action_memory),
                1, keepdims=True)
            L_BC = self.LAMBDA_BC * tf.reduce_sum(action_diffs)
            a_loss = -tf.reduce_mean(self.q_1) + L_BC

        # Set up the optimizers for the actor and critic.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)  # batch-norm parameter updates
        with tf.variable_scope('Critic_Optimizer'):
            if self.use_TD3:
                self.ctrain = tf.group(
                    tf.train.AdamOptimizer(LR_C).minimize(c_loss_1, var_list=self.ce1_params),
                    tf.train.AdamOptimizer(LR_C).minimize(c_loss_2, var_list=self.ce2_params),
                    name='ctrain')
            else:
                self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(
                    c_loss_1, var_list=self.ce1_params)
        with tf.variable_scope('Actor_Optimizer'):
            with tf.control_dependencies(update_ops):
                self.atrain = tf.train.AdamOptimizer(LR_A).minimize(
                    a_loss, var_list=self.ae_params)

        self.sess.run(tf.global_variables_initializer())

        # Initialize the target networks with the evaluation networks' parameters.
        init_a_t = [
            tf.assign(t, e) for t, e in zip(self.at_params, self.ae_params)
        ]
        init_c_t = [
            tf.assign(t, e) for t, e in zip(self.ct1_params, self.ce1_params)
        ]
        if self.use_TD3:
            init_c_t += [
                tf.assign(t, e) for t, e in zip(self.ct2_params, self.ce2_params)
            ]
        self.sess.run(init_a_t)
        self.sess.run(init_c_t)

        # Model saving.
        var_list = [var for var in tf.global_variables() if "moving" in var.name]
        var_list += tf.trainable_variables()
        self.saver = tf.train.Saver(var_list=var_list, max_to_keep=1)
        self.writer = tf.summary.FileWriter("logs/" + self.experiment_name + "/",
                                            self.sess.graph)
        self.a_summary = tf.summary.merge([
            tf.summary.scalar('a_loss', a_loss, family='actor'),
            tf.summary.scalar('L_BC', L_BC, family='actor'),
            tf.summary.scalar('worse_than_demo', worse_than_demo, family='actor')
        ])
        if self.use_TD3:
            self.c_summary = tf.summary.merge([
                tf.summary.scalar('c_loss_1', c_loss_1, family='critic'),
                tf.summary.scalar('c_loss_2', c_loss_2, family='critic')
            ])
        else:
            self.c_summary = tf.summary.merge(
                [tf.summary.scalar('c_loss_1', c_loss_1, family='critic')])

        self.episode_result = tf.placeholder(tf.int8, name='episode_result')
        self.episode_summary = tf.summary.merge([
            tf.summary.scalar('Episode_Result( success or not )',
                              self.episode_result, family='Result')
        ])

    def pi(self, obs):
        obs = obs.astype(dtype=np.float32)
        return self.sess.run(self.action,
                             {self.observe_Input: obs[np.newaxis, :]})[0]

    def Save(self):
        # Save only the weights, not the computation graph.
        self.saver.save(self.sess,
                        save_path="model/" + self.experiment_name + "/model.ckpt",
                        write_meta_graph=False)

    def load(self):
        self.saver.restore(self.sess,
                           save_path="model/" + self.experiment_name + "/model.ckpt")

    def save_episoed_result(self, result, episoed):
        s = self.sess.run(self.episode_summary,
                          feed_dict={self.episode_result: result})
        self.writer.add_summary(s, episoed)

    def learn(self):
        if self.is_prioritiy:
            batch, n_step_batch, percentage = self.memory.sample_rollout(
                batch_size=self.batch_size,
                nsteps=self.n_step_return,
                beta=self.beta,
                gamma=GAMMA)
            self.demo_percent.append(float(percentage))
        else:
            batch = self.memory.sample(batch_size=self.batch_size)

        one_step_target_q = self.sess.run(
            self.q_target,
            feed_dict={
                self.observe_Input_: batch['f_s1'],  # low-dim input
                self.R: batch['rewards'],
                self.terminals1: batch['terminals1'],
                self.f_s_: batch['f_s1']
            })

        if self.use_TD3:
            opt = [self.td_error_1, self.td_error_2, self.ctrain, self.c_summary, self.q_1]
        else:
            opt = [self.td_error_1, self.ctrain, self.c_summary, self.q_1]

        if self.is_prioritiy and self.use_n_step:
            n_step_target_q = self.sess.run(
                self.n_step_target_q,
                feed_dict={
                    self.terminals1: n_step_batch["terminals1"],
                    self.n_step_steps: n_step_batch["step_reached"],
                    self.R: n_step_batch['rewards'],
                    self.observe_Input_: n_step_batch['f_s1'],
                    self.f_s_: n_step_batch['f_s1']
                })
            res = self.sess.run(
                opt,
                feed_dict={
                    self.q_target: one_step_target_q,
                    self.n_step_target_q: n_step_target_q,
                    self.f_s: batch['f_s0'],
                    self.action: batch['actions'],
                    self.ISWeights: batch['weights']
                })
        else:
            res = self.sess.run(
                opt,
                feed_dict={
                    self.q_target: one_step_target_q,
                    self.f_s: batch['f_s0'],
                    self.action: batch['actions'],
                    self.ISWeights: batch['weights']
                })

        if self.use_TD3:
            td_error_1, td_error_2, _, c_s, q_demo = res
            td_error = (td_error_1 + td_error_2) / 2.0
        else:
            td_error, _, c_s, q_demo = res

        # Actor update (delayed, as in TD3).
        if self.policy_delay_iterate % self.policy_delay == 0:
            _, a_s = self.sess.run(
                [self.atrain, self.a_summary], {
                    self.observe_Input: batch['f_s0'],
                    self.q_demo: q_demo,
                    self.f_s: batch['f_s0'],
                    self.come_from_demo: batch['demos'],
                    self.action_memory: batch['actions']
                })
            self.sess.run(self.soft_replace_a)
            self.writer.add_summary(a_s)

        if self.is_prioritiy:
            self.memory.update_priorities(batch['idxes'], td_errors=td_error)

        self.sess.run(self.soft_replace_c)
        self.writer.add_summary(c_s)
        self.policy_delay_iterate += 1

    def store_transition(self, obs0, action, reward, obs1,
                         full_state0, full_state1, terminal1, demo=False):
        obs0 = obs0.astype(np.float32)
        obs1 = obs1.astype(np.float32)
        full_state0 = full_state0.astype(np.float32)
        full_state1 = full_state1.astype(np.float32)
        if demo:
            self.memory.append_demo(obs0=obs0, f_s0=full_state0, action=action,
                                    reward=reward, obs1=obs1, f_s1=full_state1,
                                    terminal1=terminal1)
        else:
            self.memory.append(obs0=obs0, f_s0=full_state0, action=action,
                               reward=reward, obs1=obs1, f_s1=full_state1,
                               terminal1=terminal1)
        # Incrementally update the running mean/std of the observations.
        # self.obs_rms.update(np.array([obs0]))
        # self.obs_rms.update(np.array([obs1]))
        self.state_rms.update(np.array([full_state0]))
        self.state_rms.update(np.array([full_state1]))
        self.pointer += 1

    def build_actor(self, observe_input, scope, trainable, is_training=True):
        bn_a = partial(bn, trainable=trainable, training=is_training)
        fc_a = partial(tf.layers.dense, activation=None, trainable=trainable)
        conv2_a = partial(conv2_, trainable=trainable)
        relu = partial(tf.nn.relu)
        with tf.variable_scope(scope):
            # conv -> BN -> relu
            # net = relu(bn_a(conv2_a(observe_input, 32)))
            # net = relu(bn_a(conv2_a(net, 32)))
            # net = relu(bn_a(conv2_a(net, 64)))
            # net = relu(bn_a(conv2_a(net, 64)))
            # net = relu(bn_a(conv2_a(net, 128)))
            # net = relu(bn_a(conv2_a(net, 128)))
            # net = tf.layers.flatten(net)
            net = observe_input
            net = relu(bn_a(fc_a(net, 128)))
            net = relu(bn_a(fc_a(net, 128)))
            action_output = fc_a(
                net, 4, activation=tf.nn.tanh,
                kernel_initializer=tf.initializers.random_uniform(
                    minval=-0.0003, maxval=0.0003))  # output shape (1, 4)
            action_output = action_output * self.act_limit
            # dx a[0] (-0.05, 0.05)
            # dy a[1] (-0.05, 0.05)
            # dz a[2] (-0.05, 0.05)
            # da a[3] (-pi/2, pi/2)
            return action_output

    def build_critic(self, f_s, a, scope, trainable, is_training=True):
        bn_a = partial(bn, trainable=trainable, training=is_training)
        relu = partial(tf.nn.relu)
        fc_c = partial(tf.layers.dense, activation=None, trainable=trainable)
        with tf.variable_scope(scope):
            net = tf.concat([f_s, a], axis=1)
            net = relu(bn_a(fc_c(net, 128)))
            net = relu(bn_a(fc_c(net, 128)))
            q = fc_c(net, 1,
                     kernel_initializer=tf.initializers.random_uniform(
                         minval=-0.0003, maxval=0.0003))  # Q(s, a), output shape [None, 1]
            return q
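# The agent above combines TD3-style twin critics with a Q-filtered behavior-cloning
# term on demonstration transitions. The snippet below is a minimal NumPy sketch of
# those two ideas (the clipped double-Q target and the Q-filtered BC penalty); names
# such as q_demo, actor_actions and demo_mask are illustrative, not taken from the
# class above.
import numpy as np

def td3_target(rewards, terminals, q1_next, q2_next, gamma=0.99):
    """Clipped double-Q one-step target: use the smaller of the two target critics."""
    min_q_next = np.minimum(q1_next, q2_next)
    return rewards + (1.0 - terminals) * gamma * min_q_next

def q_filtered_bc_loss(q_actor, q_demo, actor_actions, demo_actions, demo_mask, lambda_bc=1.0):
    """Sum-of-squares action error, counted only on demo transitions where the
    demo action currently looks better than the actor's action (the Q-filter)."""
    worse_than_demo = (q_actor < q_demo).astype(np.float32)
    per_sample = np.sum((actor_actions - demo_actions) ** 2, axis=1)
    return lambda_bc * np.sum(worse_than_demo * demo_mask * per_sample)

# Tiny usage example with a batch of 3 transitions and 2-D actions.
rewards = np.array([1.0, 0.0, 0.0])
terminals = np.array([0.0, 0.0, 1.0])
q1_next = np.array([2.0, 1.5, 0.5])
q2_next = np.array([1.8, 1.7, 0.4])
print(td3_target(rewards, terminals, q1_next, q2_next))  # -> [2.782 1.485 0.   ]

q_actor = np.array([1.0, 2.0, 0.3])
q_demo = np.array([1.5, 1.0, 0.6])
actor_actions = np.array([[0.1, 0.0], [0.2, 0.2], [0.0, 0.1]])
demo_actions = np.array([[0.0, 0.0], [0.1, 0.1], [0.1, 0.1]])
demo_mask = np.array([1.0, 1.0, 0.0])  # only the first two came from demonstrations
print(q_filtered_bc_loss(q_actor, q_demo, actor_actions, demo_actions, demo_mask))  # ~0.01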
class DQN(object): def __init__(self): agent_args = Singleton_arger()['agent'] self.critic_lr = agent_args['critic_lr'] self.lr_decay = agent_args['lr_decay'] self.l2_critic = agent_args['l2_critic'] self.batch_size = agent_args['batch_size'] self.discount = agent_args['discount'] self.tau = agent_args['tau'] self.with_cuda = agent_args['with_cuda'] self.buffer_size = int(agent_args['buffer_size']) self.num_update_time = 10 def setup(self, obs_shape, nb_action): self.lr_coef = 1 self.epsilon = 1 self.nb_action = nb_action model_args = Singleton_arger()['model'] qnet = QNet(obs_shape, nb_action) self.qnet = copy.deepcopy(qnet) self.target_qnet = copy.deepcopy(qnet) self.memory = Memory(self.buffer_size, nb_action, self.with_cuda) if self.with_cuda: self.qnet.cuda() self.target_qnet.cuda() self.qnet_optim = Adam(self.qnet.parameters(), lr=self.critic_lr) def reset_noise(self): pass def before_epoch(self): pass def before_cycle(self): pass def before_iter(self): self.epsilon = max((self.epsilon - (1 - 0.01) / 250000), 0.01) def store_transition(self, s_t, a_t, r_t, s_t1, done_t): #s_t = torch.tensor(s_t,dtype = torch.float32,requires_grad = False) self.memory.append(s_t, a_t, r_t, s_t1, done_t) def update_target(self): for target_param, param in zip(self.target_qnet.parameters(), self.qnet.parameters()): target_param.data.copy_(param.data) def update(self): batch = self.memory.sample(self.batch_size) self.qnet_optim.zero_grad() q_eval = self.qnet(batch['obs0']).gather(1, batch['actions']) with torch.no_grad(): _, a_next = self.qnet(batch['obs1']).max(1) q_next = self.target_qnet(batch['obs1']).gather( 1, a_next.unsqueeze(1)) q_target = batch['rewards'] + self.discount * ( 1 - batch['terminals1']) * q_next value_loss = nn.functional.mse_loss(q_eval, q_target) value_loss.backward() self.qnet_optim.step() return value_loss.item() def calc_last_error(self): # Sample batch batch = self.memory.sample_last(self.batch_size) #tensor_obs0 = batch['obs0'] #tensor_obs1 = batch['obs1'] # Prepare for the target q batch with torch.no_grad(): q_eval = self.qnet(batch['obs0']).gather(1, batch['actions']) _, a_next = self.qnet(batch['obs1']).max(1) q_next = self.target_qnet(batch['obs1']).gather( 1, a_next.unsqueeze(1)) q_target = batch['rewards'] + self.discount * ( 1 - batch['terminals1']) * q_next value_loss = nn.functional.mse_loss(q_eval, q_target) return value_loss.item() def apply_lr_decay(self): if self.lr_decay > 0: self.lr_coef = self.lr_decay * self.lr_coef / (self.lr_coef + self.lr_decay) for group in self.qnet_optim.param_groups: group['lr'] = self.critic_lr * self.lr_coef def select_action(self, s_t, apply_noise): if apply_noise and np.random.rand() < self.epsilon: return np.random.random_integers(0, self.nb_action - 1) s_t = torch.tensor(np.expand_dims(np.array(s_t), axis=0), dtype=torch.float32, requires_grad=False) if self.with_cuda: s_t = s_t.cuda() with torch.no_grad(): q_value = self.qnet(s_t) action = np.argmax(q_value.cpu().numpy().squeeze(0)) return action def load_weights(self, output): self.qnet = torch.load('{}/qnet.pkl'.format(output)) def save_model(self, output): torch.save(self.qnet, '{}/qnet.pkl'.format(output)) def get_qnet_buffer(self): qnet_buffer = io.BytesIO() torch.save(self.qnet, qnet_buffer) return qnet_buffer
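# The DQN class above computes its targets Double-DQN style: the online network picks
# the argmax action for the next state, while the target network supplies that action's
# value. A minimal NumPy sketch of that target computation (array names are illustrative):
import numpy as np

def double_dqn_target(rewards, terminals, q_online_next, q_target_next, gamma=0.99):
    """r + gamma * (1 - done) * Q_target(s', argmax_a Q_online(s', a))."""
    best_actions = np.argmax(q_online_next, axis=1)                            # chosen by the online net
    next_values = q_target_next[np.arange(len(best_actions)), best_actions]   # evaluated by the target net
    return rewards + gamma * (1.0 - terminals) * next_values

# Example with a batch of 2 states and 3 actions.
rewards = np.array([0.0, 1.0])
terminals = np.array([0.0, 1.0])
q_online_next = np.array([[0.1, 0.5, 0.2], [0.3, 0.2, 0.9]])
q_target_next = np.array([[0.0, 0.4, 0.3], [0.1, 0.2, 0.8]])
print(double_dqn_target(rewards, terminals, q_online_next, q_target_next))
# -> [0.396 1.   ]  (second entry is terminal, so only the reward remains)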
class DDPG: def __init__(self, env, args): ob_space = env.observation_space goal_dim = env.goal_dim ob_dim = ob_space.shape[0] self.ob_dim = ob_dim self.ac_dim = ac_dim = 7 self.goal_dim = goal_dim self.num_iters = args.num_iters self.random_prob = args.random_prob self.tau = args.tau self.reward_scale = args.reward_scale self.gamma = args.gamma self.log_interval = args.log_interval self.save_interval = args.save_interval self.rollout_steps = args.rollout_steps self.env = env self.batch_size = args.batch_size self.train_steps = args.train_steps self.closest_dist = np.inf self.warmup_iter = args.warmup_iter self.max_grad_norm = args.max_grad_norm self.use_her = args.her self.k_future = args.k_future self.model_dir = os.path.join(args.save_dir, 'model') self.pretrain_dir = args.pretrain_dir os.makedirs(self.model_dir, exist_ok=True) self.global_step = 0 self.actor = Actor(ob_dim=ob_dim, act_dim=ac_dim, hid1_dim=args.hid1_dim, hid2_dim=args.hid2_dim, hid3_dim=args.hid3_dim, init_method=args.init_method) self.critic = Critic(ob_dim=ob_dim, act_dim=ac_dim, hid1_dim=args.hid1_dim, hid2_dim=args.hid2_dim, hid3_dim=args.hid3_dim, init_method=args.init_method) if args.resume or args.test or args.pretrain_dir is not None: self.load_model(args.resume_step, pretrain_dir=args.pretrain_dir) if not args.test: self.actor_target = Actor(ob_dim=ob_dim, act_dim=ac_dim, hid1_dim=args.hid1_dim, hid2_dim=args.hid2_dim, hid3_dim=args.hid3_dim, init_method=args.init_method) self.critic_target = Critic(ob_dim=ob_dim, act_dim=ac_dim, hid1_dim=args.hid1_dim, hid2_dim=args.hid2_dim, hid3_dim=args.hid3_dim, init_method=args.init_method) self.actor_optim = self.construct_optim(self.actor, lr=args.actor_lr) cri_w_decay = args.critic_weight_decay self.critic_optim = self.construct_optim(self.critic, lr=args.critic_lr, weight_decay=cri_w_decay) self.hard_update(self.actor_target, self.actor) self.hard_update(self.critic_target, self.critic) self.actor_target.eval() self.critic_target.eval() if args.noise_type == 'ou_noise': mu = np.zeros(ac_dim) sigma = float(args.ou_noise_std) * np.ones(ac_dim) self.action_noise = OrnsteinUhlenbeckActionNoise(mu=mu, sigma=sigma) elif args.noise_type == 'uniform': low_limit = args.uniform_noise_low high_limit = args.uniform_noise_high dec_step = args.max_noise_dec_step self.action_noise = UniformNoise(low_limit=low_limit, high_limit=high_limit, dec_step=dec_step) elif args.noise_type == 'gaussian': mu = np.zeros(ac_dim) sigma = args.normal_noise_std * np.ones(ac_dim) self.action_noise = NormalActionNoise(mu=mu, sigma=sigma) self.memory = Memory(limit=int(args.memory_limit), action_shape=(int(ac_dim), ), observation_shape=(int(ob_dim), )) self.critic_loss = nn.MSELoss() self.ob_norm = args.ob_norm if self.ob_norm: self.obs_oms = OnlineMeanStd(shape=(1, ob_dim)) else: self.obs_oms = None self.cuda() def test(self, render=False, record=True, slow_t=0): dist, succ_rate = self.rollout(render=render, record=record, slow_t=slow_t) print('Final step distance: ', dist) def train(self): self.net_mode(train=True) tfirststart = time.time() epoch_episode_rewards = deque(maxlen=1) epoch_episode_steps = deque(maxlen=1) total_rollout_steps = 0 for epoch in range(self.global_step, self.num_iters): episode_reward = 0 episode_step = 0 self.action_noise.reset() obs = self.env.reset() obs = obs[0] epoch_actor_losses = [] epoch_critic_losses = [] if self.use_her: ep_experi = { 'obs': [], 'act': [], 'reward': [], 'new_obs': [], 'ach_goals': [], 'done': [] } for t_rollout in range(self.rollout_steps): 
total_rollout_steps += 1 ran = np.random.random(1)[0] if self.pretrain_dir is None and epoch < self.warmup_iter or \ ran < self.random_prob: act = self.random_action().flatten() else: act = self.policy(obs).flatten() new_obs, r, done, info = self.env.step(act) ach_goals = new_obs[1].copy() new_obs = new_obs[0].copy() episode_reward += r episode_step += 1 self.memory.append(obs, act, r * self.reward_scale, new_obs, ach_goals, done) if self.use_her: ep_experi['obs'].append(obs) ep_experi['act'].append(act) ep_experi['reward'].append(r * self.reward_scale) ep_experi['new_obs'].append(new_obs) ep_experi['ach_goals'].append(ach_goals) ep_experi['done'].append(done) if self.ob_norm: self.obs_oms.update(new_obs) obs = new_obs epoch_episode_rewards.append(episode_reward) epoch_episode_steps.append(episode_step) if self.use_her: for t in range(episode_step - self.k_future): ob = ep_experi['obs'][t] act = ep_experi['act'][t] new_ob = ep_experi['new_obs'][t] ach_goal = ep_experi['ach_goals'][t] k_futures = np.random.choice(np.arange( t + 1, episode_step), self.k_future - 1, replace=False) k_futures = np.concatenate((np.array([t]), k_futures)) for future in k_futures: new_goal = ep_experi['ach_goals'][future] her_ob = np.concatenate( (ob[:-self.goal_dim], new_goal), axis=0) her_new_ob = np.concatenate( (new_ob[:-self.goal_dim], new_goal), axis=0) res = self.env.cal_reward(ach_goal.copy(), new_goal, act) her_reward, _, done = res self.memory.append(her_ob, act, her_reward * self.reward_scale, her_new_ob, ach_goal.copy(), done) self.global_step += 1 if epoch >= self.warmup_iter: for t_train in range(self.train_steps): act_loss, cri_loss = self.train_net() epoch_critic_losses.append(cri_loss) epoch_actor_losses.append(act_loss) if epoch % self.log_interval == 0: tnow = time.time() stats = {} if self.ob_norm: stats['ob_oms_mean'] = safemean(self.obs_oms.mean.numpy()) stats['ob_oms_std'] = safemean(self.obs_oms.std.numpy()) stats['total_rollout_steps'] = total_rollout_steps stats['rollout/return'] = safemean( [rew for rew in epoch_episode_rewards]) stats['rollout/ep_steps'] = safemean( [l for l in epoch_episode_steps]) if epoch >= self.warmup_iter: stats['actor_loss'] = np.mean(epoch_actor_losses) stats['critic_loss'] = np.mean(epoch_critic_losses) stats['epoch'] = epoch stats['actor_lr'] = self.actor_optim.param_groups[0]['lr'] stats['critic_lr'] = self.critic_optim.param_groups[0]['lr'] stats['time_elapsed'] = tnow - tfirststart for name, value in stats.items(): logger.logkv(name, value) logger.dumpkvs() if (epoch == 0 or epoch >= self.warmup_iter) and \ self.save_interval and\ epoch % self.save_interval == 0 and \ logger.get_dir(): mean_final_dist, succ_rate = self.rollout() logger.logkv('epoch', epoch) logger.logkv('test/total_rollout_steps', total_rollout_steps) logger.logkv('test/mean_final_dist', mean_final_dist) logger.logkv('test/succ_rate', succ_rate) tra_mean_dist, tra_succ_rate = self.rollout(train_test=True) logger.logkv('train/mean_final_dist', tra_mean_dist) logger.logkv('train/succ_rate', tra_succ_rate) # self.log_model_weights() logger.dumpkvs() if mean_final_dist < self.closest_dist: self.closest_dist = mean_final_dist is_best = True else: is_best = False self.save_model(is_best=is_best, step=self.global_step) def train_net(self): batch_data = self.memory.sample(batch_size=self.batch_size) for key, value in batch_data.items(): batch_data[key] = torch.from_numpy(value) obs0_t = batch_data['obs0'] obs1_t = batch_data['obs1'] obs0_t = self.normalize(obs0_t, self.obs_oms) obs1_t = 
self.normalize(obs1_t, self.obs_oms) obs0 = Variable(obs0_t).float().cuda() with torch.no_grad(): vol_obs1 = Variable(obs1_t).float().cuda() rewards = Variable(batch_data['rewards']).float().cuda() actions = Variable(batch_data['actions']).float().cuda() terminals = Variable(batch_data['terminals1']).float().cuda() cri_q_val = self.critic(obs0, actions) with torch.no_grad(): target_net_act = self.actor_target(vol_obs1) target_net_q_val = self.critic_target(vol_obs1, target_net_act) # target_net_q_val.volatile = False target_q_label = rewards target_q_label += self.gamma * target_net_q_val * (1 - terminals) target_q_label = target_q_label.detach() self.actor.zero_grad() self.critic.zero_grad() cri_loss = self.critic_loss(cri_q_val, target_q_label) cri_loss.backward() if self.max_grad_norm is not None: torch.nn.utils.clip_grad_norm(self.critic.parameters(), self.max_grad_norm) self.critic_optim.step() self.critic.zero_grad() self.actor.zero_grad() net_act = self.actor(obs0) net_q_val = self.critic(obs0, net_act) act_loss = -net_q_val.mean() act_loss.backward() if self.max_grad_norm is not None: torch.nn.utils.clip_grad_norm(self.actor.parameters(), self.max_grad_norm) self.actor_optim.step() self.soft_update(self.actor_target, self.actor, self.tau) self.soft_update(self.critic_target, self.critic, self.tau) return act_loss.cpu().data.numpy(), cri_loss.cpu().data.numpy() def normalize(self, x, stats): if stats is None: return x return (x - stats.mean) / stats.std def denormalize(self, x, stats): if stats is None: return x return x * stats.std + stats.mean def net_mode(self, train=True): if train: self.actor.train() self.critic.train() else: self.actor.eval() self.critic.eval() def load_model(self, step=None, pretrain_dir=None): model_dir = self.model_dir if pretrain_dir is not None: ckpt_file = os.path.join(self.pretrain_dir, 'model_best.pth') else: if step is None: ckpt_file = os.path.join(model_dir, 'model_best.pth') else: ckpt_file = os.path.join(model_dir, 'ckpt_{:08d}.pth'.format(step)) if not os.path.isfile(ckpt_file): raise ValueError("No checkpoint found at '{}'".format(ckpt_file)) mutils.print_yellow('Loading checkpoint {}'.format(ckpt_file)) checkpoint = torch.load(ckpt_file) if pretrain_dir is not None: actor_dict = self.actor.state_dict() critic_dict = self.critic.state_dict() actor_pretrained_dict = { k: v for k, v in checkpoint['actor_state_dict'].items() if k in actor_dict } critic_pretrained_dict = { k: v for k, v in checkpoint['critic_state_dict'].items() if k in critic_dict } actor_dict.update(actor_pretrained_dict) critic_dict.update(critic_pretrained_dict) self.actor.load_state_dict(actor_dict) self.critic.load_state_dict(critic_dict) self.global_step = 0 else: self.actor.load_state_dict(checkpoint['actor_state_dict']) self.critic.load_state_dict(checkpoint['critic_state_dict']) self.global_step = checkpoint['global_step'] if step is None: mutils.print_yellow('Checkpoint step: {}' ''.format(checkpoint['ckpt_step'])) self.warmup_iter += self.global_step mutils.print_yellow('Checkpoint loaded...') def save_model(self, is_best, step=None): if step is None: step = self.global_step ckpt_file = os.path.join(self.model_dir, 'ckpt_{:08d}.pth'.format(step)) data_to_save = { 'ckpt_step': step, 'global_step': self.global_step, 'actor_state_dict': self.actor.state_dict(), 'actor_optimizer': self.actor_optim.state_dict(), 'critic_state_dict': self.critic.state_dict(), 'critic_optimizer': self.critic_optim.state_dict() } mutils.print_yellow('Saving checkpoint: %s' % ckpt_file) 
torch.save(data_to_save, ckpt_file) if is_best: torch.save(data_to_save, os.path.join(self.model_dir, 'model_best.pth')) def rollout(self, train_test=False, render=False, record=False, slow_t=0): test_conditions = self.env.train_test_conditions \ if train_test else self.env.test_conditions done_num = 0 final_dist = [] episode_length = [] for idx in range(test_conditions): if train_test: obs = self.env.train_test_reset(cond=idx) else: obs = self.env.test_reset(cond=idx) for t_rollout in range(self.rollout_steps): obs = obs[0].copy() act = self.policy(obs, stochastic=False).flatten() obs, r, done, info = self.env.step(act) if render: self.env.render() if slow_t > 0: time.sleep(slow_t) if done: done_num += 1 break if record: print('dist: ', info['dist']) final_dist.append(info['dist']) episode_length.append(t_rollout) final_dist = np.array(final_dist) mean_final_dist = np.mean(final_dist) succ_rate = done_num / float(test_conditions) if record: with open('./test_data.json', 'w') as f: json.dump(final_dist.tolist(), f) print('\nDist statistics:') print("Minimum: {0:9.4f} Maximum: {1:9.4f}" "".format(np.min(final_dist), np.max(final_dist))) print("Mean: {0:9.4f}".format(mean_final_dist)) print("Standard Deviation: {0:9.4f}".format(np.std(final_dist))) print("Median: {0:9.4f}".format(np.median(final_dist))) print("First quartile: {0:9.4f}" "".format(np.percentile(final_dist, 25))) print("Third quartile: {0:9.4f}" "".format(np.percentile(final_dist, 75))) print('Success rate:', succ_rate) if render: while True: self.env.render() return mean_final_dist, succ_rate def log_model_weights(self): for name, param in self.actor.named_parameters(): logger.logkv('actor/' + name, param.clone().cpu().data.numpy()) for name, param in self.actor_target.named_parameters(): logger.logkv('actor_target/' + name, param.clone().cpu().data.numpy()) for name, param in self.critic.named_parameters(): logger.logkv('critic/' + name, param.clone().cpu().data.numpy()) for name, param in self.critic_target.named_parameters(): logger.logkv('critic_target/' + name, param.clone().cpu().data.numpy()) def random_action(self): act = np.random.uniform(-1., 1., self.ac_dim) return act def policy(self, obs, stochastic=True): self.actor.eval() ob = Variable(torch.from_numpy(obs)).float().cuda().view(1, -1) act = self.actor(ob) act = act.cpu().data.numpy() if stochastic: act = self.action_noise(act) self.actor.train() return act def cuda(self): self.critic.cuda() self.actor.cuda() if hasattr(self, 'critic_target'): self.critic_target.cuda() self.actor_target.cuda() self.critic_loss.cuda() def construct_optim(self, net, lr, weight_decay=None): if weight_decay is None: weight_decay = 0 params = mutils.add_weight_decay([net], weight_decay=weight_decay) optimizer = optim.Adam(params, lr=lr, weight_decay=weight_decay) return optimizer def soft_update(self, target, source, tau): for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau) def hard_update(self, target, source): for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data)
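# The DDPG trainer above optionally relabels transitions with Hindsight Experience
# Replay (HER): a stored transition keeps its action, but its goal is replaced by a
# goal actually achieved later in the same episode, and the reward is recomputed for
# that new goal. A minimal sketch, assuming the last `goal_dim` entries of the
# observation hold the goal and using a simple distance-based sparse reward (the
# helper name and reward rule are illustrative, not the environment's cal_reward API):
import numpy as np

def relabel_with_future_goal(obs, next_obs, achieved_goals, t, future_t, goal_dim, threshold=0.05):
    """Return a copy of the transition at step t, relabelled with the goal achieved at future_t."""
    new_goal = achieved_goals[future_t]
    her_obs = np.concatenate([obs[t][:-goal_dim], new_goal])
    her_next_obs = np.concatenate([next_obs[t][:-goal_dim], new_goal])
    # Sparse reward: 0 if the goal achieved at step t is close enough to the new goal, else -1.
    her_reward = 0.0 if np.linalg.norm(achieved_goals[t] - new_goal) < threshold else -1.0
    return her_obs, her_next_obs, her_reward

# Toy episode: 1-D achieved goals, observations of the form [position, goal].
obs = np.array([[0.0, 1.0], [0.2, 1.0], [0.5, 1.0]])
next_obs = np.array([[0.2, 1.0], [0.5, 1.0], [0.7, 1.0]])
achieved_goals = np.array([[0.2], [0.5], [0.7]])
print(relabel_with_future_goal(obs, next_obs, achieved_goals, t=0, future_t=2, goal_dim=1))
# -> (array([0. , 0.7]), array([0.2, 0.7]), -1.0)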
episodeReward = reward shortMemory.push(state, action, mask, nextStateNumpy, reward) if done: break else: state = nextState.to(device) if len(memory) > batchSize: for _ in range(updatesPerStep): transition = memory.sample(batchSize) batch = Transition(*zip(*transition)) valueLoss = agent.updateParameters(batch, device) valueLossEp += valueLoss memory.append(shortMemory) rewards.append(episodeReward) if episode % checkEvery == 0: testRewards = [] for _ in range(numberOfTests): state = env.reset() startingPositionPuck = state["achieved_goal"] orginalDistance = np.linalg.norm(startingPositionPuck - desiredGoal) while True: state = stateToTensor(state, desiredGoal).to(device=device) #env.render() action = agent.selectAction(state, useTarget=True) action = action.cpu().numpy() nextState, reward, done, _ = env.step(
class DQNAgent: def __init__(self, sess, env, window_size, input_shape, gamma, batch_size, update_freq, is_duel, is_double, is_per, is_distributional, num_step, is_noisy, learning_rate, train_step): self.sess = sess self.env = env self.per = is_per self.noisy = is_noisy self.dist = is_distributional self.duel = is_duel self.double = is_double self.eps_start = 1.0 self.eps_end = 0.01 self.eps_step = 500000 self.beta_start = 0.4 self.batch_size = batch_size self.gamma = gamma self.n_steps = num_step self.update_freq = update_freq self.learning_rate = learning_rate self.mem_size = 1000000 self.num_actions = env.num_action.n self.train_step = train_step self.input_shape = input_shape self.window_size = window_size self.history = None #np.zeros(shape=(1, self.input_shape[0], self.input_shape[1], self.window_size), dtype=np.uint8) self.state = None self.update_network = False if self.dist: self.num_atoms = 51 else: self.num_atoms = 1 #self, sess, window_size, input_shape, name='dqn',double=True, duel=False, dist=False, noisy=False, trainable=True self.predict_network = DQN(self.sess, window_size, input_shape, self.num_actions, self.num_atoms, name='pred_net', double=self.double, duel=self.duel, dist=self.dist, noisy=self.noisy, trainable=True) self.target_network = DQN(self.sess, window_size, input_shape, self.num_actions, self.num_atoms, name='target_net', double=self.double, duel=self.duel, dist=self.dist, noisy=self.noisy, trainable=True) self.target_network.create_copy_op(self.predict_network) if self.per == 1: self.memory = Memory(self.mem_size, self.n_steps, self.gamma) else: self.memory = deque() with tf.variable_scope('optimizer'): self.targets = tf.placeholder('float32', [None], name='target_q') self.actions = tf.placeholder('int64', [None], name='action') actions_onehot = tf.one_hot(self.actions, self.num_actions, 1.0, 0.0, name='action_onehot') pred_q = tf.reduce_sum(self.predict_network.outputs * actions_onehot, reduction_indices=1, name='q_acted') self.importance_weights = tf.placeholder('float32', [None], name='importance_weights') if self.per: # use importance sampling self.delta = tf.square(self.targets - pred_q, name='squared_error') else: # use huber loss td_error = self.targets - pred_q self.delta = tf.where(tf.abs(td_error) < 1.0, 0.5 * tf.square(td_error), tf.abs(td_error) - 0.5, name='clipped_error') self.loss = tf.reduce_mean(tf.multiply(self.importance_weights, self.delta), name='loss') optimizer = tf.train.AdamOptimizer(self.learning_rate, epsilon=1.5e-4) self.optim = optimizer.minimize(self.loss, global_step=self.train_step) def reset(self): state = self.env.reset() self.history = np.stack((state, state, state, state), axis=2) self.history = np.reshape( [self.history], (self.input_shape[0], self.input_shape[1], self.window_size)) def train(self, episode): self.cnt = self.sess.run(self.train_step) if self.per == 1: beta = min( 1.0, self.beta_start + (1 - self.beta_start) * float(self.cnt) / float(self.eps_step)) samples, weights = self.memory.sample(self.batch_size, beta) else: # random.sample activates in dic or list samples = random.sample(list(self.memory), self.batch_size) weights = np.ones(self.batch_size) batch_s = [] # state batch_r = [] # reward batch_a = [] # action batch_n = [] # next state batch_t = [] # terminal flag if self.per: for i in range(len(samples)): batch_s.append(samples[i][1][0]) batch_r.append(samples[i][1][1]) batch_a.append(samples[i][1][2]) batch_n.append(samples[i][1][3]) batch_t.append(samples[i][1][4]) else: for i in samples: 
batch_s.append(i[0]) batch_r.append(i[1]) batch_a.append(i[2]) batch_n.append(i[3]) batch_t.append(i[4]) batch_s = np.array(batch_s) batch_r = np.array(batch_r) batch_a = np.array(batch_a) batch_n = np.array(batch_n) batch_t = np.array(batch_t) batch_n = np.float32(batch_n / 255.0) batch_s = np.float32(batch_s / 255.0) if self.double: pred_next_max_action = self.predict_network.calc_actions(batch_n) target_next_qmax = self.target_network.calc_outputs_with_idx( batch_n, [[idx, pred_a] for idx, pred_a in enumerate(pred_next_max_action)]) target_q = (1. - batch_t) * self.gamma * target_next_qmax + batch_r # print(batch_r) else: target_next_qmax = self.target_network.calc_max_outputs(batch_n) target_q = (1. - batch_t) * self.gamma * target_next_qmax + batch_r _, q_t, loss, step = self.sess.run( [ self.optim, self.predict_network.outputs, self.loss, self.train_step ], { self.targets: target_q, self.actions: batch_a, self.predict_network.inputs: batch_s, self.importance_weights: weights }) if self.per: for i in range(len(batch_a)): error = abs(target_q[i] - q_t[i][int(batch_a[i])]) self.memory.update(samples[i][0], error) if step % self.update_freq == 0: print(episode, " episode, ", step, "th steps update target network") self.update_network = True self.target_network.run_copy() def get_action(self, history, Training=True): if Training == True: self.cnt = self.sess.run(self.train_step) eps = max(self.eps_end, self.eps_start - float(self.cnt) / float(self.eps_step)) # print('epsilon : ', eps) if np.random.rand() < eps and not self.noisy: # exploration move = np.random.randint(0, self.num_actions) max_q_pred = None else: ob = np.float32(history / 255.0) ob = np.reshape(ob, (1, self.input_shape[0], self.input_shape[1], self.window_size)) move = self.predict_network.calc_actions(ob)[0] max_q_pred = max(self.predict_network.calc_outputs(ob)[0]) else: ob = np.float32(history / 255.0) ob = np.reshape(ob, (1, self.input_shape[0], self.input_shape[1], self.window_size)) move = self.predict_network.calc_actions(ob)[0] max_q_pred = max(self.predict_network.calc_outputs(ob)[0]) return move, max_q_pred def step(self, num_steps, training=True): cumulative_reward = 0 terminal = 0 last_history = self.history last_action = 0 for _ in range(num_steps): action, q_value = self.get_action(self.history, Training=training) next_state, reward, terminal = self.env.step(action, Training=training) #print("reward:", reward) if training: reward = np.clip(reward, -1., 1.) 
self.state = next_state cumulative_reward += reward last_action = action s1 = np.reshape(next_state, (self.input_shape[0], self.input_shape[1], 1)) next_history = np.append(self.history[:, :, 1:], s1, axis=2) self.history = next_history if terminal == True: break return last_history, last_action, cumulative_reward, self.history, q_value, terminal def evaluate(self, num_episode): rewards_list = [] for _ in range(num_episode): cumulative_reward = 0 self.reset() while True: action, q_value = self.get_action(self.history, Training=False) next_state, reward, terminal = self.env.step(action, Training=False) cumulative_reward += reward s1 = np.reshape(next_state, (self.input_shape[0], self.input_shape[1], 1)) next_history = np.append(self.history[:, :, 1:], s1, axis=2) self.history = next_history if terminal == True: break rewards_list.append(cumulative_reward) return np.mean(rewards_list), np.std(rewards_list) #experience:(old_state, reward, action, new_state, Done) def append(self, experience): if self.per == 1: old_state = experience[0] reward = experience[1] action = experience[2] new_state = experience[3] done = experience[4] if self.double: ob = np.float32(new_state / 255.0) observation = np.reshape( ob, (1, self.input_shape[0], self.input_shape[1], self.window_size)) pred_next_max_action = self.predict_network.calc_actions( observation) target_next_qmax = self.target_network.calc_outputs_with_idx( observation, [[idx, pred_a] for idx, pred_a in enumerate(pred_next_max_action)]) target_q = (1. - done) * self.gamma * target_next_qmax + float( reward) else: ob = np.float32(new_state / 255.0) observation = np.reshape( ob, (1, self.input_shape[0], self.input_shape[1], self.window_size)) target_next_qmax = self.target_network.calc_max_outputs( observation) target_q = (1. - done) * self.gamma * target_next_qmax + float( reward) ob_last = np.float32(old_state / 255.0) last_observation = np.reshape( ob_last, (1, self.input_shape[0], self.input_shape[1], self.window_size)) pred_q = self.predict_network.calc_outputs_with_idx( last_observation, [[0, action]]) error = abs(target_q - pred_q) self.memory.add(error[0], experience) else: self.memory.append(experience) if len(self.memory) > self.mem_size: self.memory.popleft()
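# The Atari-style agent above feeds its network a sliding stack of the last
# `window_size` frames: reset() tiles the first frame, and each step drops the oldest
# frame and appends the newest one along the channel axis. A minimal NumPy sketch of
# that bookkeeping (frame shape and window size are illustrative):
import numpy as np

def init_history(frame, window_size=4):
    """Stack the first frame window_size times -> shape (H, W, window_size)."""
    return np.stack([frame] * window_size, axis=2)

def push_frame(history, frame):
    """Drop the oldest frame and append the newest along the last axis."""
    return np.append(history[:, :, 1:], frame[:, :, np.newaxis], axis=2)

frame0 = np.zeros((84, 84), dtype=np.uint8)
history = init_history(frame0)           # (84, 84, 4), all zeros
frame1 = np.ones((84, 84), dtype=np.uint8)
history = push_frame(history, frame1)    # newest channel is now frame1
print(history.shape, history[0, 0])      # (84, 84, 4) [0 0 0 1]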
class Agent: def __init__(self, sess, eps_schedule, lr_schedule): self.dqn_online = Agent._make_dqn('online') self.dqn_target = Agent._make_dqn('target') self.sess = sess self.eps_schedule = eps_schedule self.lr_schedule = lr_schedule self.memory = Memory(MEMORY_SIZE) self.step = 0 def get_action(self, s): if random.random() < self.eps_schedule.get(): return random.randint(0, ACTIONS_DIM - 1) qs = self.dqn_online.predict(np.array([s]), self.sess) a = np.argmax(qs) return a def on_reward(self, s, a, r, s_, done): self.memory.append([s, a, r, s_, done], MAX_WEIGHT) self.step += 1 if self.step % STEPS_TO_TRAIN == 0: self._train() if self.step % STEPS_TO_COPY == 0: self._copy() def _train(self): n = min(self.memory.size, BATCH_SIZE) samples = self.memory.sample_n(n) ss, ss_, ws = [], [], [] for ([s, a, r, s_, done], i, w) in samples: ss.append(s) ss_.append(s_) ws.append([w]) ss, ss_ = np.array(ss), np.array(ss_) qs = self._predict_online(ss) qs_ = self._predict_online(ss_) ts_ = self._predict_target(ss_) ds = [] for i, ([s, a, r, s_, done], _, _) in enumerate(samples): reward = r # There's no need to discount future. if not done: reward += ts_[i][np.argmax(qs_[i])] delta = abs(reward - qs[i][a]) + 0.001 ds.append(delta) qs[i][a] = reward for i, (_, j, _) in enumerate(samples): self.memory.set_delta(j, ds[i]) self.dqn_online.train(ss, qs, ws, self.lr_schedule.get(), self.sess) def _copy(self): return self.dqn_online.copy_to(self.dqn_target, self.sess) def _predict_online(self, ss): return self.dqn_online.predict(ss, self.sess) def _predict_target(self, ss): return self.dqn_target.predict(ss, self.sess) @staticmethod def _make_dqn(name): return DQN(name=name, states_dim=STATES_DIM, actions_dim=ACTIONS_DIM, hidden_layers=HIDDEN_LAYERS, hidden_units=HIDDEN_UNITS)
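# The agent above stores new transitions at maximum priority and, after each update,
# resets each sampled transition's priority to |TD error| + 0.001. Below is a minimal
# NumPy sketch of proportional prioritized sampling with importance-sampling weights
# (the alpha/beta exponents and the flat priority array are illustrative, not this
# Memory class's API):
import numpy as np

def sample_proportional(priorities, batch_size, alpha=0.6, beta=0.4, rng=None):
    """Sample indices with probability proportional to priority**alpha; return IS weights."""
    if rng is None:
        rng = np.random.default_rng(0)
    probs = priorities ** alpha
    probs = probs / probs.sum()
    idxs = rng.choice(len(priorities), size=batch_size, p=probs)
    weights = (len(priorities) * probs[idxs]) ** (-beta)
    weights = weights / weights.max()     # normalize so the largest weight is 1
    return idxs, weights

priorities = np.array([1.0, 0.5, 0.1, 2.0])   # e.g. |TD error| + epsilon per transition
idxs, weights = sample_proportional(priorities, batch_size=3)
print(idxs, weights)

# After training on the batch, priorities are refreshed from the new TD errors:
new_td_errors = np.array([0.3, 0.05, 0.8])
priorities[idxs] = np.abs(new_td_errors) + 0.001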
class DDPG(object):
    def setup_placeholders(self):
        # placeholders
        # Prefixes and suffixes:
        # ob - observation
        # ac - action
        # _no - this tensor should have shape (batch size /n/, observation dim)
        # _na - this tensor should have shape (batch size /n/, action dim)
        # _n - this tensor should have shape (batch size /n/)
        # Prefixing every placeholder this way is good practice: it makes it easy to
        # tell variables and placeholders apart later.
        self.sy_ob_no = tf.placeholder(tf.float32, shape=[None, self.ob_dim], name="ob")
        self.sy_ob_next = tf.placeholder(tf.float32, shape=[None, self.ob_dim], name="ob_next")
        self.terminal_next = tf.placeholder(tf.float32, shape=[None, 1], name="terminal_next")
        self.sy_rewards = tf.placeholder(tf.float32, shape=[None, 1], name="sy_rewards")
        self.sy_critic_targets = tf.placeholder(tf.float32, shape=[None, 1], name="sy_critic_targets")
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')
        # The actions placeholder holds the probabilities over all actions, not just the
        # single chosen action. tensorforce implements it as below, so it presumably also
        # feeds in probabilities:
        # x_actions = tf.reshape(tf.cast(x_actions, dtype=tf.float32), (-1, 1))
        # Known issue: CartPole reports an action shape of () (an empty tuple) even though
        # there are really two actions, which looks like a bug.
        self.sy_actions = tf.placeholder(tf.float32, shape=[None, self.ac_dim], name='actions')

    def setup_network(self):
        # Passing reuse shares the network parameters under the same scope.
        # self.actor_tf is a tensor giving the probability of selecting each action.
        self.actor_tf = build_actor(self.sy_ob_no, self.ac_dim, scope_name='actor')
        # With the default axis=0 the argmax runs along dim 0 and, since there is only one
        # row, would always return [0, 0]. tf.argmax returns a tensor of shape [1];
        # tf.squeeze reduces it to a scalar, otherwise env.step rejects it.
        self.actor_choose_action = tf.squeeze(tf.argmax(self.actor_tf, axis=1))
        # The target actor takes the next observation as input.
        self.target_actor_tf = build_actor(self.sy_ob_next, self.ac_dim, scope_name='target_actor')
        # The critic is fed the actions placeholder. This can be either the action
        # probabilities (the raw actor output) or the action index after argmax.
        self.critic_tf = build_critic(self.sy_ob_no, self.sy_actions, scope_name='critic')
        # Here the critic is fed the actor's action distribution (into its second layer),
        # so actor and critic share part of the graph. critic_tf and critic_with_actor_tf
        # use the same network; only the action input differs.
        self.critic_with_actor_tf = build_critic(self.sy_ob_no, self.actor_tf, scope_name='critic', reuse=True)
        # next_q uses the target actor's output as the action input.
        next_q = build_critic(self.sy_ob_next, self.target_actor_tf, scope_name='target_critic')
        # If terminal_next were a tf.int32 placeholder, the multiplication below would fail.
        self.target_q = self.sy_rewards + (1 - self.terminal_next) * self.gamma * next_q

        # setup var updates
        actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='actor')
        target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_actor')
        critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic')
        target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_critic')
        actor_init_updates, actor_soft_updates = get_target_updates(actor_vars, target_actor_vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(critic_vars, target_critic_vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

        # setup loss
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        # Building AdamOptimizer.minimize roughly doubles the variables under actor_vars
        # (optimizer slots), so the target updates must be set up before the losses.
        self.actor_update_op = tf.train.AdamOptimizer(self.actor_lr).minimize(self.actor_loss)
        self.critic_loss = tf.reduce_mean(tf.square(self.critic_tf - self.sy_critic_targets))
        self.critic_update_op = tf.train.AdamOptimizer(self.critic_lr).minimize(self.critic_loss)

    def __init__(self,
                 env=None,
                 discrete=True,
                 ob_shape=(),
                 ac_dim=0,
                 gamma=1.0,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 logdir=None,
                 normalize_returns=True,
                 # network arguments
                 n_layers=1,
                 size=32,
                 gae_lambda=-1.0,
                 tau=0.001  # parameter update rate
                 ):
        self.gamma = gamma
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.normalize_returns = normalize_returns
        self.n_layers = n_layers
        self.size = size
        self.gae_lambda = gae_lambda
        self.tau = tau

        # Configure output directory for logging
        logz.configure_output_dir(logdir)

        # Log experimental parameters
        # args = inspect.getfullargspec(train_DDPG)[0]
        # locals_ = locals()
        # params = {k: locals_[k] if k in locals_ else None for k in args}
        # logz.save_params(params)

        # Make the gym environment
        self.env = env
        # Is this env continuous, or discrete?
        self.discrete = discrete
        self.ac_dim = ac_dim
        self.ob_dim = ob_shape[0]  # observation_shape in CartPole is (2,), a tuple
        self.memory = Memory(limit=int(1e6), action_shape=ac_dim, observation_shape=ob_shape)
        self.setup_placeholders()
        self.setup_network()

    def sample_action(self, obs, compute_Q=True):
        feed_dict = {self.sy_ob_no: [obs]}
        # The baselines code directly outputs the action probabilities here and multiplies
        # by env.high when calling env.step, which is the continuous-action approach;
        # taking the argmax, as done here, is the discrete-action approach.
        # build_critic(self.sy_ob_no, self.actor_tf, scope_name='critic', reuse=True):
        # the critic is fed the actor's output.
        if compute_Q:
            action, action_prob, q = self.sess.run(
                [self.actor_choose_action, self.actor_tf, self.critic_with_actor_tf],
                feed_dict=feed_dict)
        else:
            action, action_prob = self.sess.run(
                [self.actor_choose_action, self.actor_tf], feed_dict=feed_dict)
            q = None
        # Remove redundant dimensions and clip to [-1, 1].
        action_prob = action_prob.flatten()
        action_prob = np.clip(action_prob, -1., 1.)
        return action, action_prob, q

    def soft_sync_target_actor(self):
        self.sess.run(self.target_soft_updates)

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        self.memory.append(obs0, action, reward, obs1, terminal1)

    # Equivalent to baselines.ddpg.train: performs a single update.
    def update_loss(self):
        batch = self.memory.sample(batch_size=self.batch_size)
        target_Q = self.sess.run(self.target_q, feed_dict={
            self.sy_ob_next: batch['obs1'],
            self.sy_rewards: batch['rewards'],
            self.terminal_next: batch['terminals1'].astype('float32'),
        })
        ops = [self.actor_loss, self.critic_loss, self.actor_update_op, self.critic_update_op]
        actor_loss, critic_loss, _, _ = self.sess.run(ops, feed_dict={
            self.sy_ob_no: batch['obs0'],
            self.sy_actions: batch['actions'],
            self.sy_critic_targets: target_Q,
        })
        return critic_loss, actor_loss

    # The full training loop.
    def train(self,
              seed=0,
              n_iter=100,
              animate=False,
              min_timesteps_per_batch=1000,
              batch_epochs=1,
              batch_size=32,
              max_path_length=None,
              ):
        self.batch_size = batch_size
        start = time.time()

        # Set random seeds
        tf.set_random_seed(seed)
        np.random.seed(seed)

        # Maximum length for episodes
        max_path_length = max_path_length

        tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                                   intra_op_parallelism_threads=1)
        sess = tf.Session(config=tf_config)
        self.sess = sess
        sess.__enter__()  # equivalent to `with sess:`
        tf.global_variables_initializer().run()  # pylint: disable=E1101
        sess.run(self.target_init_updates)
        # todo: use finalize to make sure no new nodes are added to the graph
        # sess.graph.finalize()  # make it read-only, speed up

        # ========================================================================================#
        # Training Loop
        # ========================================================================================#
        # max_action = self.env.action_space.high
        total_timesteps = 0

        for itr in range(n_iter):
            # print('start train itr=%d max_step=%d batch=%d' % (itr, max_path_length, min_timesteps_per_batch))
            # Collect paths until we have enough timesteps.
            # A path ends when the episode terminates or exceeds max_path_length.
            # Each finished path is appended to paths; once the total number of steps in
            # the batch exceeds the requested batch size, collection stops and training
            # begins, so every update sees only complete episodes.
            # A PG-style algorithm samples actions from the current distribution each time,
            # so there is no separate exploration step.
            # TODO: split observation collection and training into two processes so they
            # do not have to wait for each other.
            timesteps_this_batch = 0
            paths = []
            while True:
                ob = self.env.reset()
                # obs, acs, ac_probs, rewards, ob_nexts, dones = [], [], [], [], [], []
                obs, acs, rewards, ob_nexts, dones = [], [], [], [], []
                animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate)
                steps = 0
                while True:
                    if animate_this_episode:
                        self.env.render()
                        time.sleep(0.05)
                    obs.append(ob)
                    # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                    # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    # baselines clips the action to [-1, 1] and then scales it; worth checking
                    # whether that is actually necessary here.
                    if self.discrete:
                        ac, ac_prob, q = self.sample_action(ob, False)
                        acs.append(ac)
                        ob_next, rew, done, _ = self.env.step(ac)
                    else:
                        _, ac_prob, q = self.sample_action(ob, False)
                        # ac_prob = tf.Print(ac_prob, [ac_prob, ac_prob.shape], 'sample action')
                        acs.append(ac_prob)
                        ob_next, rew, done, _ = self.env.step(ac_prob)
                    # ac_probs.append(ac_prob)
                    ob_nexts.append(ob_next)
                    dones.append(done)
                    rewards.append(rew)
                    self.store_transition(ob, ac_prob, rew, ob_next, done)
                    steps += 1
                    if done or steps > max_path_length:
                        break
                path = {"observation": np.array(obs),
                        "reward": np.array(rewards),
                        "action": np.array(acs),
                        "ob_next": np.array(ob_nexts),
                        "done": np.array(dones)}
                paths.append(path)
                timesteps_this_batch += pathlength(path)
                if timesteps_this_batch > min_timesteps_per_batch:
                    break
            total_timesteps += timesteps_this_batch

            # Build arrays for observation, action for the policy gradient update by
            # concatenating across paths
            ob_no = np.concatenate([path["observation"] for path in paths])
            ac_na = np.concatenate([path["action"] for path in paths])

            # todo: train process
            # todo: memory sample in paths
            epoch_actor_losses = []
            epoch_critic_losses = []
            for epoch in range(batch_epochs):
                cl, al = self.update_loss()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                self.soft_sync_target_actor()

            # Log diagnostics
            returns = [path["reward"].sum() for path in paths]
            ep_lengths = [pathlength(path) for path in paths]
            # print('log iter %d' % itr)
            # logz.log_tabular("LossDelta", loss_1 - loss_2)
            logz.log_tabular("Time", time.time() - start)
            logz.log_tabular("Iteration", itr)
            logz.log_tabular("AverageReturn", np.mean(returns))
            logz.log_tabular("StdReturn", np.std(returns))
            logz.log_tabular("MaxReturn", np.max(returns))
            logz.log_tabular("MinReturn", np.min(returns))
            logz.log_tabular("EpLenMean", np.mean(ep_lengths))
            logz.log_tabular("EpLenStd", np.std(ep_lengths))
            logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
            logz.log_tabular("TimestepsSoFar", total_timesteps)
            logz.dump_tabular()
            logz.pickle_tf_vars()