class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, agent_count, state_size, action_size, random_seed):
        """Build the actor/critic pairs and their optimizers.

        Params
        ======
            agent_count (int): number of agents whose joint observations/actions
                the centralized critic consumes
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): seed forwarded to the network constructors
        """
        self.state_size = state_size
        self.action_size = action_size

        # Actor network plus its slow-moving target copy.
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Centralized critic: it sees every agent's state and action, hence the
        # agent_count multiplier on both input sizes.
        self.critic_local = Critic(agent_count * state_size,
                                   agent_count * action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(agent_count * state_size,
                                    agent_count * action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

    def soft_update(self):
        """Blend both target networks toward their local counterparts by TAU."""
        self.soft_update_network(self.critic_local, self.critic_target, TAU)
        self.soft_update_network(self.actor_local, self.actor_target, TAU)

    def soft_update_network(self, local_model, target_model, tau):
        """Soft update model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target.

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for tgt, src in zip(target_model.parameters(), local_model.parameters()):
            # In-place blend; numerically identical to copy_(tau*src + (1-tau)*tgt).
            tgt.data.mul_(1.0 - tau)
            tgt.data.add_(tau * src.data)

    def save(self, name):
        """Persist the local actor/critic weights under the given name prefix."""
        torch.save(self.actor_local.state_dict(), f"{name}_actor.pth")
        torch.save(self.critic_local.state_dict(), f"{name}_critic.pth")

    def load(self, name):
        """Restore the local actor/critic weights saved by `save`."""
        self.actor_local.load_state_dict(torch.load(f"{name}_actor.pth"))
        self.critic_local.load_state_dict(torch.load(f"{name}_critic.pth"))
def train_tsp(args):
    """Train an actor/critic pair on the TSP task, then report test tour length.

    Builds train/validation datasets, constructs the models, optionally resumes
    from a checkpoint, trains (unless args.test), and finally validates on a
    freshly seeded test set.
    """
    # Goals from paper:
    # TSP20, 3.97
    # TSP50, 6.08
    # TSP100, 8.44

    from tasks import tsp
    from tasks.tsp import TSPDataset

    STATIC_SIZE = 2  # (x, y)
    DYNAMIC_SIZE = 1  # dummy for compatibility

    # Distinct seeds keep the train/validation splits independent.
    train_data = TSPDataset(args.num_nodes, args.train_size, args.seed)
    valid_data = TSPDataset(args.num_nodes, args.valid_size, args.seed + 1)

    # TSP has no dynamic state, so no dynamic-update function is needed.
    update_fn = None

    actor = Actor(STATIC_SIZE, DYNAMIC_SIZE, args.hidden_size, update_fn,
                  tsp.update_mask, args.num_layers, args.dropout).to(device)
    critic = Critic(STATIC_SIZE, DYNAMIC_SIZE, args.hidden_size).to(device)

    # NOTE: vars(args) returns args' own __dict__, so the assignments below
    # also mutate the args namespace itself.
    kwargs = vars(args)
    kwargs['train_data'] = train_data
    kwargs['valid_data'] = valid_data
    kwargs['reward_fn'] = tsp.reward
    kwargs['render_fn'] = tsp.render

    if args.checkpoint:
        # Second positional argument of torch.load is map_location.
        path = os.path.join(args.checkpoint, 'actor.pt')
        actor.load_state_dict(torch.load(path, device))
        path = os.path.join(args.checkpoint, 'critic.pt')
        critic.load_state_dict(torch.load(path, device))

    if not args.test:
        train(actor, critic, **kwargs)

    # NOTE(review): the test set is sized with args.train_size (not a dedicated
    # test size) — looks intentional here, but confirm against the caller.
    test_data = TSPDataset(args.num_nodes, args.train_size, args.seed + 2)

    test_dir = 'test'
    test_loader = DataLoader(test_data, args.batch_size, False, num_workers=0)
    out = validate(test_loader, actor, tsp.reward, tsp.render, test_dir, num_plot=5)

    print('Average tour length: ', out)
def init_model(env, model_args, ckpt=None):
    """Build the state normalizer, actor and critic for an environment.

    Params
    ======
        env: environment exposing get_state_size/get_action_size/a_min/a_max
             and a mocap reference-memory provider
        model_args (dict): expects "noise" (action-noise scale) and
             "with_ffc" (whether to keep the reference memory)
        ckpt (str|None): checkpoint path; falls back to the global args.ckpt
             when not given

    Returns
    =======
        (s_norm, actor, critic) tuple of initialized modules.

    Raises
    ======
        Re-raises any error encountered while loading the checkpoint.
    """
    # get input/output size and range
    s_dim = env.get_state_size()
    a_dim = env.get_action_size()
    a_min = env.a_min
    a_max = env.a_max
    a_noise = model_args["noise"] * np.ones(a_dim)

    # get reference memory for FFC; zero it out when FFC is disabled
    ref_mem = env._mocap.get_ref_mem()
    if not model_args["with_ffc"]:
        ref_mem.fill(0)
    ref_mem = ref_mem[:, 1:]  # no phase velocity

    # automatically use gpu
    if use_gpu:
        torch.set_default_tensor_type('torch.cuda.FloatTensor')

    from model import Normalizer, Actor, Critic
    GAMMA = file_args["train_args"]["gamma"]
    non_norm = [0]  # FMD0
    s_norm = Normalizer(s_dim, non_norm)
    actor = Actor(s_dim, a_dim, a_min, a_max, a_noise, ref_mem.shape[0])
    # Critic value scale 1/(1-gamma) bounds the discounted return.
    critic = Critic(s_dim, 0, 1 / (1 - GAMMA))
    actor.set_reference(ref_mem)
    actor.ref_mem.requires_grad = False

    # BUG FIX: the `ckpt` parameter was silently ignored and the global
    # `args.ckpt` used instead; honour the parameter, keeping the global as
    # a backward-compatible fallback.
    if ckpt is None:
        ckpt = args.ckpt
    if ckpt is not None:
        try:
            checkpoint = torch.load(ckpt)
            actor.load_state_dict(checkpoint["actor"])
            critic.load_state_dict(checkpoint["critic"])
            s_norm.load_state_dict(checkpoint["s_norm"])
            print("load from %s" % ckpt)
        except Exception:
            # Re-raise instead of `assert False`: asserts disappear under -O
            # and a bare except would mask the real failure.
            print("fail to load from %s" % ckpt)
            raise

    return s_norm, actor, critic
def main():
    """GAIL training loop on the grid Env with a pre-recorded expert demo."""
    expert_demo = pickle.load(open('./Ree1_expert.p', "rb"))
    # Ree1 : action 1
    # Ree2 : action 100
    # Ree3 : action 50
    # Ree4 : action 10
    # Ree5 : action 4
    # Ree6 : action 0.5
    # print('expert_demo_shape : ', np.array(expert_demo).shape)
    expert_x = int(expert_demo[1][0])
    expert_y = int(expert_demo[1][1])
    env = Env(expert_x, expert_y)
    # env = Env(0,0)
    # env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = 2
    num_actions = 8
    running_state = ZFilter((num_inputs, ), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations
    # expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo[0])
    # print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. \nZfilter N {}".format(running_state.rs.n))

    episodes = 0
    train_discrim_flag = True

    # NOTE: `iteration` (not `iter`) to avoid shadowing the builtin.
    for iteration in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0

            state = running_state(state)

            for _ in range(1000):
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                # BUG FIX: the policy was sampled twice (once for the argmax
                # executed in the env, once for the action stored in memory),
                # so the stored action did not match the executed one. Sample
                # once and derive the discrete action from that sample.
                action = get_action(mu, std)[0]
                action2 = np.argmax(action)
                next_state, reward, done, _ = env.step(action2)
                # next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                mask = 0 if done else 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iteration, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iteration)

        actor.train(), critic.train(), discrim.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim,
                                                    demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" %
                  (expert_acc * 100, learner_acc * 100))
            # temp_learner/temp_expert are module-level accumulators.
            temp_learner.append(learner_acc * 100)
            temp_expert.append(expert_acc * 100)

            if ((expert_acc > args.suspend_accu_exp
                 and learner_acc > args.suspend_accu_gen
                 and iteration % 55 == 0) or iteration % 50 == 0):
                # train_discrim_flag = False
                # NOTE(review): curves accumulate across savefig calls because
                # the figure is never cleared — confirm this is intended.
                plt.plot(temp_learner, label='learner')
                plt.plot(temp_expert, label='expert')
                plt.xlabel('Episode')
                plt.ylabel('Accuracy')
                plt.xticks([])
                plt.legend()
                plt.savefig('accuracy{}.png'.format(iteration))
                # plt.show()

                model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail'
                ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar')
                print("check path", ckpt_path)
                save_checkpoint(
                    {
                        'actor': actor.state_dict(),
                        'critic': critic.state_dict(),
                        'discrim': discrim.state_dict(),
                        'z_filter_n': running_state.rs.n,
                        'z_filter_m': running_state.rs.mean,
                        'z_filter_s': running_state.rs.sum_square,
                        'args': args,
                        'score': score_avg
                    },
                    filename=ckpt_path)

        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        # BUG FIX: `if iteration % 100:` checkpointed on 99 of every 100
        # iterations and skipped the round numbers; intent is every 100th.
        if iteration % 100 == 0:
            score_avg = int(score_avg)
            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            # NOTE(review): the computed model_path is immediately overwritten
            # by a hard-coded Windows path — confirm which one is intended.
            model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail'
            ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint(
                {
                    'actor': actor.state_dict(),
                    'critic': critic.state_dict(),
                    'discrim': discrim.state_dict(),
                    'z_filter_n': running_state.rs.n,
                    'z_filter_m': running_state.rs.mean,
                    'z_filter_s': running_state.rs.sum_square,
                    'args': args,
                    'score': score_avg
                },
                filename=ckpt_path)

    plt.plot(temp_learner)
    plt.plot(temp_expert)
    plt.xlabel('Episode')
    plt.ylabel('Accuracy')
    plt.xticks([])
    plt.savefig('accuracy.png')
class DDPGAgent:
    """DDPG agent: actor/critic pairs with target networks and soft updates."""

    def __init__(self,
                 plot=True,
                 seed=1,
                 env: gym.Env = None,
                 batch_size=128,
                 learning_rate_actor=0.001,
                 learning_rate_critic=0.001,
                 weight_decay=0.01,
                 gamma=0.999):
        """Seed all RNGs and build local/target actor-critic networks.

        Params
        ======
            plot (bool): kept for interface compatibility (unused here)
            seed (int): RNG seed for numpy/torch
            env (gym.Env): environment providing observation/action spaces
            batch_size (int): minibatch size for learning
            learning_rate_actor/critic (float): Adam learning rates
            weight_decay (float): Adam L2 regularization
            gamma (float): discount factor
        """
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = batch_size
        self.learning_rate_actor = learning_rate_actor
        self.learning_rate_critic = learning_rate_critic
        self.weight_decay = weight_decay
        self.gamma = gamma
        self.tau = 0.001  # soft-update interpolation factor
        self._to_tensor = util.to_tensor
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        self.actor = Actor(self.state_dim, self.action_dim).to(self.device)
        self.target_actor = Actor(self.state_dim,
                                  self.action_dim).to(self.device)
        self.actor_optimizer = torch.optim.Adam(
            self.actor.parameters(),
            self.learning_rate_actor,
            weight_decay=self.weight_decay)

        self.critic = Critic(self.state_dim, self.action_dim).to(self.device)
        self.target_critic = Critic(self.state_dim,
                                    self.action_dim).to(self.device)
        self.critic_optimizer = torch.optim.Adam(
            self.critic.parameters(),
            self.learning_rate_critic,
            weight_decay=self.weight_decay)

        # Start targets as exact copies of the local networks.
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.t = 0

    def _learn_from_memory(self, memory):
        """Sample a minibatch from replay memory and update both networks."""
        # Randomly sample transitions from memory.
        trans_pieces = memory.sample(self.batch_size)
        s0 = np.vstack([x.state for x in trans_pieces])
        a0 = np.vstack([x.action for x in trans_pieces])
        r1 = np.vstack([x.reward for x in trans_pieces])
        s1 = np.vstack([x.next_state for x in trans_pieces])
        terminal_batch = np.vstack([x.is_done for x in trans_pieces])

        # ----- Critic update -----
        s1 = self._to_tensor(s1, device=self.device)
        s0 = self._to_tensor(s0, device=self.device)
        next_q_values = self.target_critic.forward(
            state=s1, action=self.target_actor.forward(s1)).detach()

        # BUG FIX (two issues):
        #   1. np.float was removed from NumPy (deprecated 1.20, removed 1.24);
        #      use the builtin float.
        #   2. The bootstrap term was multiplied by `done` instead of
        #      `(1 - done)`, inverting the terminal mask — the Bellman target
        #      is r + gamma * (1 - done) * Q'(s', a').
        not_done = 1.0 - self._to_tensor(terminal_batch.astype(float),
                                         device=self.device)
        target_q_batch = self._to_tensor(r1, device=self.device) + \
            self.gamma * not_done * next_q_values

        q_batch = self.critic.forward(s0,
                                      self._to_tensor(a0, device=self.device))

        # Critic loss and parameter update.
        loss_critic = F.mse_loss(q_batch, target_q_batch)
        self.critic.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        # ----- Actor update: gradient ascent on Q(s, pi(s)) -----
        loss_actor = -self.critic.forward(s0, self.actor.forward(s0))
        loss_actor = loss_actor.mean()
        self.actor.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        # Soft-update the target networks toward the local ones.
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)

        return (loss_critic.item(), loss_actor.item())

    def learning(self, memory):
        """Run one learning step; returns (critic_loss, actor_loss)."""
        self.actor.train()
        return self._learn_from_memory(memory)

    def save_models(self, episode_count):
        """Persist the TARGET networks under ./Models/<episode>_{actor,critic}.pt."""
        torch.save(self.target_actor.state_dict(),
                   './Models/' + str(episode_count) + '_actor.pt')
        torch.save(self.target_critic.state_dict(),
                   './Models/' + str(episode_count) + '_critic.pt')

    def load_models(self, episode):
        """Load local networks from disk and hard-copy them into the targets."""
        self.actor.load_state_dict(
            torch.load('./Models/' + str(episode) + '_actor.pt'))
        self.critic.load_state_dict(
            torch.load('./Models/' + str(episode) + '_critic.pt'))
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        print('Models loaded successfully')
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, seed,
                 fc1=400, fc2=300, update_times=10):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of parallel agents feeding the buffer
            seed (int): random seed
            fc1, fc2 (int): hidden-layer sizes for actor/critic
            update_times (int): learning passes per update step
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.num_agents = num_agents
        self.update_times = update_times

        # One Ornstein-Uhlenbeck process per agent for exploration noise.
        self.noise = []
        for i in range(num_agents):
            self.noise.append(
                rm.OrnsteinUhlenbeckProcess(size=(action_size, ),
                                            std=LinearSchedule(0.2)))

        # Critic local and target network (Q-learning); target starts as a copy.
        self.critic_local = Critic(state_size, action_size, fc1, fc2,
                                   seed).to(device)
        self.critic_target = Critic(state_size, action_size, fc1, fc2,
                                    seed).to(device)
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # Actor local and target network (policy gradient); target starts as a copy.
        self.actor_local = Actor(state_size, action_size, fc1, fc2,
                                 seed).to(device)
        self.actor_target = Actor(state_size, action_size, fc1, fc2,
                                  seed).to(device)
        self.actor_target.load_state_dict(self.actor_local.state_dict())

        # Optimizers for critic and actor networks.
        self.optimizer_critic = optim.Adam(self.critic_local.parameters(),
                                           lr=CRITIC_LR)
        self.optimizer_actor = optim.Adam(self.actor_local.parameters(),
                                          lr=ACTOR_LR)

        # Replay memory.
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Time-step counters (t_step drives the UPDATE_EVERY cadence).
        self.t_step = 0
        self.a_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition per agent, then learn every UPDATE_EVERY steps."""
        for i in range(self.num_agents):
            self.memory.add(state[i], action[i], reward[i], next_state[i],
                            done[i])

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # Only learn once enough samples are buffered.
            if len(self.memory) > BATCH_SIZE:
                for i in range(self.update_times):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state, training=True):
        """Return continuous actions for the given state per current policy.

        Params
        ======
            state (array_like): current states, one row per agent
            training (bool): when True, add per-agent OU exploration noise
        """
        state = torch.from_numpy(state).float().detach().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(state)
        self.actor_local.train()

        actions = actions.cpu().data.numpy()
        # BUG FIX: the `training` flag was ignored and exploration noise was
        # always injected; add noise only while training.
        if training:
            noise = np.array(
                [self.noise[i].sample() for i in range(self.num_agents)])
            actions = actions + noise
        return np.clip(actions, -1, 1)

    def learn(self, experiences, gamma):
        """Update actor/critic from a batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): (s, a, r, s', done) batch
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # --- Critic update: y = r + gamma * Q_target(s', pi_target(s')) * (1-done)
        next_actions = self.actor_target(next_states)
        with torch.no_grad():
            Q_target_next = self.critic_target(next_states, next_actions)

        Q_targets = rewards + (gamma * Q_target_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)

        loss = F.mse_loss(Q_expected, Q_targets.detach())
        self.optimizer_critic.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.optimizer_critic.step()

        # --- Actor update: maximize Q(s, pi(s)) via gradient ascent.
        action_pr = self.actor_local(states)
        p_loss = -self.critic_local(states, action_pr).mean()
        self.optimizer_actor.zero_grad()
        p_loss.backward()
        self.optimizer_actor.step()

        # --- Soft-update target networks.
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target.

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def reset_random(self):
        """Reset every agent's OU noise process to its initial state."""
        for i in range(self.num_agents):
            self.noise[i].reset_states()
class Agent():
    """ Interacts with and learns from the environment """

    def __init__(self, state_size, action_size, num_agents, seed):
        """ Initialize an Agent object

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents to run
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(seed)

        # Actor network (with target network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network (with target network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process (one row of OU noise per agent)
        self.noise = OUNoise((num_agents, action_size), seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        self.steps_counter = 0
        self.train_counter = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save one experience row per agent.
        for i in range(self.num_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def add(self, state, action, reward, next_state, done):
        """ Save a single experience to replay memory """
        self.memory.add(state, action, reward, next_state, done)

    def learn_from_buffer(self, train_counter):
        """Run `train_counter` learning passes off the replay buffer."""
        for i in range(train_counter):
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        """ Returns actions for given states as per current policy """
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        """Reset the OU noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """ Update policy/value parameters using batch of experience tuples

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s,a,r,s',done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ############# Update Critic #############
        # Get predicted next-state actions and Q-values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states: y = r + gamma * Q'(s',a') * (1-done)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # BUG FIX: gradient clipping must run AFTER backward() — previously it
        # was called before backward(), when no gradients existed, so the
        # clipping was a no-op.
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        ############# Update Actor #############
        # Compute actor loss (gradient ascent on Q(s, pi(s)))
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        ############# Update Target Networks #############
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """ Soft update model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (pyTorch model): where weights come from
            target_model (pyTorch model): where weights will go
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def set_target_model(self, actor, critic):
        """Replace both target networks (e.g. for sharing across agents)."""
        self.actor_target = actor
        self.critic_target = critic

    def get_target_model(self):
        """Return the (actor_target, critic_target) pair."""
        return self.actor_target, self.critic_target

    def load_weights(self, actor_path, critic_path):
        """Load actor and critic weights into both local and target networks."""
        # Actor
        self.actor_local.load_state_dict(torch.load(actor_path))
        self.actor_target.load_state_dict(torch.load(actor_path))
        # Critic
        self.critic_local.load_state_dict(torch.load(critic_path))
        self.critic_target.load_state_dict(torch.load(critic_path))
class DDPG(object):
    """DDPG variant with an ensemble of actors sharing a single critic."""

    def __init__(self, nb_status, nb_actions, args):
        """Build actor ensemble, critic, targets, replay buffer and noise.

        Params
        ======
            nb_status (int): raw state dimension (multiplied by window_length)
            nb_actions (int): action dimension
            args: namespace with hidden sizes, rates, buffer size, tau, etc.
        """
        self.num_actor = 3
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete

        # Create Actor and Critic networks.
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn
        }
        self.actors = [Actor(self.nb_status, self.nb_actions)
                       for _ in range(self.num_actor)]
        self.actor_targets = [Actor(self.nb_status, self.nb_actions)
                              for _ in range(self.num_actor)]
        self.actor_optims = [Adam(self.actors[i].parameters(), lr=args.prate)
                             for i in range(self.num_actor)]

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        # Make sure each target starts with the same weights as its source.
        for i in range(self.num_actor):
            hard_update(self.actor_targets[i], self.actors[i])
        hard_update(self.critic_target, self.critic)

        # Create replay buffer.
        self.memory = rpm(args.rmsize)  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters.
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon  # linear exploration decay per step

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        if self.use_cuda:
            self.cuda()

    def update_policy(self, train_actor=True):
        """One optimization step for the critic and each actor in the ensemble.

        Returns (mean negated policy loss, critic value loss).
        """
        # Sample a batch.
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Target Q is averaged over the actor ensemble.
        next_q_values = 0
        for i in range(self.num_actor):
            next_q_values = next_q_values + self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_targets[i](to_tensor(next_state_batch, volatile=True)),
            ])
        next_q_values = next_q_values / self.num_actor
        next_q_values.volatile = False

        # BUG FIX: np.float was removed from NumPy (deprecated 1.20,
        # removed 1.24); the builtin float is the exact equivalent.
        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(float))) * next_q_values

        # Critic update.
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor updates (one gradient-ascent step on Q per ensemble member).
        sum_policy_loss = 0
        for i in range(self.num_actor):
            self.actors[i].zero_grad()

            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actors[i](to_tensor(state_batch))
            ])
            policy_loss = policy_loss.mean()
            policy_loss.backward()
            if train_actor:
                self.actor_optims[i].step()
            sum_policy_loss += policy_loss

            # Target update for this actor.
            soft_update(self.actor_targets[i], self.actors[i], self.tau)

        soft_update(self.critic_target, self.critic, self.tau)

        return -sum_policy_loss / self.num_actor, value_loss

    def cuda(self):
        """Move every network in the ensemble onto the GPU."""
        for i in range(self.num_actor):
            self.actors[i].cuda()
            self.actor_targets[i].cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        """Record the last transition and advance the current state."""
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        """Uniform random action; argmax index when the space is discrete."""
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        """Pick the ensemble action the critic scores highest for state s_t."""
        actions = []
        status = []
        tot_score = []
        for i in range(self.num_actor):
            action = to_numpy(self.actors[i](
                to_tensor(np.array([s_t]), volatile=True))).squeeze(0)
            # Exploration noise is scaled by the decaying epsilon.
            noise_level = noise_level * max(self.epsilon, 0)
            action = action + self.random_process.sample() * noise_level
            status.append(s_t)
            actions.append(action)
            tot_score.append(0.)

        scores = self.critic([to_tensor(np.array(status), volatile=True),
                              to_tensor(np.array(actions), volatile=True)])
        for j in range(self.num_actor):
            tot_score[j] += scores.data[j][0]
        best = np.array(tot_score).argmax()

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = actions[best]
        return actions[best]

    def reset(self, obs):
        """Start a new episode from observation `obs`."""
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=0):
        """Load every actor (and the critic) and mirror weights into targets."""
        if output is None:
            return
        for i in range(self.num_actor):
            actor = self.actors[i]
            actor_target = self.actor_targets[i]
            actor.load_state_dict(
                torch.load('{}/actor{}_{}.pkl'.format(output, num, i)))
            actor_target.load_state_dict(
                torch.load('{}/actor{}_{}.pkl'.format(output, num, i)))
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        """Save all actors and the critic, round-tripping through the CPU."""
        if self.use_cuda:
            for i in range(self.num_actor):
                self.actors[i].cpu()
            # Move the critic once, not once per actor as before.
            self.critic.cpu()
        for i in range(self.num_actor):
            torch.save(self.actors[i].state_dict(),
                       '{}/actor{}_{}.pkl'.format(output, num, i))
        torch.save(self.critic.state_dict(),
                   '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            for i in range(self.num_actor):
                self.actors[i].cuda()
            self.critic.cuda()
class Agent():
    "Single agent no learning algorithm"

    def __init__(self, state_size, action_size, random_seed, lr_actor=1e-4,
                 lr_critic=1e-3, weight_decay=0):
        """Initialize an Agent object

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            lr_actor (float) : learning rate actor network
            lr_critic (float) : learning rate critic network
            weight_decay (float) : weight decay regularizer
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.noise = OUNoise(action_size, random_seed)

        # Actor network and its slow-moving target copy.
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic network and its slow-moving target copy.
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

    def act(self, state, add_noise=True):
        "Returns actions for given state as per current policy"
        if not isinstance(state, torch.Tensor):
            state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        "Reset the OU noise process"
        self.noise.reset()

    def save(self, filename='checkpoint.pth'):
        "Serialize actor and critic weights"
        torch.save({'actor': self.actor_local.state_dict(),
                    'critic': self.critic_local.state_dict()}, filename)

    def load(self, filename, map_location=None):
        "Load weights for actor and critic"
        checkpoint = torch.load(filename, map_location=map_location)
        self.actor_local.load_state_dict(checkpoint['actor'])
        if 'critic' in checkpoint:
            self.critic_local.load_state_dict(checkpoint['critic'])
def main():
    """GAIL training loop on a gym environment with recorded expert demos."""
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model',
                                       str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. \nZfilter N {}".format(running_state.rs.n))

    episodes = 0
    train_discrim_flag = True

    # NOTE: `iteration` (not `iter`) to avoid shadowing the builtin.
    for iteration in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0

            state = running_state(state)

            for _ in range(10000):
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                mask = 0 if done else 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iteration, episodes,
                                                       score_avg))
        writer.add_scalar('log/score', float(score_avg), iteration)

        actor.train(), critic.train(), discrim.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory,
                                                    discrim_optim,
                                                    demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" %
                  (expert_acc * 100, learner_acc * 100))
            # Stop discriminator training once it separates expert/learner
            # well enough.
            if expert_acc > args.suspend_accu_exp and \
                    learner_acc > args.suspend_accu_gen:
                train_discrim_flag = False

        train_actor_critic(actor, critic, memory, actor_optim, critic_optim,
                           args)

        # BUG FIX: `if iteration % 100:` checkpointed on 99 of every 100
        # iterations and skipped the round numbers; intent is every 100th.
        if iteration % 100 == 0:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path,
                                     'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'z_filter_n': running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
class TD3(object):
    """Twin Delayed DDPG (TD3) agent.

    Holds online/target actor-critic networks, a replay memory, and the
    TD3 update rule: clipped double-Q targets, target-policy smoothing
    noise, and delayed (every `policy_freq` steps) policy/target updates.
    Reads module-level hyperparameters: gamma, tau, actor_lr, critic_lr,
    weight_decay, policy_noise, noise_clip, policy_freq.
    """

    def __init__(self, env, writer=None):
        """
        Twin Delayed Deep Deterministic Policy Gradient Algorithm(TD3)
        """
        self.env = env
        self.writer = writer  # optional TensorBoard SummaryWriter
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        self.max_action = env.action_space.high[0]
        # Randomly initialize network parameter
        self.actor = Actor(state_dim, action_dim).to('cuda')
        self.critic = Critic(state_dim, action_dim).to('cuda')
        # Initialize target network parameter (hard copy of the online nets)
        self.target_actor = Actor(state_dim, action_dim).to('cuda')
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic = Critic(state_dim, action_dim).to('cuda')
        self.target_critic.load_state_dict(self.critic.state_dict())
        # Replay memory
        self.memory = ReplayMemory(state_dim, action_dim)
        self.gamma = gamma
        self.tau = tau
        # network parameter optimizer
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr,
                                           weight_decay=weight_decay)

    def get_action(self, state, initial_act=False):
        """Return an exploration action.

        During warm-up (`initial_act=True`) sample uniformly from the action
        space; otherwise use the actor output plus N(0, 0.1) Gaussian noise.
        NOTE(review): the result is clipped to [-1, 1] rather than
        [-max_action, max_action] — assumes a normalized action space; confirm.
        """
        if initial_act:
            return self.env.action_space.sample()
        action = self.actor(torch.from_numpy(state).to('cuda', torch.float))
        action = np.random.normal(0, 0.1) + action.detach().cpu().numpy()
        return np.clip(action, -1, 1)

    def store_transition(self, state, action, state_, reward, done):
        # Thin delegation to the replay memory.
        self.memory.store_transition(state, action, state_, reward, done)

    def soft_update(self, target_net, net):
        """Target parameters soft update"""
        # theta_target <- tau * theta_online + (1 - tau) * theta_target
        for target_param, param in zip(target_net.parameters(),
                                       net.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def update(self, time_step, batch_size=64):
        """Run one TD3 gradient step from a sampled minibatch.

        The critic is updated every call; the actor and the target networks
        only every `policy_freq` calls (delayed policy update).
        """
        states, actions, states_, rewards, terminals = self.memory.sample(
            batch_size)
        # Update Critic
        with torch.no_grad():
            # Target-policy smoothing: clipped Gaussian noise on the target action.
            noise = (torch.randn_like(actions) * policy_noise).clamp(
                -noise_clip, noise_clip)
            actions_ = (self.target_actor(states_) + noise).clamp(
                -self.max_action, self.max_action)
            target_q1, target_q2 = self.target_critic(states_, actions_)
            # Clipped double-Q Bellman target.
            # NOTE(review): multiplies by `terminals` directly — assumes the
            # memory stores a continuation mask (1 - done); TODO confirm.
            y = rewards.unsqueeze(1) + terminals.unsqueeze(
                1) * gamma * torch.min(target_q1, target_q2)
        q1, q2 = self.critic(states, actions)
        critic_loss = F.mse_loss(q1, y) + F.mse_loss(q2, y)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        if self.writer and time_step:  # note: also skips logging at step 0
            self.writer.add_scalar("loss/critic", critic_loss.item(),
                                   time_step)
        # Delayed Policy Update
        if time_step % policy_freq == 0:
            # Update Actor: ascend Q1 of the online critic.
            actor_loss = -1 * self.critic.Q1(states, self.actor(states)).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
            if self.writer:
                self.writer.add_scalar("loss/actor", actor_loss.item(),
                                       time_step)
            # target parameter soft update
            self.soft_update(self.target_actor, self.actor)  # update target actor network
            self.soft_update(self.target_critic, self.critic)  # update target critic network

    def save_model(self, path='models/'):
        # Persist all four networks as separate state_dict files under `path`.
        torch.save(self.actor.state_dict(), path + 'actor')
        torch.save(self.critic.state_dict(), path + 'critic')
        torch.save(self.target_actor.state_dict(), path + 'target_actor')
        torch.save(self.target_critic.state_dict(), path + 'target_critic')

    def load_model(self, path='models/'):
        # Restore the four networks written by save_model().
        self.actor.load_state_dict(torch.load(path + 'actor'))
        self.critic.load_state_dict(torch.load(path + 'critic'))
        self.target_actor.load_state_dict(torch.load(path + 'target_actor'))
        self.target_critic.load_state_dict(torch.load(path + 'target_critic'))
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, memory, random_seed=0,
                 buffer_size=1e5, batch_size=128, gamma=0.99, tau=1e-3,
                 lr_actor=1e-4, lr_critic=1e-3, weight_decay=0):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size  # kept for reference; the buffer itself is injected
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)
        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay)
        # Noise process
        self.noise = OUNoise(action_size, random_seed)
        # Replay memory
        # self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, random_seed)
        self.memory = memory  # injected so it can be shared across agents
        # Iteration
        self.n_learn = 0           # number of learn() calls so far
        self.acc_loss_actor = 0    # accumulated actor loss (debug counter)
        self.acc_loss_critic = 0   # accumulated critic loss (debug counter)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.n_learn += 1
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=False):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        # Switch to eval mode for a deterministic forward pass (e.g. dropout/BN).
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()  # OU exploration noise
        return np.clip(action, -1, 1)

    def reset(self):
        # Reset the OU noise process (typically at episode start).
        self.noise.reset()

    def load_actor(self, model_path):
        # Load the same checkpoint into both the local and target actor.
        checkpoint = torch.load(model_path)
        self.actor_local.load_state_dict(checkpoint)
        self.actor_target.load_state_dict(checkpoint)

    def load_critic(self, model_path):
        # Load the same checkpoint into both the local and target critic.
        checkpoint = torch.load(model_path)
        self.critic_local.load_state_dict(checkpoint)
        self.critic_target.load_state_dict(checkpoint)

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.acc_loss_critic += critic_loss.cpu().data.numpy()
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss (maximize Q by minimizing its negation)
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.acc_loss_actor += actor_loss.cpu().data.numpy()
        # Disabled debug printout of the accumulated losses, kept for reference:
        """
        if self.n_learn % 10 == 0:
            print('\rIter {0}\tActor Loss: {1:.5f}\tCritic Loss: {2:.5f}\t=='.format(self.n_learn, self.acc_loss_actor, self.acc_loss_critic), end="\r")
            self.acc_loss_actor = 0
            self.acc_loss_critic = 0
        """
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class DDPG(object):
    """DDPG agent written against the pre-0.4 PyTorch API
    (it uses `volatile` Variables and the old `clip_grad_norm`).

    Holds online/target actor-critic networks, a replay buffer (`rpm`),
    and an exploration-noise process blended into actions by an epsilon
    that decays linearly by `depsilon` per action selection.
    """

    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length  # stacked observation size
        self.nb_actions = nb_actions
        self.writer = writer
        self.select_time = 0  # counts select_action calls, used as log step

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_method': args.init_method
        }
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon
        # BUGFIX: this assignment was commented out in the original, so the
        # first select_action() call raised AttributeError on `self.epsilon`
        # (it is read in select_action and decremented by depsilon there).
        self.epsilon = 1.0

        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        # if self.use_cuda: self.cuda()

    def update_policy(self, train_actor=True):
        """Run one critic update and one (optional) actor update from a
        sampled batch, then soft-update the target networks.

        Returns (-policy_loss, value_loss) for logging.
        """
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch (`volatile` is the pre-0.4 no-grad marker)
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        next_q_values.volatile = False

        # Bellman target: r + discount * (1 - done) * Q'(s', pi'(s'))
        # (np.float64 instead of the removed np.float alias; same dtype.)
        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float64))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        value_loss = nn.MSELoss()(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update: ascend Q(s, pi(s)) by minimizing its negation
        self.actor.zero_grad()
        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])
        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(),
                                          float(self.clip_actor_grad))
            # NOTE(review): grad-norm logging reconstructed as nested under
            # the clipping branch (it reports the clipped gradients) — confirm.
            if self.writer is not None:
                mean_policy_grad = np.array(np.mean([
                    np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                    for p in self.actor.parameters()
                ]))
                self.writer.add_scalar('train/mean_policy_grad',
                                       mean_policy_grad, self.select_time)

        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        # Put all four networks into evaluation mode.
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        # Put all four networks back into training mode.
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        # Move all four networks onto the GPU.
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        """Record the transition (s_t, a_t, r_t, s_t1, done) and advance s_t."""
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        """Sample a uniform random action in [-1, 1]^nb_actions (warm-up)."""
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        """Select an action for state `s_t`.

        The deterministic policy output is blended with exploration noise:
        `noise_level` is scaled by the current epsilon, then the action is the
        convex combination of policy output and noise, clipped to [-1, 1].
        (`return_fix` is currently unused.)
        """
        self.eval()
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        self.train()

        noise_level = noise_level * max(self.epsilon, 0)
        action = action * (1 - noise_level) + (self.random_process.sample() * noise_level)
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        # Start a new episode: remember the first observation, reset the noise.
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        """Load actor/critic weights (and copy them into the targets) from
        `{output}/actor{num}.pkl` and `{output}/critic{num}.pkl`."""
        if output is None:
            return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        """Save actor/critic weights, temporarily moving them to CPU so the
        pickles are loadable on CPU-only machines."""
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor{}.pkl'.format(output, num)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
class DDPGTrainer(object):
    """Vanilla DDPG trainer: an online/target actor-critic pair with the
    standard critic regression and deterministic-policy-gradient updates."""

    def __init__(self):
        # Online networks plus target copies initialized as exact clones.
        self.actor = Actor().to(device)
        self.actor_target = Actor().to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=0.0001)

        self.critic = Critic().to(device)
        self.critic_target = Critic().to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), weight_decay=1e-2)

        self.loss = torch.nn.MSELoss()

    def _soft_update(self, online_net, target_net, tau):
        # Polyak averaging: theta_target <- tau*theta + (1-tau)*theta_target
        for online_p, target_p in zip(online_net.parameters(), target_net.parameters()):
            target_p.data.copy_(tau * online_p.data + (1 - tau) * target_p.data)

    def train(self, replay_buffer, iterations, batch_size=64, discount=0.99, tau=0.001):
        """Run `iterations` DDPG updates using minibatches from `replay_buffer`."""
        for _ in range(iterations):
            # Draw a minibatch and move it onto the device.
            obs, next_obs, acts, rews, dones = replay_buffer.sample(batch_size)
            state = torch.FloatTensor(obs).to(device)
            action = torch.stack(acts)
            next_state = torch.FloatTensor(next_obs).to(device)
            done = torch.FloatTensor(dones).to(device)
            reward = torch.FloatTensor(rews).to(device)

            # Bellman target from the frozen target networks.
            next_action = torch.cat(self.actor_target(next_state), dim=1)
            target_Q = reward + (done * discount * self.critic_target(next_state, next_action)).detach()

            # Critic: regress the online Q toward the target.
            critic_loss = self.loss(self.critic(state, action), target_Q)
            self.critic_optimizer.zero_grad()
            critic_loss.backward(retain_graph=True)
            self.critic_optimizer.step()

            # Actor: ascend Q(s, pi(s)) by descending its negation.
            actor_loss = -self.critic(state, torch.cat(self.actor(state), dim=1)).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Let the targets slowly track the online networks.
            self._soft_update(self.critic, self.critic_target, tau)
            self._soft_update(self.actor, self.actor_target, tau)
class Agent(object):
    """DDPG agent with online/target policy ("P") and value ("Q") networks,
    a circular replay buffer, and Ornstein-Uhlenbeck exploration noise.
    Actions are clipped to [-2, 2].  Reads module-level hyperparameters:
    LR_A, LR_C, GAMMA, DELTA, SIGMA, OU_A, OU_MU."""

    def __init__(
            self,
            a_dim,
            s_dim,
            a_bound,
    ):
        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound,
        #self.sess = tf.Session()

        # Online and target policy networks (targets start as exact copies).
        self.P_online = Actor(s_dim, a_dim)
        self.P_target = Actor(s_dim, a_dim)
        self.P_target.load_state_dict(self.P_online.state_dict())
        # Online and target value networks.
        self.Q_online = Critic(s_dim, a_dim)
        self.Q_target = Critic(s_dim, a_dim)
        self.Q_target.load_state_dict(self.Q_online.state_dict())
        self.q_optimizer = torch.optim.Adam(self.Q_online.parameters(), lr=LR_C)
        self.p_optimizer = torch.optim.Adam(self.P_online.parameters(), lr=LR_A)
        self.loss_td = nn.MSELoss()
        self.replay_buffer = ReplayBuffer()
        self.batch_size = 32
        self.discrete = False   # continuous action space by default
        self.ep_step = 0        # step counter within the current episode
        # noise
        self.noise = Noise(DELTA, SIGMA, OU_A, OU_MU)  # Initialize noise
        self.ou_level = 0.      # current OU noise level
        self.action_low = -2
        self.action_high = 2

    def act(self, state, test=False):
        """Return an action for `state` with OU exploration noise added.

        NOTE(review): reconstructed control flow — when `test=True` this
        method appears to fall through and return None, which looks
        unintended; confirm against the caller before relying on it.
        """
        if not test:
            with torch.no_grad():
                # boring type casting
                state = ((torch.from_numpy(state)).unsqueeze(0)).float().to('cpu')
                action = self.P_online(state)  # continuous output
                a = action.data.cpu().numpy()
                if self.discrete:
                    action = np.argmax(a)
                    return a, action
                else:
                    # OU noise is only injected during the first 200 episode steps.
                    if self.ep_step < 200:
                        self.ou_level = self.noise.ornstein_uhlenbeck_level(
                            self.ou_level)
                    action = np.clip(a + self.ou_level, self.action_low,
                                     self.action_high)
                return (torch.from_numpy(action)).view(-1)

    def collect_data(self, state, action, reward, next_state, done):
        # Store one transition; everything is converted to batched float tensors.
        self.replay_buffer.push(
            torch.from_numpy(state).float().unsqueeze(0),
            torch.from_numpy(action).float().unsqueeze(0),
            torch.tensor([reward]).float().unsqueeze(0),
            torch.from_numpy(next_state).float().unsqueeze(0),
            torch.tensor([done]).float().unsqueeze(0))

    def clear_data(self):
        raise NotImplementedError("Circular Queue don't need this function")

    def update(self):
        """One DDPG update (critic, then actor, then target soft updates).
        No-op until the buffer holds at least one batch."""
        if len(self.replay_buffer) < self.batch_size:
            return
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size=self.batch_size, device='cpu')
        #===============================Critic Update===============================
        with torch.no_grad():
            # Bellman target from the target networks (no gradient flows here).
            target = rewards + GAMMA * (1 - dones) * self.Q_target(
                (next_states, self.P_target(next_states)))
        Q = self.Q_online((states, actions))
        td_error = self.loss_td(target, Q)
        self.q_optimizer.zero_grad()
        td_error.backward()
        self.q_optimizer.step()
        #===============================Actor Update===============================
        # Maximize Q(s, pi(s)) by minimizing its negation.
        q = self.Q_online((states, self.P_online(states)))
        loss_a = -torch.mean(q)
        self.p_optimizer.zero_grad()
        loss_a.backward()
        self.p_optimizer.step()
        #===============================Target Update===============================
        soft_update(self.Q_target, self.Q_online, tau=1e-2)
        soft_update(self.P_target, self.P_online, tau=1e-2)
class DDPG():
    """Interacts with and learns from the environment."""

    # MADDPG-style agent: the actor/critic see the concatenated observations
    # (state_size * 2) of both agents, and the critic additionally sees both
    # agents' actions (action_size * 2).  `agent_id` selects which slice of
    # the joint action belongs to this agent.

    def __init__(self, state_size, action_size, random_seed, num_agents, agent_id):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.num_agents = num_agents
        self.agent_id = agent_id
        self.eps = EPS_START                        # current noise scale
        self.eps_decay = 1 / (EPS_EP_END * EPOCHS)  # linear decay per learn() call
        self.timestep = 0
        # Actor Network (w/ Target Network) — input is both agents' states.
        self.actor_local = Actor(state_size * 2, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size * 2, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
        # Critic Network (w/ Target Network) — input is both states and both actions.
        self.critic_local = Critic(state_size * 2, action_size * 2, random_seed).to(device)
        self.critic_target = Critic(state_size * 2, action_size * 2, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)
        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.timestep += 1
        # Proportional prioritization: p = (|r| + eps)^alpha
        priority = (abs(reward) + PRIORITY_EPS)**PRIORITY_ALPHA
        self.memory.add(state, action, reward, next_state, done, priority)
        # Only learn every UPDATE_EVERY timesteps.
        if self.timestep % UPDATE_EVERY != 0:
            return
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            for i in range(EPOCHS):
                experiences = self.memory.sample(device)
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise):
        """Returns actions for both agents as per current policy, given their respective states."""
        state = torch.from_numpy(state).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            # get action for each agent and concatenate them
            actions = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        # add noise to actions, scaled by the decaying eps
        if add_noise:
            actions += self.eps * self.noise.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        # Construct next actions vector relative to the agent.
        # NOTE(review): the `2:` / `:2` slices hard-code action_size == 2 —
        # TODO confirm before reusing with a different action dimension.
        if self.agent_id == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)
        # Compute Q targets for current states (y_i)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss (with gradient clipping for stability)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()
        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        # Construct action prediction vector relative to each agent
        if self.agent_id == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        # Compute actor loss (maximize Q by minimizing its negation)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)
        # update noise decay parameter (floored at EPS_FINAL)
        self.eps -= self.eps_decay
        self.eps = max(self.eps, EPS_FINAL)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save(self):
        # Checkpoint local networks, tagged with this agent's id.
        torch.save(self.actor_local.state_dict(),
                   'actor{}.pth'.format(self.agent_id))
        torch.save(self.critic_local.state_dict(),
                   'critic{}.pth'.format(self.agent_id))

    def load(self):
        # Restore the local networks written by save().
        self.actor_local.load_state_dict(
            torch.load('actor{}.pth'.format(self.agent_id)))
        self.critic_local.load_state_dict(
            torch.load('critic{}.pth'.format(self.agent_id)))
class DDPG_Agent:
    # MADDPG-style DDPG agent with a prioritized replay buffer (PER).
    # Each agent keeps its own actor/critic plus targets; the critic sees
    # the joint action of both agents, sliced/reassembled by `self.index`.

    def __init__(self, state_size, action_size, seed, index=0, num_agents=2):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            index (int): Index assigned to the agent
            num_agents (int): Number of agents in the environment
        """
        self.state_size = state_size        # State size
        self.action_size = action_size      # Action size
        self.seed = torch.manual_seed(seed)  # Random seed
        self.index = index                  # Index of this agent, not used at the moment
        self.tau = TAU                      # Parameter for soft weight update
        self.num_updates = N_UPDATES        # Number of updates to perform when updating
        self.num_agents = num_agents        # Number of agents in the environment
        self.tstep = 0                      # Simulation step (modulo (%) UPDATE_EVERY)
        self.gamma = GAMMA                  # Gamma for the reward discount
        self.alpha = ALPHA                  # PER: toggle prioritization (0..1)

        # Set up actor and critic networks
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck noise
        self.noise = OUNoise((1, action_size), seed)

        # Replay buffer
        self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE,
                                              BATCH_SIZE, seed, self.alpha)

    # act and act_targets similar to exercises and MADDPG Lab
    def act(self, states, noise=1.0):
        """Returns actions for given state as per current policy.

        Params
        ======
            state [n_agents, state_size]: current state
            noise (float): control whether or not noise is added
        """
        # Uncomment if state is numpy array instead of tensor
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((1, self.action_size))
        # Put model into evaluation mode
        self.actor_local.eval()
        # Get actions for current state, transformed from probabilities
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        # Put actor back into training mode
        self.actor_local.train()
        # Ornstein-Uhlenbeck noise addition
        actions += noise * self.noise.sample()
        # Transform probability into valid action ranges
        return np.clip(actions, -1, 1)

    def step(self, states, actions, rewards, next_states, dones, beta):
        """Save experience in replay memory, use random samples from buffer to learn.

        PARAMS
        ======
            states:     [n_agents, state_size]  current state
            actions:    [n_agents, action_size] taken action
            rewards:    [n_agents]              earned reward
            next_states:[n_agents, state_size]  next state
            dones:      [n_agents]              Whether episode has finished
            beta:       [0..1]                  PER: toggles correction for importance
                                                weights (0 - no corrections, 1 - full correction)
        """
        # ------------------------------------------------------------------
        # Save experience in replay memory - slightly more effort due to Prioritization
        # We need to calculate priorities for the experience tuple.
        # This is in our case (Q_expected - Q_target)**2
        # -----------------------------------------------------------------
        # Set all networks to evaluation mode
        self.actor_target.eval()
        self.critic_target.eval()
        self.critic_local.eval()
        state = torch.from_numpy(states).float().to(device)
        next_state = torch.from_numpy(next_states).float().to(device)
        action = torch.from_numpy(actions).float().to(device)
        #reward = torch.from_numpy(rewards).float().to(device)
        #done = torch.from_numpy(dones).float().to(device)
        with torch.no_grad():
            # NOTE(review): next actions are computed from `state`, but the
            # analogous code in learn() uses next_states — this looks like it
            # should be self.actor_target(next_state); confirm before changing,
            # since it only affects the initial priority estimate.
            next_actions = self.actor_target(state)
            # This agent's slice of the joint action vector.
            own_action = action[:, self.index * self.action_size:(self.index + 1) *
                                self.action_size]
            if self.index:
                # Agent 1
                next_actions_agent = torch.cat((own_action, next_actions), dim=1)
            else:
                # Agent 0: flipped order
                next_actions_agent = torch.cat((next_actions, own_action), dim=1)
            # Predicted Q value from Critic target network
            Q_targets_next = self.critic_target(next_state, next_actions_agent).float()
            #print(f"Type Q_t_n: {type(Q_targets_next)}")
            #print(f"Type gamma: {type(self.gamma)}")
            #print(f"Type dones: {type(dones)}")
            Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
            Q_expected = self.critic_local(state, action)
            # Use error between Q_expected and Q_targets as priority in buffer
            error = (Q_expected - Q_targets)**2
            self.memory.add(state, action, rewards, next_state, dones, error)
        # Set all networks back to training mode
        self.actor_target.train()
        self.critic_target.train()
        self.critic_local.train()

        # ------------------------------------------------------------------
        # Usual learning procedure
        # -----------------------------------------------------------------
        # Learn every UPDATE_EVERY time steps
        self.tstep = (self.tstep + 1) % UPDATE_EVERY

        # If UPDATE_EVERY and enough samples are available in memory, get random subset and learn
        if self.tstep == 0 and len(self.memory) > BATCH_SIZE:
            for _ in range(self.num_updates):
                experiences = self.memory.sample(beta)
                self.learn(experiences)

    def reset(self):
        """Reset the noise parameter of the agent."""
        self.noise.reset()

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.

        Update according to
            Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))

        According to the lessons:
            actor_target  (state)         gives   action
            critic_target (state, action) gives   Q-value

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of
                states      states visited
                actions     actions taken by all agents
                rewards     rewards received
                next states all next states
                dones       whether or not a final state is reached
                weights     weights of the experiences
                indices     indices of the experiences
        """
        # Load experiences from sample
        states, actions, rewards, next_states, dones, weights_cur, indices = experiences

        # ------------------- update critic ------------------- #
        # Get next actions via actor network
        next_actions = self.actor_target(next_states)
        # Stack action together with action of the agent
        own_actions = actions[:, self.index * self.action_size:(self.index + 1) *
                              self.action_size]
        if self.index:
            # Agent 1
            next_actions_agent = torch.cat((own_actions, next_actions), dim=1)
        else:
            # Agent 0: flipped order
            next_actions_agent = torch.cat((next_actions, own_actions), dim=1)
        # Predicted Q value from Critic target network
        Q_targets_next = self.critic_target(next_states, next_actions_agent)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        Q_expected = self.critic_local(states, actions)
        # Update priorities in ReplayBuffer: importance-weighted squared TD error.
        loss = (Q_expected - Q_targets).pow(2).reshape(
            weights_cur.shape) * weights_cur
        self.memory.update(indices, loss.data.cpu().numpy())
        # Compute critic loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip gradients
        #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), GRAD_CLIPPING)
        self.critic_optimizer.step()

        # ------------------- update actor ------------------- #
        actions_expected = self.actor_local(states)
        # Stack action together with action of the agent
        own_actions = actions[:, self.index * self.action_size:(self.index + 1) *
                              self.action_size]
        if self.index:
            # Agent 1:
            actions_expected_agent = torch.cat((own_actions, actions_expected), dim=1)
        else:
            # Agent 0: flipped order
            actions_expected_agent = torch.cat((actions_expected, own_actions), dim=1)
        # Compute actor loss based on expectation from actions_expected
        actor_loss = -self.critic_local(states, actions_expected_agent).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.target_soft_update(self.critic_local, self.critic_target)
        self.target_soft_update(self.actor_local, self.actor_target)

    def target_soft_update(self, local_model, target_model):
        """Soft update model parameters for actor and critic of all MADDPG agents.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)

    def save(self, filename):
        """Saves the agent to the local workplace

        Params
        ======
            filename (string): where to save the weights
        """
        # Hidden-layer sizes are stored so load_weights() can verify the
        # architecture matches before restoring state dicts.
        checkpoint = {
            'input_size': self.state_size,
            'output_size': self.action_size,
            'actor_hidden_layers': [
                each.out_features for each in self.actor_local.hidden_layers
                if each._get_name() != 'BatchNorm1d'
            ],
            'actor_state_dict': self.actor_local.state_dict(),
            'critic_hidden_layers': [
                each.out_features for each in self.critic_local.hidden_layers
                if each._get_name() != 'BatchNorm1d'
            ],
            'critic_state_dict': self.critic_local.state_dict()
        }
        torch.save(checkpoint, filename)

    def load_weights(self, filename):
        """ Load weights to update agent's actor and critic networks.
        Expected is a format like the one produced by self.save()

        Params
        ======
            filename (string): where to load data from.
        """
        checkpoint = torch.load(filename)
        # Refuse to load if any architecture parameter disagrees with this agent.
        if not checkpoint['input_size'] == self.state_size:
            print(
                f"Error when loading weights from checkpoint (unknown): input size {checkpoint['input_size']} doesn't match state size of agent {self.state_size}"
            )
            return None
        if not checkpoint['output_size'] == self.action_size:
            print(
                f"Error when loading weights from checkpoint (unknown): output size {checkpoint['output_size']} doesn't match action space size of agent {self.action_size}"
            )
            return None
        my_actor_hidden_layers = [
            each.out_features for each in self.actor_local.hidden_layers
            if each._get_name() != 'BatchNorm1d'
        ]
        if not checkpoint['actor_hidden_layers'] == my_actor_hidden_layers:
            print(
                f"Error when loading weights from checkpoint (unknown): actor hidden layers {checkpoint['actor_hidden_layers']} don't match agent's actor hidden layers {my_actor_hidden_layers}"
            )
            return None
        my_critic_hidden_layers = [
            each.out_features for each in self.critic_local.hidden_layers
            if each._get_name() != 'BatchNorm1d'
        ]
        if not checkpoint['critic_hidden_layers'] == my_critic_hidden_layers:
            print(
                f"Error when loading weights from checkpoint (unknown): critic hidden layers {checkpoint['critic_hidden_layers']} don't match agent's critic hidden layers {my_critic_hidden_layers}"
            )
            return None
        self.actor_local.load_state_dict(checkpoint['actor_state_dict'])
        self.critic_local.load_state_dict(checkpoint['critic_state_dict'])
class MiADDPG():
    """Multiple independent Agents trained with DDPG.

    Wraps a set of ``Agent`` instances and optionally lets them share a
    single critic network and/or a single experience replay buffer.
    """

    def __init__(self,
                 num_agents,
                 state_size,
                 action_size,
                 random_seed,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 weight_decay=0,
                 tau=1e-3,
                 gamma=0.99,
                 batch_size=128,
                 buffer_size=int(1e5),
                 share_critic=True,
                 share_buffer=True):
        """Initialize a multi-agent wrapper.

        Params
        ======
            num_agents (int): number of independent agents
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            lr_actor (float): learning rate for each actor
            lr_critic (float): learning rate for the critic(s)
            weight_decay (float): L2 penalty for the critic optimizer
            tau (float): control soft-update
            gamma (float): discount factor
            batch_size (int): size of training batch
            buffer_size (int): cap on number of experiences
            share_critic (bool): if True, all agents share one critic
            share_buffer (bool): if True, all agents share one replay buffer
        """
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.tau = tau
        self.gamma = gamma
        # BUGFIX: forward the configured learning rates / weight decay to the
        # agents instead of hard-coding the defaults.
        self.agents = [
            Agent(state_size,
                  action_size,
                  random_seed,
                  lr_actor=lr_actor,
                  lr_critic=lr_critic,
                  weight_decay=weight_decay) for _ in range(num_agents)
        ]
        self.share_critic = share_critic
        if share_critic:
            # One shared critic replaces the per-agent critics.
            self.critic_local = Critic(state_size, action_size,
                                       random_seed).to(device)
            self.critic_target = Critic(state_size, action_size,
                                        random_seed).to(device)
            self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                               lr=lr_critic,
                                               weight_decay=weight_decay)
            for agent in self.agents:
                agent.critic_local = None
                agent.critic_target = None
                agent.critic_optimizer = None
        self.share_buffer = share_buffer
        # When the buffer is shared only index 0 is ever used, so a single
        # buffer suffices (the old code allocated an unused second one).
        num_buffer = 1 if share_buffer else num_agents
        self.memory = [
            ReplayBuffer(buffer_size, batch_size) for _ in range(num_buffer)
        ]

    def step(self, state, action, reward, next_state, done):
        "Save experience and random sample from buffer to learn"
        # Save experience / reward in replay memory
        for i in range(len(state)):
            # BUGFIX: index with the shared/per-agent buffer index, not `i`
            # (the old code computed `ind` and then ignored it, which raised
            # an IndexError for agents beyond the buffer count).
            ind = 0 if self.share_buffer else i
            self.memory[ind].add(state[i, ...], action[i, ...], reward[i],
                                 next_state[i, ...], done[i])
        # Learn, if enough samples are available in memory.  With a shared
        # critic only one randomly chosen agent updates it per step.
        c_i = random.randint(0, len(self.agents) - 1)
        for i, agent in enumerate(self.agents):
            update_critic = not (self.share_critic and i != c_i)
            ind = 0 if self.share_buffer else i
            if len(self.memory[ind]) < self.batch_size:
                continue
            experiences = self.memory[ind].sample()
            self.learn(agent, experiences, self.gamma, update_critic)

    def act(self, state, add_noise=True):
        "Returns actions for given state as per current policy"
        state = torch.from_numpy(state).float().to(device)
        action_list = []
        for i, agent in enumerate(self.agents):
            action_list.append(agent.act(state[[i], ...]))
        return np.concatenate(action_list, axis=0)

    def load(self, filename, map_location=None):
        "Load weights for actor and critic"
        weights = torch.load(filename, map_location=map_location)
        for i, agent in enumerate(self.agents):
            # BUGFIX: restore into the agents' networks; the old code called
            # load_state_dict on the Agent object itself.
            agent.actor_local.load_state_dict(weights[f'actor_{i}'])
            if self.share_critic:
                continue
            agent.critic_local.load_state_dict(weights[f'critic_{i}'])
        if self.share_critic:
            self.critic_local.load_state_dict(weights['critic'])

    def reset(self):
        # BUGFIX: the wrapper itself has no `self.noise`; delegate to the
        # agents.  NOTE(review): assumes Agent exposes reset() — confirm.
        for agent in self.agents:
            agent.reset()

    def save(self, filename='checkpoint.pth'):
        "Serialize actor and critic weights"
        checkpoint = {}
        for i, agent in enumerate(self.agents):
            checkpoint[f'actor_{i}'] = agent.actor_local.state_dict()
            if not self.share_critic:
                checkpoint[f'critic_{i}'] = agent.critic_local.state_dict()
        if self.share_critic:
            checkpoint['critic'] = self.critic_local.state_dict()
        torch.save(checkpoint, filename)

    def learn(self, agent, experiences, gamma, update_critic=True):
        """Update policy and value parameters with a batch of experiences.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            agent (Agent): agent whose actor (and possibly critic) to update
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done)
            gamma (float): discount factor
            update_critic (bool): if False, skip the critic optimization step
        """
        states, actions, rewards, next_states, dones = experiences
        # Resolve which critic this agent uses (shared or its own).
        critic_target = agent.critic_target
        critic_local = agent.critic_local
        critic_optimizer = agent.critic_optimizer
        if self.share_critic:
            critic_target = self.critic_target
            critic_local = self.critic_local
            critic_optimizer = self.critic_optimizer
        # Update critic
        # Get predicted next-state actions and Q values from target models
        actions_next = agent.actor_target(next_states)
        Q_targets_next = critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        if update_critic:
            critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_optimizer.step()
        # Update actor: maximize the critic's estimate of the policy action.
        actions_pred = agent.actor_local(states)
        actor_loss = -critic_local(states, actions_pred).mean()
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()
        # Update target networks
        soft_update(critic_local, critic_target, self.tau)
        soft_update(agent.actor_local, agent.actor_target, self.tau)
class Academy:
    """Shared critic ("academy") that trains the actors of DDPG agents.

    Owns a local/target critic pair, restores it from a checkpoint file
    when one exists, and updates both the critic and a given actor from
    sampled experience batches.
    """

    def __init__(self, state_size, action_size, random_seed, memory):  # checked
        """Initialize an Academy object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            memory: replay buffer (kept for interface compatibility)
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Resume from a previous checkpoint when one is present on disk.
        self.checkpoint_file_path = './checkpoint_critic.pth'
        if os.path.isfile(self.checkpoint_file_path):
            saved_weights = torch.load(self.checkpoint_file_path)
            self.critic_local.load_state_dict(saved_weights)
            self.critic_target.load_state_dict(saved_weights)

    def step(self, actor, memory):  # unchecked
        """Trigger one learning update when enough samples are buffered."""
        if len(memory) > BATCH_SIZE:
            self.learn(actor, memory.sample(), GAMMA)

    def learn(self, actor, experiences, gamma):  # checked
        """Update actor (policy) and critic (value) from an experience batch.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            actor: object holding actor_local/actor_target/actor_optimizer
            experiences (Tuple[torch.Tensor]): (s, a, r, s', done) tensors
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # --- critic update -------------------------------------------------
        next_actions = actor.actor_target(next_states)
        q_next = self.critic_target(next_states, next_actions)
        q_target = rewards + (gamma * q_next * (1 - dones))
        q_pred = self.critic_local(states, actions)
        loss_critic = F.mse_loss(q_pred, q_target)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        # --- actor update --------------------------------------------------
        predicted_actions = actor.actor_local(states)
        loss_actor = -self.critic_local(states, predicted_actions).mean()
        actor.actor_optimizer.zero_grad()
        loss_actor.backward()
        actor.actor_optimizer.step()

        # --- target networks -----------------------------------------------
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(actor.actor_local, actor.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):  # checked
        """Blend local weights into target weights.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        pairs = zip(target_model.parameters(), local_model.parameters())
        for tgt, src in pairs:
            tgt.data.copy_(tau * src.data + (1.0 - tau) * tgt.data)
class Multi_Agents():
    """ Implements interactions and learning on environments for a set of agents """

    def __init__(self, agents_count, state_size, action_size, random_seed,
                 buffer_size, batch_size, gamma, fc1_units, fc2_units, noise,
                 lr_actor, lr_critic):
        """Initialize a Multi_Agent.

        Params
        ======
            agents_count (int): the number of agents
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            buffer_size (int): replay buffer size
            batch_size (int): size of each training batch
            gamma (float): discount factor
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
            noise (Object): The noise applied to the actions selection
            lr_actor (float): learning rate of the actor
            lr_critic (float): learning rate of the critic
        """
        self.agents_count = agents_count
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.gamma = gamma
        self.batch_size = batch_size

        # Actor network and its target twin (shared by every agent).
        self.actor_local = Actor(state_size, action_size, random_seed,
                                 fc1_units, fc2_units).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed,
                                  fc1_units, fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic network and its target twin.
        self.critic_local = Critic(state_size, action_size, random_seed,
                                   fc1_units, fc2_units).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed,
                                    fc1_units, fc2_units).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=WEIGHT_DECAY)

        # Start both targets from the exact same weights as the local
        # networks (as done in ShangtongZhang's reference implementation,
        # suggested in the course).
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # Exploration noise process and shared replay memory.
        self.noise = noise
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   random_seed)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # One transition per agent goes into the shared buffer.
        for agent_idx in range(self.agents_count):
            self.memory.add(states[agent_idx], actions[agent_idx],
                            rewards[agent_idx], next_states[agent_idx],
                            dones[agent_idx])
        # Train as soon as a full batch can be drawn.
        if len(self.memory) > self.batch_size:
            self.learn(self.memory.sample())

    def act(self, states, add_noise=True):
        """Returns actions for each given state of each agent as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.empty([self.agents_count, self.action_size])
        self.actor_local.eval()
        with torch.no_grad():
            for agent_idx in range(self.agents_count):
                actions[agent_idx] = self.actor_local(
                    states[agent_idx]).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = experiences

        # --- critic update -------------------------------------------------
        next_actions = self.actor_target(next_states)
        q_next = self.critic_target(next_states, next_actions)
        q_target = rewards + (self.gamma * q_next * (1 - dones))
        q_pred = self.critic_local(states, actions)
        loss_critic = F.mse_loss(q_pred, q_target)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        # Gradient clipping, as suggested in the "Benchmark implementation"
        # section of the course.
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # --- actor update --------------------------------------------------
        predicted_actions = self.actor_local(states)
        loss_actor = -self.critic_local(states, predicted_actions).mean()
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        # --- target networks -----------------------------------------------
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for tgt, src in zip(target_model.parameters(),
                            local_model.parameters()):
            tgt.data.copy_(tau * src.data + (1.0 - tau) * tgt.data)
class Agent():
    """Interacts with and learns from the environment.

    TD3-style agent: twin critics with min-Q targets, target-policy
    smoothing noise, delayed actor updates, and a prioritized replay
    buffer (PER) with importance-sampling-weighted losses.
    """

    def __init__(self, state_size, action_size):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.state_size = state_size
        self.action_size = action_size

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Twin Critic Networks (w/ Target Networks): the pair enables the
        # min-Q "error mitigation" performed in learn().
        self.critic1_local = Critic(state_size, action_size).to(device)
        self.critic1_target = Critic(state_size, action_size).to(device)
        self.critic1_optimizer = optim.Adam(self.critic1_local.parameters(),
                                            lr=LR_CRITIC,
                                            weight_decay=WEIGHT_DECAY)
        self.critic2_local = Critic(state_size, action_size).to(device)
        self.critic2_target = Critic(state_size, action_size).to(device)
        self.critic2_optimizer = optim.Adam(self.critic2_local.parameters(),
                                            lr=LR_CRITIC,
                                            weight_decay=WEIGHT_DECAY)

        # Noise process (exploration noise added in act())
        self.noise = OUNoise(action_size)

        # Replay memory (prioritized)
        self.memory = PER(BUFFER_SIZE)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory."""
        # Set reward as initial priority, see:
        # https://jaromiru.com/2016/11/07/lets-make-a-dqn-double-learning-and-prioritized-experience-replay/
        self.memory.add((state, action, reward, next_state, done), reward)

    def act(self, state):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        # Add exploration noise, then clip to the valid action range.
        action += self.noise.sample()
        return np.clip(action, -1., 1.)

    def reset(self):
        # Restart the OU noise process (e.g. at episode boundaries).
        self.noise.reset()

    def mse(self, expected, targets, is_weights):
        """Custom loss function that takes into account the importance-sampling weights.

        Returns the mean of is_weights * (expected - targets)^2.
        """
        td_error = expected - targets
        weighted_squared_error = is_weights * td_error * td_error
        return torch.sum(weighted_squared_error) / torch.numel(
            weighted_squared_error)

    def learn(self):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        """
        # LEARN_BATCH sampled mini-batch updates per call; the actor (and the
        # target networks) only update every UPDATE_ACTOR_EVERY-th iteration
        # (delayed policy updates).
        for i in range(1, LEARN_BATCH + 1):
            # Draw indices, transitions and importance-sampling weights
            # from the prioritized replay buffer.
            idxs, experiences, is_weights = self.memory.sample(BATCH_SIZE)
            states = torch.from_numpy(
                np.vstack([e[0] for e in experiences
                           if e is not None])).float().to(device)
            actions = torch.from_numpy(
                np.vstack([e[1] for e in experiences
                           if e is not None])).float().to(device)
            rewards = torch.from_numpy(
                np.vstack([e[2] for e in experiences
                           if e is not None])).float().to(device)
            next_states = torch.from_numpy(
                np.vstack([e[3] for e in experiences
                           if e is not None])).float().to(device)
            dones = torch.from_numpy(
                np.vstack([e[4] for e in experiences if e is not None
                           ]).astype(np.uint8)).float().to(device)
            is_weights = torch.from_numpy(is_weights).float().to(device)

            # ---------------------------- update critic ---------------------------- #
            # Target Policy Smoothing Regularization: add a small amount of clipped random noises to the selected action
            if POLICY_NOISE > 0.0:
                noise = torch.empty_like(actions).data.normal_(
                    0, POLICY_NOISE).to(device)
                noise = noise.clamp(-POLICY_NOISE_CLIP, POLICY_NOISE_CLIP)
                # Get predicted next-state actions and Q values from target models
                actions_next = (self.actor_target(next_states) + noise).clamp(
                    -1., 1.)
            else:
                # Get predicted next-state actions and Q values from target models
                actions_next = self.actor_target(next_states)

            # Error Mitigation: element-wise minimum of the two target
            # critics curbs Q-value overestimation.
            Q_targets_next = torch.min(\
                self.critic1_target(next_states, actions_next), \
                self.critic2_target(next_states, actions_next)).detach()
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))
            # Compute critic1 loss (importance-sampling-weighted MSE)
            Q_expected = self.critic1_local(states, actions)
            # Absolute TD errors become the new priorities for this batch.
            errors1 = np.abs((Q_expected - Q_targets).detach().cpu().numpy())
            critic1_loss = self.mse(Q_expected, Q_targets, is_weights)
            # Minimize the loss
            self.critic1_optimizer.zero_grad()
            critic1_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.critic1_local.parameters(), 1)
            self.critic1_optimizer.step()

            # Update priorities in the replay buffer
            self.memory.batch_update(idxs, errors1)

            # Compute critic2 loss
            Q_expected = self.critic2_local(states, actions)
            critic2_loss = self.mse(Q_expected, Q_targets, is_weights)
            # Minimize the loss
            self.critic2_optimizer.zero_grad()
            critic2_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.critic2_local.parameters(), 1)
            self.critic2_optimizer.step()

            # Delayed Policy Updates
            if i % UPDATE_ACTOR_EVERY == 0:
                # ---------------------------- update actor ---------------------------- #
                # Compute actor loss (ascend critic1's Q estimate)
                actions_pred = self.actor_local(states)
                actor_loss = -self.critic1_local(states, actions_pred).mean()
                # Minimize the loss
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # ----------------------- update target networks ----------------------- #
                self.soft_update(self.critic1_local, self.critic1_target, TAU)
                self.soft_update(self.critic2_local, self.critic2_target, TAU)
                self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_weights(self):
        # NOTE(review): the *_weights_file names are module-level globals —
        # confirm they are defined before these methods are called.
        torch.save(self.actor_local.state_dict(), actor_weights_file)
        torch.save(self.critic1_local.state_dict(), critic1_weights_file)
        torch.save(self.critic2_local.state_dict(), critic2_weights_file)

    def load_weights(self):
        self.actor_local.load_state_dict(torch.load(actor_weights_file))
        self.critic1_local.load_state_dict(torch.load(critic1_weights_file))
        self.critic2_local.load_state_dict(torch.load(critic2_weights_file))
class DDPG(object):
    """DDPG variant with an ensemble of actors sharing a single critic.

    Keeps ``num_actor`` actor/actor-target pairs; the critic scores every
    actor's proposal and ``select_action`` returns the best-scoring one.
    """

    def __init__(self, nb_status, nb_actions, args):
        self.num_actor = 3
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        if self.pic:
            self.nb_status = args.pic_status

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn
        }
        if args.pic:
            self.cnn = CNN(3, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actors = [
            Actor(self.nb_status, self.nb_actions)
            for _ in range(self.num_actor)
        ]
        self.actor_targets = [
            Actor(self.nb_status, self.nb_actions)
            for _ in range(self.num_actor)
        ]
        self.actor_optims = [
            Adam(self.actors[i].parameters(), lr=args.prate)
            for i in range(self.num_actor)
        ]
        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        for i in range(self.num_actor):
            hard_update(
                self.actor_targets[i],
                self.actors[i])  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.memory = rpm(
            args.rmsize
        )  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        if self.use_cuda: self.cuda()

    def normalize(self, pic):
        # Transpose an array laid out (H, W, C) to (C, H, W);
        # presumably an image for the CNN input — TODO confirm.
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self, train_actor=True):
        """One optimization step: critic from TD targets, then each actor
        against the shared critic, then soft-update all target networks.

        Returns (mean actor objective, critic value loss).
        """
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            print('label 1')
            print('size = ', state_batch.shape)
            state_batch = self.cnn(state_batch)
            print('label 2')
            next_state_batch = np.array(
                [self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn(next_state_batch)
            # BUGFIX: this ensemble has no self.actor_target — use a randomly
            # chosen target actor, mirroring the non-pic branch.
            index = np.random.randint(low=0, high=self.num_actor)
            next_q_values = self.critic_target([
                next_state_batch,
                self.actor_targets[index](next_state_batch)
            ])
        else:
            index = np.random.randint(low=0, high=self.num_actor)
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_targets[index](to_tensor(next_state_batch,
                                                    volatile=True)),
            ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        # np.float64 replaces the alias np.float, removed in numpy >= 1.24.
        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float64))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic:
            self.cnn.zero_grad()
        if self.pic:
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic(
                [to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic:
            self.cnn_optim.step()

        # Actor updates: each ensemble member ascends the critic's score.
        # NOTE(review): in pic mode state_batch is already a tensor here, yet
        # to_tensor() is applied below exactly as in the original — verify
        # the pic code path before relying on it.
        sum_policy_loss = 0
        for i in range(self.num_actor):
            self.actors[i].zero_grad()
            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actors[i](to_tensor(state_batch))
            ])
            policy_loss = policy_loss.mean()
            policy_loss.backward()
            if train_actor:
                self.actor_optims[i].step()
            sum_policy_loss += policy_loss
            # Target update
            soft_update(self.actor_targets[i], self.actors[i], self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -sum_policy_loss / self.num_actor, value_loss

    def eval(self):
        # BUGFIX: this class keeps lists of actors (self.actors), not a
        # single self.actor — the old code raised AttributeError.
        for i in range(self.num_actor):
            self.actors[i].eval()
            self.actor_targets[i].eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        # BUGFIX: same AttributeError fix as eval().
        for i in range(self.num_actor):
            self.actors[i].train()
            self.actor_targets[i].train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        for i in range(self.num_actor):
            self.actors[i].cuda()
            self.actor_targets[i].cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        """Store the latest transition and advance the current state."""
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        """Sample a uniform random action (argmax index when discrete)."""
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def select_action(self,
                      s_t,
                      decay_epsilon=True,
                      return_fix=False,
                      noise_level=0):
        """Ask every actor for an action, score all proposals with the
        critic, and return (and remember) the highest-scoring action."""
        actions = []
        status = []
        tot_score = []
        # BUGFIX: scale the noise once; the old code re-multiplied
        # noise_level by epsilon inside the loop, so actor i received
        # noise_level * epsilon**(i+1) instead of a uniform level.
        noise_level = noise_level * max(self.epsilon, 0)
        for i in range(self.num_actor):
            action = to_numpy(self.actors[i](to_tensor(
                np.array([s_t]), volatile=True))).squeeze(0)
            action = action + self.random_process.sample() * noise_level
            status.append(s_t)
            actions.append(action)
            tot_score.append(0.)
        scores = self.critic([
            to_tensor(np.array(status), volatile=True),
            to_tensor(np.array(actions), volatile=True)
        ])
        for j in range(self.num_actor):
            tot_score[j] += scores.data[j][0]
        best = np.array(tot_score).argmax()
        if decay_epsilon:
            self.epsilon -= self.depsilon
        self.a_t = actions[best]
        return actions[best]

    def reset(self, obs):
        """Start a new episode: set the current state and reset the noise."""
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=0):
        # BUGFIX: load every ensemble actor (the old code used the
        # nonexistent self.actor).  File names now carry the actor index.
        if output is None: return
        for i in range(self.num_actor):
            path = '{}/actor{}_{}.pkl'.format(output, num, i)
            self.actors[i].load_state_dict(torch.load(path))
            self.actor_targets[i].load_state_dict(torch.load(path))
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        # BUGFIX: save every ensemble actor; move to CPU first when on CUDA.
        if self.use_cuda:
            for actor in self.actors:
                actor.cpu()
            self.critic.cpu()
        for i, actor in enumerate(self.actors):
            torch.save(actor.state_dict(),
                       '{}/actor{}_{}.pkl'.format(output, num, i))
        torch.save(self.critic.state_dict(),
                   '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            for actor in self.actors:
                actor.cuda()
            self.critic.cuda()
'valid_critic_loss': 0.0, 'valid_generator_loss': 0.0 } train_loader, valid_loader, test_loader = svhn_sampler(data_root, train_batch_size, test_batch_size) train_loader, valid_loader, test_loader = repeater(train_loader), repeater(valid_loader), repeater(test_loader) train_iter, valid_iter, test_iter = iter(train_loader), iter(valid_loader), iter(test_loader) generator = Generator(z_dim=z_dim).to(device) critic = Critic().to(device) optim_critic = optim.Adam(critic.parameters(), lr=lr, betas=(beta1, beta2)) optim_generator = optim.Adam(generator.parameters(), lr=lr, betas=(beta1, beta2)) checkpoint = torch.load('save.tar') critic.load_state_dict(checkpoint['critic']) generator.load_state_dict(checkpoint['generator']) optim_critic.load_state_dict(checkpoint['optim_critic']) optim_generator.load_state_dict(checkpoint['optim_generator']) for i in range(n_iter*n_critic_updates): generator.train() critic.train() # update critic x = next(train_iter)[0].to(device) noise = torch.randn(train_batch_size, z_dim).to(device) y = generator(noise).detach() optim_critic.zero_grad() score = (-distances.vf_wasserstein_distance(x, y, critic))
class Agent:
    """Interacts with and learns from the environment.

    DDPG agent with optional action-space noise (ASN), parameter-space
    noise (PSN) and prioritized experience replay (PER).
    """

    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size=int(1e5),
                 batch_size=256,
                 learn_every=1,
                 update_every=1,
                 gamma=0.99,
                 tau=0.02,
                 lr_actor=2e-4,
                 lr_critic=2e-3,
                 random_seed=None,
                 use_asn=True,
                 asn_kwargs={},
                 use_psn=False,
                 psn_kwargs={},
                 use_per=False,
                 restore=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            buffer_size (int): replay buffer capacity
            batch_size (int): mini-batch size for learning
            learn_every (int): learn after this many environment steps
            update_every (int): soft-update targets after this many steps
            gamma (float): discount factor
            tau (float): soft-update interpolation factor
            lr_actor (float): actor learning rate
            lr_critic (float): critic learning rate
            random_seed (int): random seed (optional)
            use_asn / use_psn / use_per (bool): feature toggles
            asn_kwargs / psn_kwargs (dict): noise-process options
            restore (str): checkpoint path to restore networks from
        """
        self.state_size = state_size
        self.action_size = action_size
        self.update_every = update_every
        self.learn_every = learn_every
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        # Keep track of how many times we've updated weights
        self.i_updates = 0
        self.i_step = 0
        self.use_asn = use_asn
        self.use_psn = use_psn
        self.use_per = use_per
        if random_seed is not None:
            random.seed(random_seed)
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        if self.use_psn:
            self.actor_perturbed = Actor(state_size, action_size).to(device)
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        # restore networks if needed
        if restore is not None:
            checkpoint = torch.load(restore, map_location=device)
            self.actor_local.load_state_dict(checkpoint[0]['actor'])
            self.actor_target.load_state_dict(checkpoint[0]['actor'])
            if self.use_psn:
                self.actor_perturbed.load_state_dict(checkpoint[0]['actor'])
            self.critic_local.load_state_dict(checkpoint[0]['critic'])
            self.critic_target.load_state_dict(checkpoint[0]['critic'])
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic)
        # Hard copy weights from local to target networks
        policy_update(self.actor_local, self.actor_target, 1.0)
        policy_update(self.critic_local, self.critic_target, 1.0)
        # Noise process
        if self.use_asn:
            self.action_noise = OUNoise(action_size, **asn_kwargs)
        if self.use_psn:
            self.param_noise = ParameterSpaceNoise(**psn_kwargs)
        if self.use_per:
            self.buffer = PrioritizedExperienceReplay(buffer_size, batch_size,
                                                      random_seed)
        else:
            self.buffer = ExperienceReplay(buffer_size, batch_size,
                                           random_seed)

    def act(self, states, perturb_mode=True, train_mode=True):
        """Returns actions for given state as per current policy."""
        if not train_mode:
            self.actor_local.eval()
            if self.use_psn:
                self.actor_perturbed.eval()
        with torch.no_grad():
            states = torch.from_numpy(states).float().to(device)
            actor = self.actor_perturbed if (
                self.use_psn and perturb_mode) else self.actor_local
            actions = actor(states).cpu().numpy()[0]
        # BUGFIX: only add action noise when the ASN process exists —
        # the old code crashed with use_asn=False in train mode.
        if train_mode and self.use_asn:
            actions += self.action_noise.sample()
        self.actor_local.train()
        if self.use_psn:
            self.actor_perturbed.train()
        return np.clip(actions, -1, 1)

    def perturb_actor_parameters(self):
        """Apply parameter space noise to actor model, for exploration"""
        policy_update(self.actor_local, self.actor_perturbed, 1.0)
        params = self.actor_perturbed.state_dict()
        for name in params:
            if 'ln' in name:
                # BUGFIX: was `pass`, which fell through and perturbed
                # layer-norm parameters too; skip them instead.
                continue
            param = params[name]
            # Renamed from `random` to avoid shadowing the random module.
            noise = torch.randn(param.shape)
            if use_cuda:
                noise = noise.cuda()
            param += noise * self.param_noise.current_stddev

    def reset(self):
        """Reset the noise processes (call at episode start)."""
        # BUGFIX: action_noise only exists when use_asn is set.
        if self.use_asn:
            self.action_noise.reset()
        if self.use_psn:
            self.perturb_actor_parameters()

    def step(self, experience, priority=0.0):
        """Buffer one transition; learn / soft-update on schedule."""
        self.buffer.push(experience)
        self.i_step += 1
        if len(self.buffer) > self.batch_size:
            if self.i_step % self.learn_every == 0:
                self.learn(priority)
            if self.i_step % self.update_every == 0:
                self.update(
                )  # soft update the target network towards the actual networks

    def learn(self, priority=0.0):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            priority (float): forwarded to the PER buffer's sample() when
                PER is enabled — exact semantics defined by the buffer.
        """
        if self.use_per:
            (states, actions, rewards, states_next,
             dones), batch_idx = self.buffer.sample(priority)
        else:
            states, actions, rewards, states_next, dones = self.buffer.sample()

        # Get predicted next-state actions and Q values from target models
        with torch.no_grad():
            actions_next = self.actor_target(states_next)
            Q_targets_next = self.critic_target(states_next, actions_next)
            Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # ---------------------------- update critic ---------------------------- #
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.smooth_l1_loss(Q_expected, Q_targets)
        self.critic_local.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_local.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        if self.use_per:
            Q_error = Q_expected - Q_targets
            # BUGFIX: move to host memory before converting to numpy so this
            # also works when the networks live on a CUDA device.
            new_deltas = torch.abs(Q_error.detach().squeeze(1)).cpu().numpy()
            self.buffer.update_deltas(batch_idx, new_deltas)

    def update(self):
        """soft update targets"""
        self.i_updates += 1
        policy_update(self.actor_local, self.actor_target, self.tau)
        policy_update(self.critic_local, self.critic_target, self.tau)

    def save_model(self, model_dir, session_name, i_episode, best):
        """Checkpoint actor/critic weights and optimizer state; also keep a
        '-best' copy of the file."""
        filename = os.path.join(
            model_dir, f'ddpg_{session_name}-EP_{i_episode}-score_{best:.3f}.pt')
        filename_best = os.path.join(model_dir, f'ddpg_{session_name}-best.pt')
        save_dict_list = []
        save_dict = {
            'actor': self.actor_local.state_dict(),
            'actor_optim_params': self.actor_optimizer.state_dict(),
            'critic': self.critic_local.state_dict(),
            'critic_optim_params': self.critic_optimizer.state_dict()
        }
        save_dict_list.append(save_dict)
        torch.save(save_dict_list, filename)
        copyfile(filename, filename_best)

    def postprocess(self, t_step):
        """Adapt the parameter-noise stddev from the distance between
        perturbed and unperturbed actions over the last t_step samples."""
        if self.use_psn and t_step > 0:
            perturbed_states, perturbed_actions, _, _, _ = self.buffer.tail(
                t_step)
            unperturbed_actions = self.act(np.array(perturbed_states), False,
                                           False)
            diff = np.array(perturbed_actions) - unperturbed_actions
            mean_diff = np.mean(np.square(diff), axis=0)
            dist = sqrt(np.mean(mean_diff))
            self.param_noise.adapt(dist)
print('action size:', num_actions) writer = SummaryWriter(args.logdir) actor = Actor(num_inputs, num_actions) critic = Critic(num_inputs) running_state = ZFilter((num_inputs, ), clip=5) if args.load_model is not None: saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) ckpt = torch.load(saved_ckpt_path) actor.load_state_dict(ckpt['actor']) critic.load_state_dict(ckpt['critic']) running_state.rs.n = ckpt['z_filter_n'] running_state.rs.mean = ckpt['z_filter_m'] running_state.rs.sum_square = ckpt['z_filter_s'] print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n)) actor_optim = optim.Adam(actor.parameters(), lr=hp.actor_lr) critic_optim = optim.Adam(critic.parameters(), lr=hp.critic_lr, weight_decay=hp.l2_rate) episodes = 0 for iter in range(15000): actor.eval(), critic.eval()
class DDPG(object):
    """DDPG agent with an optional CNN front-end for pixel observations.

    When ``args.pic`` is set, raw frames are encoded by ``self.cnn`` (with a
    soft-updated ``cnn_target`` twin) before being fed to the actor/critic.
    Uses the legacy pre-0.4 PyTorch autograd API (``volatile`` flags).
    """

    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        # Frame stacking: the effective state is window_length observations.
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0
        if self.pic:
            # For pixel input the CNN output size replaces the raw state size.
            self.nb_status = args.pic_status

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1,
            'hidden2':args.hidden2,
            'use_bn':args.bn,
            'init_method':args.init_method
        }
        if args.pic:
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)

        #Create replay buffer
        self.memory = rpm(args.rmsize)  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        #
        if self.use_cuda: self.cuda()

    def normalize(self, pic):
        # HWC -> CHW layout for the CNN input.
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self):
        """One critic + actor (+ CNN) gradient step from a replay mini-batch."""
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            state_batch = self.cnn(state_batch)
            next_state_batch = np.array([self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            # Target CNN encodes the next state for the bootstrap target.
            next_state_batch = self.cnn_target(next_state_batch)
            next_q_values = self.critic_target([
                next_state_batch,
                self.actor_target(next_state_batch)
            ])
        else:
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ])
        # print('batch of picture is ok')
        # Legacy autograd: re-enable grad flow through the target values' graph.
        next_q_values.volatile = False

        # NOTE(review): np.float was removed in NumPy >= 1.20; this code
        # targets old NumPy/PyTorch versions (see `volatile` above).
        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic: self.cnn_optim.step()

        # Actor update
        self.actor.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            policy_loss = -self.critic([
                state_batch,
                self.actor(state_batch)
            ])
        else:
            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actor(to_tensor(state_batch))
            ])
        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad))

        # NOTE(review): original nesting of this logging block relative to the
        # clipping guard is ambiguous in the collapsed source; logged
        # unconditionally here — confirm against upstream.
        if self.writer != None:
            mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()]))
            #print(mean_policy_grad)
            self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time)

        self.actor_optim.step()
        if self.pic:
            self.cnn_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        if self.pic:
            soft_update(self.cnn_target, self.cnn, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        """Switch every network (and CNN twin, if any) to eval mode."""
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()
        if(self.pic):
            self.cnn.eval()
            self.cnn_target.eval()

    def train(self):
        """Switch every network (and CNN twin, if any) to train mode."""
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()
        if(self.pic):
            self.cnn.train()
            self.cnn_target.train()

    def cuda(self):
        # NOTE(review): unconditionally moves self.cnn — fails if pic=False.
        self.cnn.cuda()
        self.cnn_target.cuda()
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        """Store the completed transition and advance the current state."""
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self, fix=False):
        """Uniform random action; argmax'd for discrete spaces unless fix."""
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        if self.discrete and fix == False:
            action = action.argmax()
        # if self.pic:
        #     action = np.concatenate((softmax(action[:16]), softmax(action[16:])))
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        """Epsilon-greedy action: random with prob noise_level*epsilon, else policy."""
        self.eval()
        if self.pic:
            s_t = self.normalize(s_t)
            s_t = self.cnn(to_tensor(np.array([s_t])))
        if self.pic:
            # NOTE(review): pic branch queries actor_target (not actor) — confirm intended.
            action = to_numpy(
                self.actor_target(s_t)
            ).squeeze(0)
        else:
            action = to_numpy(
                self.actor(to_tensor(np.array([s_t])))
            ).squeeze(0)
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)

        if np.random.uniform(0, 1) < noise_level:
            action = self.random_action(fix=True)  # episilon greedy

        if decay_epsilon:
            self.epsilon -= self.depsilon
        self.a_t = action
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        """Start a new episode: remember the first observation, reset noise."""
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        """Load numbered actor/critic checkpoints into both local and target nets."""
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        """Save actor/critic weights (moved to CPU first when on CUDA)."""
        if self.use_cuda:
            self.cnn.cpu()
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor{}.pkl'.format(output, num)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            self.cnn.cuda()
            self.actor.cuda()
            self.critic.cuda()
class Agent():
    """Single DDPG agent for a multi-agent (MADDPG-style) setup.

    The actor sees only this agent's state; the critic is centralized and
    consumes the concatenated states/actions of all agents
    (full_state_size = state_size * num_agents).
    """

    def __init__(self, state_size, action_size, num_agents, device,
                 gamma=GAMMA, tau=TAU, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC,
                 random_seed=0):
        """ Initialize an Agent object.
        :param state_size: size of state
        :param action_size: size of action
        :param num_agents: number of agents
        :param gamma: discount factor
        :param tau: factor for soft update of target parameters
        :param lr_actor: Learning rate of actor
        :param lr_critic: Learning rate of critic
        :param random_seed: Random seed
        :param device: cuda or cpu
        """
        self.device = device
        self.gamma = gamma
        self.tau = tau

        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        # Centralized-critic input sizes: all agents' states/actions concatenated.
        self.full_state_size = state_size * num_agents
        self.full_action_size = action_size * num_agents
        self.seed = random.seed(random_seed)  # NOTE: random.seed returns None

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, device,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, device,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.full_state_size, self.full_action_size,
                                   device=device, random_seed=random_seed).to(device)
        self.critic_target = Critic(self.full_state_size, self.full_action_size,
                                    device=device, random_seed=random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic, weight_decay=0)

        self.noise = OUNoise(action_size, random_seed)

    def save_model(self, agent_number):
        """Persist local actor/critic weights under an agent-numbered name."""
        torch.save(self.actor_local.state_dict(),
                   f'models/checkpoint_actor_{agent_number}.pth')
        torch.save(self.critic_local.state_dict(),
                   f'models/checkpoint_critic_{agent_number}.pth')

    def load_model(self, agent_number):
        """Load local actor/critic weights (mapped to CPU)."""
        checkpoint = torch.load(f'models/checkpoint_actor_{agent_number}.pth',
                                map_location=torch.device('cpu'))
        self.actor_local.load_state_dict(checkpoint)
        checkpoint = torch.load(f'models/checkpoint_critic_{agent_number}.pth',
                                map_location=torch.device('cpu'))
        self.critic_local.load_state_dict(checkpoint)

    def act(self, state, noise=0., train=False):
        """Returns actions for given state as per current policy.

        :param state: state as seen from single agent
        """
        if train is True:
            self.actor_local.train()
        else:
            self.actor_local.eval()
        action = self.actor_local(state)
        if noise > 0:
            # Scale OU noise by the given magnitude; keep dtype/device of state.
            noise = torch.tensor(noise * self.noise.sample(),
                                 dtype=state.dtype, device=state.device)
        return action + noise

    def target_act(self, state, noise=0.):
        #self.actor_target.eval()
        # convert to cpu() since noise is in cpu()
        self.actor_target.eval()
        action = self.actor_target(state).cpu()
        if noise > 0.:
            # NOTE(review): noise is placed on state.device while action was
            # moved to cpu — potential device mismatch when state is on CUDA.
            noise = torch.tensor(noise * self.noise.sample(),
                                 dtype=state.dtype, device=state.device)
        return action + noise

    def update_critic(self, rewards, dones, all_states, all_actions,
                      all_next_states, all_next_actions):
        """One critic step toward the bootstrapped centralized TD target."""
        with torch.no_grad():
            Q_targets_next = self.critic_target(all_next_states, all_next_actions)
        # Compute Q targets for current states (y_i)
        q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        q_expected = self.critic_local(all_states, all_actions)
        # critic_loss = F.mse_loss(q_expected, q_targets)
        critic_loss = ((q_expected - q_targets.detach())**2).mean()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

    def update_actor(self, all_states, all_predicted_actions):
        """Update actor network

        :param all_states: all states
        :param all_predicted_actions: all predicted actions
        """
        actor_loss = -self.critic_local(all_states, all_predicted_actions).mean()
        self.actor_optimizer.zero_grad()
        # retain_graph: the joint-action graph is reused by other agents' updates.
        actor_loss.backward(retain_graph=True)
        self.actor_optimizer.step()

    def update_targets(self):
        """Soft-update both target networks toward their local twins."""
        self.soft_update(self.actor_local, self.actor_target, self.tau)
        self.soft_update(self.critic_local, self.critic_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def reset(self):
        """Reset the OU noise process (call at episode start)."""
        self.noise.reset()
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed, memory=None,
                 buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE, gamma=GAMMA,
                 tau=TAU, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC,
                 weigth_decay=WEIGHT_DECAY, pretrained_actor_weights=None,
                 pretrained_critic_weights=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)  # NOTE: random.seed returns None
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weigth_decay

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay)

        # Optionally warm-start both local and target nets from saved weights.
        if pretrained_actor_weights:
            actor_weights = torch.load(pretrained_actor_weights)
            self.actor_local.load_state_dict(actor_weights)
            self.actor_target.load_state_dict(actor_weights)

        if pretrained_critic_weights:
            critic_weights = torch.load(pretrained_critic_weights)
            self.critic_local.load_state_dict(critic_weights)
            self.critic_target.load_state_dict(critic_weights)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory (a shared buffer may be injected by the caller)
        if memory:
            self.memory = memory
        else:
            self.memory = ReplayBuffer(action_size, self.buffer_size,
                                       self.batch_size, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device).unsqueeze(0)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the OU noise process (call at episode start)."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # NOTE(review): resetting noise on every learn step (not per episode)
        # is unusual for DDPG — confirm intended.
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class DDPG(object):
    """Classic DDPG agent (actor-critic with target networks and OU noise).

    Uses the legacy pre-0.4 PyTorch autograd API (``volatile`` flags in
    ``to_tensor``); relies on module-level helpers ``hard_update``,
    ``soft_update``, ``to_tensor``/``to_numpy`` and loss ``criterion``.
    """

    def __init__(self, nb_states, nb_actions, args):
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            "hidden1": args.hidden1,
            "hidden2": args.hidden2,
            "init_w": args.init_w,
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(
            self.actor_target, self.actor
        )  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(
            limit=args.rmsize, window_length=args.window_length
        )
        self.random_process = OrnsteinUhlenbeckProcess(
            size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma
        )

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        #
        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        """One critic + actor gradient step from a replay mini-batch."""
        # Sample batch
        (
            state_batch,
            action_batch,
            reward_batch,
            next_state_batch,
            terminal_batch,
        ) = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target(
            [
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ]
        )
        # next_q_values.volatile = False

        # NOTE(review): terminal_batch is used directly as the bootstrap mask
        # (no `1 -`), which assumes it already encodes 1=non-terminal /
        # 0=terminal — confirm against the memory implementation. Also
        # np.float was removed in NumPy >= 1.20.
        target_q_batch = (
            to_tensor(reward_batch)
            + self.discount * to_tensor(terminal_batch.astype(np.float)) * next_q_values
        )

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch), self.actor(to_tensor(state_batch))]
        )

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        """Switch all four networks to eval mode."""
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        """Move all four networks to the GPU."""
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        """Store the completed transition (training only) and advance the state."""
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        """Uniform random action in [-1, 1]^nb_actions (warm-up exploration)."""
        action = np.random.uniform(-1.0, 1.0, self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True):
        """Policy action plus epsilon-scaled OU exploration noise, clipped."""
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1.0, 1.0)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        """Start a new episode: remember the first observation, reset OU noise."""
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        """Load actor/critic weights from `output` directory (no-op if None)."""
        if output is None:
            return

        self.actor.load_state_dict(torch.load("{}/actor.pkl".format(output)))

        self.critic.load_state_dict(torch.load("{}/critic.pkl".format(output)))

    def save_model(self, output):
        """Save actor/critic weights to `output` directory."""
        torch.save(self.actor.state_dict(), "{}/actor.pkl".format(output))
        torch.save(self.critic.state_dict(), "{}/critic.pkl".format(output))

    def seed(self, s):
        """Seed torch (and CUDA when available) RNGs."""
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
class DDPG(object):
    """DDPG agent with frame-stacked inputs and optional discrete action output.

    States are stacked over ``args.window_length`` frames; for discrete
    environments the continuous action vector is argmax'd at the interface.
    Uses the legacy pre-0.4 PyTorch autograd API (``volatile`` flags).
    """

    def __init__(self, nb_states, nb_actions, args, discrete, use_cuda=False):
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions
        self.discrete = discrete

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1,
            'hidden2':args.hidden2,
            'init_w':args.init_w
        }
        self.actor = Actor(self.nb_states * args.window_length, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states * args.window_length, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states * args.window_length, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states * args.window_length, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.memory = rpm(args.rmsize)  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = use_cuda
        #
        if self.use_cuda: self.cuda()

    def update_policy(self, train_actor = True):
        """One critic (and optionally actor) step from a replay mini-batch."""
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)
        # state_batch, action_batch, reward_batch, \
        # next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        # Legacy autograd: re-enable grad flow through the target values' graph.
        next_q_values.volatile = False

        # NOTE(review): np.float was removed in NumPy >= 1.20; this code
        # targets old NumPy/PyTorch versions (see `volatile` above).
        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([ to_tensor(state_batch), to_tensor(action_batch) ])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update (gradient is always computed; step is optional)
        self.actor.zero_grad()

        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        if train_actor == True:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        """Switch all four networks to eval mode."""
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        """Move all four networks to the GPU."""
        print("use cuda")
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        """Store the completed transition and advance the current state."""
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        """Uniform random action; argmax'd for discrete action spaces."""
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=1):
        """Policy action blended with exploration noise.

        Blends the deterministic policy output with random-process noise
        (weight = noise_level * epsilon), clips to [-1, 1], and argmax's for
        discrete spaces unless return_fix is set.
        """
        action = to_numpy(
            self.actor(to_tensor(np.array([s_t])))
        ).squeeze(0)
        # print(self.random_process.sample(), action)
        noise_level = noise_level * max(self.epsilon, 0)

        action = action * (1 - noise_level) + (self.random_process.sample() * noise_level)
        # print(max(self.epsilon, 0) * self.random_process.sample() * noise_level, noise_level)
        action = np.clip(action, -1., 1.)
        # print(action)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        """Start a new episode: remember the first observation, reset noise."""
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        """Load actor/critic weights from `output` directory (no-op if None)."""
        if output is None: return

        self.actor.load_state_dict(
            torch.load('{}/actor.pkl'.format(output))
        )

        self.critic.load_state_dict(
            torch.load('{}/critic.pkl'.format(output))
        )

    def save_model(self, output):
        """Save actor/critic weights (moved to CPU first when on CUDA)."""
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor.pkl'.format(output)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic.pkl'.format(output)
        )
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()

    def seed(self,s):
        """Seed torch (and CUDA when enabled) RNGs."""
        torch.manual_seed(s)
        if self.use_cuda:
            torch.cuda.manual_seed(s)
class Agent():
    """DDPG agent with hand-tuned config for a 3-state/1-action task
    (dimensions and action bounds match Pendulum-v0)."""

    def __init__(self, test=False):
        # device
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        #########################################
        """ Some hand tune config(for developing) """
        self.discrete = False
        self.action_dim = 1
        self.state_dim = 3
        self.batch_size = 100
        self.action_low = -2
        self.action_high = 2
        ##########################################
        # Online/target actor ("P") and critic ("Q"); targets start as copies.
        self.P_online = Actor(state_dim=self.state_dim,
                              action_size=self.action_dim).to(self.device)
        self.P_target = Actor(state_dim=self.state_dim,
                              action_size=self.action_dim).to(self.device)
        self.P_target.load_state_dict(self.P_online.state_dict())
        self.Q_online = Critic(state_size=self.state_dim,
                               action_size=self.action_dim).to(self.device)
        self.Q_target = Critic(state_size=self.state_dim,
                               action_size=self.action_dim).to(self.device)
        self.Q_target.load_state_dict(self.Q_online.state_dict())
        # discounted reward
        self.gamma = 0.99
        self.eps = 0.25
        # optimizer
        self.q_optimizer = torch.optim.Adam(self.Q_online.parameters(), lr=1e-3)
        self.p_optimizer = torch.optim.Adam(self.P_online.parameters(), lr=1e-3)

        # saved rewards and actions
        self.replay_buffer = ReplayBuffer()

        # noise
        self.noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        # Initialize noise
        self.ou_level = 0.
        self.ep_step = 0

    def act(self, state, test=False):
        """Return (raw_action, env_action) for a state.

        NOTE(review): original nesting was ambiguous in the source formatting;
        reconstructed so that noise is applied in the continuous non-test
        branch. The test=True path appears to fall through and return None —
        confirm against the caller.
        """
        if not test:
            with torch.no_grad():
                # boring type casting
                state = ((torch.from_numpy(state)).unsqueeze(0)).float().to(
                    self.device)
                action = self.P_online(state)  # continuous output
                a = action.data.cpu().numpy()
                # if self.ep_step < 200:
                #     self.ou_level = self.noise.ornstein_uhlenbeck_level(self.ou_level)
                #     a = a + self.ou_level
                if self.discrete:
                    action = np.argmax(a)
                    return a, action
                else:
                    # Add OU exploration noise early in the episode, then clip
                    # to the environment's action bounds.
                    if self.ep_step < 200:
                        self.ou_level = self.noise.ornstein_uhlenbeck_level(
                            self.ou_level)
                    action = np.clip(a + self.ou_level, self.action_low,
                                     self.action_high)
                    return action, action

    def collect_data(self, state, action, reward, next_state, done):
        """Push one transition (as float tensors) into the replay buffer."""
        self.replay_buffer.push(
            torch.from_numpy(state).float().unsqueeze(0),
            torch.from_numpy(action).float(),
            torch.tensor([reward]).float().unsqueeze(0),
            torch.from_numpy(next_state).float().unsqueeze(0),
            torch.tensor([done]).float().unsqueeze(0))

    def clear_data(self):
        raise NotImplementedError("Circular Queue don't need this function")

    def update(self):
        """One DDPG update (critic, then actor, then soft target updates)."""
        if len(self.replay_buffer) < self.batch_size:
            return
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size=self.batch_size, device=self.device)
        # discounted rewards
        # rewards = torch.from_numpy(discount((rewards.view(rewards.shape[0])).cpu().numpy())).float().to(self.device)  ### debug shape : ok

        #===============================Critic Update===============================
        self.Q_online.train()
        Q = self.Q_online((states, actions))
        with torch.no_grad():  # don't need backprop for target value
            self.Q_target.eval()
            self.P_target.eval()
            target = rewards + self.gamma * (1 - dones) * self.Q_target(
                (next_states, self.P_target(next_states)))
        critic_loss_fn = torch.nn.MSELoss()
        critic_loss = critic_loss_fn(Q, target).mean()
        # update
        self.q_optimizer.zero_grad()
        critic_loss.backward()
        self.q_optimizer.step()
        # print("critic loss", critic_loss.item())

        #===============================Actor Update===============================
        # fix online_critic , update online_actor
        self.Q_online.eval()
        # Freeze critic params so the actor step only updates the actor.
        for p in self.Q_online.parameters():
            p.requires_grad = False
        for p in self.P_online.parameters():
            p.requires_grad = True
        policy_loss = -self.Q_online((states, self.P_online(states)))
        policy_loss = policy_loss.mean()
        self.p_optimizer.zero_grad()
        policy_loss.backward()
        self.p_optimizer.step()
        # print("policy loss", policy_loss.item())
        # Unfreeze the critic for the next critic update.
        for p in self.Q_online.parameters():
            p.requires_grad = True

        #===============================Target Update===============================
        soft_update(self.Q_target, self.Q_online, tau=1e-3)
        soft_update(self.P_target, self.P_online, tau=1e-3)
        # Decay exploration epsilon toward zero.
        self.eps -= EPSILON_DECAY
        if self.eps <= 0:
            self.eps = 0
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, memory, batch_size,
                 lr_actor, lr_critic, clip_critic, gamma, tau, weight_decay,
                 update_network_steps, sgd_epoch, checkpoint_prefix):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            memory (ReplayBuffer): The replay buffer for storing xperiences
            batch_size (int): Number of experiences to sample from the memory
            lr_actor (float): The learning rate for the actor
            lr_critic (float): The learning rate critic
            clip_critic (float): The clip value for updating grads
            gamma (float): The reward discount factor
            tau (float): For soft update of target parameters
            weight_decay (float): The weight decay
            update_network_steps (int): How often to update the network
            sgd_epoch (int): Number of iterations for each network update
            checkpoint_prefix (string): The string prefix for saving checkpoint files
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.memory = memory
        self.batch_size = batch_size
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.clip_critic = clip_critic
        self.gamma = gamma
        self.tau = tau
        self.weight_decay = weight_decay
        self.update_network_steps = update_network_steps
        self.sgd_epoch = sgd_epoch
        self.n_step = 0

        # checkpoint bookkeeping: per-episode loss history for plotting
        self.checkpoint_prefix = checkpoint_prefix
        self.actor_loss_episodes = []
        self.critic_loss_episodes = []
        self.actor_loss = 0
        self.critic_loss = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, seed)

    def step(self, state, action, action_prob, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward (inputs are per-agent arrays)
        for i in range(len(state)):
            self.memory.add(state[i], action[i], action_prob[i], reward[i],
                            next_state[i], done[i])

        # learn every n steps
        self.n_step = (self.n_step + 1) % self.update_network_steps
        if self.n_step == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > self.batch_size:
                for i in range(self.sgd_epoch):
                    experiences = self.memory.sample()
                    self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy.

        Returns a (clipped_action, action_prob) pair; the probability slot is
        zeros because DDPG is deterministic.
        """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1), np.zeros_like(action)  # N/A action prob for DDPG

    def reset(self):
        """Reset the OU noise process (call at episode start)."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, action_probs, rewards, next_states, dones = experiences

        # normalize rewards
        rewards = utils.normalize_rewards(rewards)

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_loss = critic_loss
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if self.clip_critic > 0:
            # FIX: clip_grad_norm is deprecated (removed in newer PyTorch);
            # use the in-place clip_grad_norm_ instead.
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                           self.clip_critic)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_loss = actor_loss
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def checkpoint(self):
        """Save internal information in memory for later checkpointing"""
        self.actor_loss_episodes.append(self.actor_loss)
        self.critic_loss_episodes.append(self.critic_loss)

    def save_checkpoint(self):
        """Persist checkpoint information"""
        # the history loss
        utils.plot_scores(self.checkpoint_prefix + "_actor_loss.png",
                          self.actor_loss_episodes, label="loss")
        utils.plot_scores(self.checkpoint_prefix + "_critic_loss.png",
                          self.critic_loss_episodes, label="loss")
        # network
        torch.save(self.actor_local.state_dict(),
                   self.checkpoint_prefix + "_actor.pth")
        torch.save(self.critic_local.state_dict(),
                   self.checkpoint_prefix + "_critic.pth")

    def load_checkpoint(self):
        """Restore checkpoint information"""
        self.actor_local.load_state_dict(
            torch.load(self.checkpoint_prefix + "_actor.pth"))
        self.critic_local.load_state_dict(
            torch.load(self.checkpoint_prefix + "_critic.pth"))
class ddpg_Agent():
    """DDPG agent that interacts with and learns from a Unity-style environment."""

    def __init__(self, env, config):
        """Initialize an Agent object.

        Params
        ======
            env : environment to be handled
            config : configuration dict providing the hyperparameters
        """
        self.env = env
        self.config = config
        # set parameter for ML
        self.set_parameters(config)
        # Actor / critic networks (local + target)
        self.create_networks()
        # Noise process for action exploration
        self.noise = OUNoise(self.action_size, self.seed)
        # Replay memory
        self.memory = ReplayBuffer(self.action_size, self.buffer_size, self.batch_size, self.seed)

    def set_parameters(self, config):
        """Copy hyperparameters out of the config dict onto the agent."""
        # Base agent parameters
        self.gamma = config['gamma']                    # discount factor
        self.tau = config['tau']                        # soft-update interpolation factor
        self.max_episodes = config['max_episodes']      # max number of episodes to train
        self.env_file_name = config['env_file_name']    # name and path for env app
        self.brain_name = config['brain_name']          # name for env brain used in step
        self.num_agents = config['num_agents']
        self.state_size = config['state_size']
        self.action_size = config['action_size']
        self.hidden_size = config['hidden_size']
        self.buffer_size = config['buffer_size']
        self.batch_size = config['batch_size']
        self.dropout = config['dropout']
        self.critic_learning_rate = config['critic_learning_rate']
        self.actor_learning_rate = config['actor_learning_rate']
        self.seed = config['seed']
        self.noise_scale = 1        # multiplier on exploration noise, decayed by update_noise_scale()
        self.noise_sigma = 0.1
        # Some debug flags
        self.DoDebugEpisodeLists = False

    def create_networks(self):
        """Build local/target actor and critic networks plus their optimizers."""
        # Actor Network (local & Target Network)
        self.actor_local = Actor(self.state_size, self.hidden_size, self.action_size, self.seed).to(device)
        self.actor_target = Actor(self.state_size, self.hidden_size, self.action_size, self.seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.actor_learning_rate)
        # Critic Network (local & Target Network)
        self.critic_local = Critic(self.state_size, self.hidden_size, self.action_size, self.seed, self.dropout).to(device)
        self.critic_target = Critic(self.state_size, self.hidden_size, self.action_size, self.seed, self.dropout).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.critic_learning_rate)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def update_noise_scale(self, cur_reward, scale_min=0.2, scale_noise=False):
        """If scale_noise == True, self.noise_scale is decreased in relation to rewards.

        Hand-coded schedule: as rewards go up, noise_scale goes down from 1 to
        scale_min:
          * reward <= rewlow  : scale stays 1
          * rewlow..rewhigh   : linear decay to scale_min + 0.5*(1 - scale_min)
          * rewhigh..30       : linear decay down to scale_min
          * reward >= 30      : clamped at scale_min
        """
        if scale_noise:
            rewlow = 2    # below rewlow the noise scale is 1
            rewhigh = 10  # knee point between the two linear segments
            if cur_reward > rewlow:
                if cur_reward < rewhigh:
                    self.noise_scale = (1 - scale_min)*(0.5*(rewhigh-cur_reward)/(rewhigh - rewlow) + 0.5) + scale_min
                else:
                    # BUG FIX: the original used np.min(expr, 0), which passes 0
                    # as the *axis* argument (an error on a scalar).  The
                    # documented intent is a linear decay that bottoms out at
                    # scale_min once the reward reaches 30, i.e. the factor must
                    # be clamped from below at 0 with max().
                    self.noise_scale = (1 - scale_min)*max(0.5*(30-cur_reward)/(30-rewhigh), 0) + scale_min
            print('Updated noise scale to : ', self.noise_scale)
        return

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = ten(state)
        # Evaluate the policy deterministically, then restore train mode.
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            # Scaled OU noise for exploration.
            action += self.noise_scale * self.noise.sample()
        # Actions are bounded to [-1, 1] by the environment.
        return np.clip(action, -1, 1)

    def train(self):
        """Run the training loop over self.max_episodes episodes.

        Returns the list of per-episode mean rewards (averaged over agents).
        """
        if False:  # debug toggle: resume from a saved agent
            filename = 'trained_reacher_a_e100.pth'
            self.load_agent(filename)
        all_rewards = []
        reward_window = deque(maxlen=100)   # sliding window for the 100-episode average
        print('Running on device : ', device)
        for i_episode in range(self.max_episodes):
            tic = time.time()
            # Reset the environment
            env_info = self.env.reset(train_mode=True)[self.brain_name]
            state = env_info.vector_observations
            total_reward = np.zeros(self.num_agents)
            t = 0
            done = np.zeros(self.num_agents, dtype=bool)
            # loop over episode time steps
            while all(done == False):
                # act and collect data
                action = self.act(state)
                env_info = self.env.step(action)[self.brain_name]
                next_state = env_info.vector_observations
                reward = np.asarray(env_info.rewards)
                done = np.asarray(env_info.local_done)
                # increment counters
                t += 1
                total_reward += reward
                # Proceed agent step (store experience, maybe learn)
                self.step(state, action, reward, next_state, done)
                # prepare for next round
                state = next_state
            # keep track of rewards:
            all_rewards.append(np.mean(total_reward))
            reward_window.append(np.mean(total_reward))
            # Output episode info :
            toc = time.time()
            if (i_episode == 100):
                # switch to the more stable hyperparameters after warm-up
                self.stable_update()
            self.update_noise_scale(np.mean(reward_window))
            if not (i_episode % 25 == 0):
                print('Episode {} || Total Reward : {:6.3f} || average reward : {:6.3f} || Used {:5.3f} seconds, mem : {}'.format(i_episode, np.mean(total_reward), np.mean(reward_window), toc-tic, len(self.memory)))
            else:
                # highlight every 25th episode
                print(Back.RED + 'Episode {} || Total Reward : {:6.3f} || average reward : {:6.3f}'.format(i_episode, np.mean(total_reward), np.mean(reward_window)))
                print(Style.RESET_ALL)
            if (i_episode % 50 == 0):
                self.save_agent(i_episode)
        # for i_episode
        return all_rewards

    def reset(self):
        """Reset the exploration-noise process."""
        self.noise.reset()

    def stable_update(self):
        """Update hyperparameters which proved more stable."""
        self.buffer_size = 400000
        self.memory.enlarge(self.buffer_size)
        self.noise_sigma = 0.05
        self.noise.sigma = 0.05

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss; clip critic gradients to norm 1 for stability
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss: maximize expected Q under the local critic
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def save_agent(self, i_episode):
        """Persist all four networks (local + target) to a single file."""
        filename = 'trained_reacher_e'+str(i_episode)+'.pth'
        torch.save({
            'critic_local': self.critic_local.state_dict(),
            'critic_target': self.critic_target.state_dict(),
            'actor_local': self.actor_local.state_dict(),
            'actor_target': self.actor_target.state_dict(),
        }, filename)
        print('Saved Networks in ', filename)
        return

    def load_agent(self, filename):
        """Restore all four networks from a file written by save_agent()."""
        savedata = torch.load(filename)
        self.critic_local.load_state_dict(savedata['critic_local'])
        self.critic_target.load_state_dict(savedata['critic_target'])
        self.actor_local.load_state_dict(savedata['actor_local'])
        self.actor_target.load_state_dict(savedata['actor_target'])
        return
class DDPG(object):
    """DDPG agent written against a pre-0.4 PyTorch API (uses `volatile`
    tensors).  Optionally runs states through a CNN front-end when
    ``args.pic`` is set."""

    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0
        if self.pic:
            # Picture mode: the CNN output size replaces the raw status size.
            self.nb_status = args.pic_status

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn,
            'init_method': args.init_method
        }
        if args.pic:
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)

        #Create replay buffer
        self.memory = rpm(args.rmsize)  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # NOTE(review): self.epsilon is never initialized here (the line below
        # is commented out), yet select_action() reads and decrements it —
        # unless it is set elsewhere this raises AttributeError. TODO confirm.
        # self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        # if self.use_cuda: self.cuda()

    def normalize(self, pic):
        # Move the channel axis to the front (HWC -> CHW style reorder).
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self):
        """One DDPG optimization step: critic regression to the TD target,
        actor ascent on the critic, then soft target updates.
        Returns (-policy_loss, value_loss)."""
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            # Encode raw pictures through the (target) CNN front-end first.
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            state_batch = self.cnn(state_batch)
            next_state_batch = np.array([self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn_target(next_state_batch)
            next_q_values = self.critic_target([next_state_batch, self.actor_target(next_state_batch)])
        else:
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ])
        # print('batch of picture is ok')
        # NOTE(review): `volatile` is a pre-0.4 PyTorch construct (removed in
        # modern torch in favour of torch.no_grad()) — this code requires an
        # old torch version.
        next_q_values.volatile = False

        # TD target: r + γ * (1 - terminal) * Q'(s', π'(s')).
        # NOTE(review): np.float is a deprecated alias removed in NumPy 1.24;
        # use float or np.float64 on newer NumPy.
        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic:
            self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic:
            self.cnn_optim.step()

        # Actor update: gradient ascent on Q(s, π(s))
        self.actor.zero_grad()
        if self.pic:
            self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            policy_loss = -self.critic([state_batch, self.actor(state_batch)])
        else:
            policy_loss = -self.critic([to_tensor(state_batch), self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            # NOTE(review): clip_grad_norm is deprecated in modern PyTorch in
            # favour of clip_grad_norm_ (in-place).
            torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad))

        if self.writer != None:
            # Log the mean per-parameter gradient norm of the actor.
            mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()]))
            #print(mean_policy_grad)
            self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time)

        self.actor_optim.step()
        if self.pic:
            self.cnn_optim.step()

        # Target update (Polyak averaging with factor tau)
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        if self.pic:
            soft_update(self.cnn_target, self.cnn, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        """Switch all networks to evaluation mode."""
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()
        if (self.pic):
            self.cnn.eval()
            self.cnn_target.eval()

    def train(self):
        """Switch all networks back to training mode."""
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()
        if (self.pic):
            self.cnn.train()
            self.cnn_target.train()

    def cuda(self):
        # NOTE(review): self.cnn only exists when args.pic was set in
        # __init__; calling cuda() in non-pic mode raises AttributeError.
        self.cnn.cuda()
        self.cnn_target.cuda()
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        """Store the latest transition and advance the current state."""
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self, fix=False):
        """Sample a uniform random action (exploration)."""
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete and fix == False:
            action = action.argmax()
        if self.pic:
            # NOTE(review): hard-coded 16/16 split of the action vector into
            # two softmax groups — presumably matches the pic-mode action
            # layout; verify against the environment definition.
            action = np.concatenate((softmax(action[:16]), softmax(action[16:])))
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        """Select an action from the current policy, optionally blended with a
        random action with probability proportional to noise_level * epsilon."""
        self.eval()
        if self.pic:
            s_t = self.normalize(s_t)
            s_t = self.cnn(to_tensor(np.array([s_t])))
        if self.pic:
            action = to_numpy(self.actor_target(s_t)).squeeze(0)
        else:
            action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        self.train()
        # NOTE(review): reads self.epsilon, which __init__ leaves unset (see
        # the commented-out initialization there).
        noise_level = noise_level * max(self.epsilon, 0)
        if np.random.uniform(0, 1) < noise_level:
            # Blend the policy action with a random one.
            action = (action + self.random_action(fix=True)) / 2.

        # episilon greedy
        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        """Start a new episode from observation `obs`."""
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        """Load actor/critic weights from `output`; targets get the same weights."""
        if output is None: return
        self.actor.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        """Save actor/critic weights; temporarily moves models to CPU first.

        NOTE(review): unconditionally touches self.cnn when use_cuda is set,
        which only exists in pic mode (same issue as cuda())."""
        if self.use_cuda:
            self.cnn.cpu()
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            self.cnn.cuda()
            self.actor.cuda()
            self.critic.cuda()
def main():
    """Train a PPO actor/critic on the gym environment named in `args`.

    Collects on-policy rollouts until `args.total_sample_size` steps per
    iteration, updates the networks with `train_model`, logs the average
    score, and checkpoints every 100 iterations.
    """
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    # Running observation normalizer (clipped at 5 sigma); its statistics are
    # saved/restored together with the network weights.
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)

    writer = SummaryWriter(comment="-ppo_iter-" + str(args.max_iter_num))

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])

        # Restore the normalizer state so resumed training sees the same
        # observation scaling.
        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    episodes = 0

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()  # on-policy rollout buffer, rebuilt each iteration

        steps = 0
        scores = []

        # Collect at least total_sample_size environment steps of rollouts.
        while steps < args.total_sample_size:
            state = env.reset()
            score = 0

            state = running_state(state)

            for _ in range(10000):  # hard cap on episode length
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)

                # mask = 0 stops return bootstrapping at episode end
                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, reward, mask])

                next_state = running_state(next_state)
                state = next_state
                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train()
        train_model(actor, critic, memory, actor_optim, critic_optim, args)

        # Checkpoint every 100 iterations.
        # BUG FIX: the original condition was `if iter % 100:`, which saved on
        # every iteration NOT divisible by 100 (and skipped the multiples) —
        # the inverse of the intended periodic save.
        if iter % 100 == 0:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'z_filter_n': running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)