def test():
    env = gym.make(args.env_name)
    env.seed(10)
    torch.manual_seed(10)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    actor = Actor(state_size, action_size, args)
    actor.load_state_dict(torch.load(args.load_model))

    for ep in range(args.test_iter):
        score = 0
        done = False

        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            mu, std = actor(torch.Tensor(state))
            action = actor.get_action(mu, std)
            # random_action = env.action_space.sample()
            # Evaluation steps on the deterministic mean; the sampled
            # `action` above is computed but unused here.
            next_state, reward, done, info = env.step(mu.detach().numpy())
            env.render()

            score += reward
            next_state = np.reshape(next_state, [1, state_size])
            state = next_state

        if ep % args.log_interval == 0:
            print(ep, " ep | score ", score)

    env.close()
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, agent_count, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action """ self.state_size = state_size self.action_size = action_size # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(agent_count * state_size, agent_count * action_size, random_seed).to(device) self.critic_target = Critic(agent_count * state_size, agent_count * action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) def soft_update(self): self.soft_update_network(self.critic_local, self.critic_target, TAU) self.soft_update_network(self.actor_local, self.actor_target, TAU) def soft_update_network(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def save(self, name): torch.save(self.actor_local.state_dict(), name + '_actor.pth') torch.save(self.critic_local.state_dict(), name + '_critic.pth') def load(self, name): self.actor_local.load_state_dict(torch.load(name + '_actor.pth')) self.critic_local.load_state_dict(torch.load(name + '_critic.pth'))
def train_tsp(args):
    # Goals from paper:
    # TSP20,  3.97
    # TSP50,  6.08
    # TSP100, 8.44

    from tasks import tsp
    from tasks.tsp import TSPDataset

    STATIC_SIZE = 2   # (x, y)
    DYNAMIC_SIZE = 1  # dummy for compatibility

    train_data = TSPDataset(args.num_nodes, args.train_size, args.seed)
    valid_data = TSPDataset(args.num_nodes, args.valid_size, args.seed + 1)

    update_fn = None

    actor = Actor(STATIC_SIZE, DYNAMIC_SIZE, args.hidden_size, update_fn,
                  tsp.update_mask, args.num_layers, args.dropout).to(device)
    critic = Critic(STATIC_SIZE, DYNAMIC_SIZE, args.hidden_size).to(device)

    kwargs = vars(args)
    kwargs['train_data'] = train_data
    kwargs['valid_data'] = valid_data
    kwargs['reward_fn'] = tsp.reward
    kwargs['render_fn'] = tsp.render

    if args.checkpoint:
        path = os.path.join(args.checkpoint, 'actor.pt')
        actor.load_state_dict(torch.load(path, device))

        path = os.path.join(args.checkpoint, 'critic.pt')
        critic.load_state_dict(torch.load(path, device))

    if not args.test:
        train(actor, critic, **kwargs)

    test_data = TSPDataset(args.num_nodes, args.train_size, args.seed + 2)

    test_dir = 'test'
    test_loader = DataLoader(test_data, args.batch_size, False, num_workers=0)
    out = validate(test_loader, actor, tsp.reward, tsp.render, test_dir,
                   num_plot=5)

    print('Average tour length: ', out)
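# Illustrative only: train_tsp() reads everything else out of `args`. A
# minimal sketch of the namespace it expects; the argument names come from
# the function above, but the default values here are assumptions, not the
# original project's settings.
import argparse

def make_tsp_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_nodes', type=int, default=20)
    parser.add_argument('--train_size', type=int, default=1000000)
    parser.add_argument('--valid_size', type=int, default=1000)
    parser.add_argument('--seed', type=int, default=12345)
    parser.add_argument('--hidden_size', type=int, default=128)
    parser.add_argument('--num_layers', type=int, default=1)
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--checkpoint', default=None)
    parser.add_argument('--test', action='store_true')
    return parser.parse_args()

# train_tsp(make_tsp_args())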
def init_model(env, model_args, ckpt=None):
    # note: weights are restored from the global args.ckpt below; the `ckpt`
    # parameter is unused
    # get input/output size and range
    s_dim = env.get_state_size()
    a_dim = env.get_action_size()
    a_min = env.a_min
    a_max = env.a_max
    a_noise = model_args["noise"] * np.ones(a_dim)

    # get reference memory for FFC
    ref_mem = env._mocap.get_ref_mem()
    if not model_args["with_ffc"]:
        ref_mem.fill(0)
    ref_mem = ref_mem[:, 1:]  # no phase velocity

    # automatically use gpu
    if use_gpu:
        torch.set_default_tensor_type('torch.cuda.FloatTensor')

    from model import Normalizer, Actor, Critic

    GAMMA = file_args["train_args"]["gamma"]
    non_norm = [0]  # FMD0
    s_norm = Normalizer(s_dim, non_norm)
    actor = Actor(s_dim, a_dim, a_min, a_max, a_noise, ref_mem.shape[0])
    critic = Critic(s_dim, 0, 1 / (1 - GAMMA))
    actor.set_reference(ref_mem)
    actor.ref_mem.requires_grad = False

    if args.ckpt is not None:
        try:
            checkpoint = torch.load(args.ckpt)
            actor.load_state_dict(checkpoint["actor"])
            critic.load_state_dict(checkpoint["critic"])
            s_norm.load_state_dict(checkpoint["s_norm"])
            print("load from %s" % args.ckpt)
        except Exception:
            # fail loudly rather than continuing with uninitialized weights
            print("fail to load from %s" % args.ckpt)
            raise

    return s_norm, actor, critic
class DDPGTrainer(object):
    def __init__(self):
        self.actor = Actor().to(device)
        self.actor_target = Actor().to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=0.0001)

        self.critic = Critic().to(device)
        self.critic_target = Critic().to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 weight_decay=1e-2)

        self.loss = torch.nn.MSELoss()

    def train(self, replay_buffer, iterations, batch_size=64, discount=0.99,
              tau=0.001):
        for it in range(iterations):
            # Sample replay buffer
            smp = replay_buffer.sample(batch_size)
            x, y, u, r, d = smp
            state = torch.FloatTensor(x).to(device)
            action = torch.stack(u)
            next_state = torch.FloatTensor(y).to(device)
            done = torch.FloatTensor(d).to(device)
            reward = torch.FloatTensor(r).to(device)

            # Compute the target Q value. Note: `done` multiplies the
            # bootstrap term directly, so the buffer is expected to store a
            # continuation mask (1 for non-terminal transitions).
            ac = self.actor_target(next_state)
            ac = torch.cat(ac, dim=1)
            target_Q = self.critic_target(next_state, ac)
            target_Q = reward + (done * discount * target_Q).detach()

            # Get current Q estimate
            current_Q = self.critic(state, action)

            # Compute critic loss
            critic_loss = self.loss(current_Q, target_Q)

            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward(retain_graph=True)
            self.critic_optimizer.step()

            # Compute actor loss
            actor_loss = -self.critic(
                state, torch.cat(self.actor(state), dim=1)).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(tau * param.data +
                                        (1 - tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(tau * param.data +
                                        (1 - tau) * target_param.data)
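# Illustrative driver loop for DDPGTrainer (not from the original source):
# `ReplayBuffer` and `collect_episode` are assumed helpers whose sample()
# returns (states, next_states, actions, rewards, done_masks) in the order
# unpacked inside train().
# trainer = DDPGTrainer()
# buffer = ReplayBuffer(capacity=int(1e6))      # hypothetical
# for episode in range(500):
#     collect_episode(trainer.actor, buffer)    # hypothetical rollout helper
#     if len(buffer) >= 64:
#         trainer.train(buffer, iterations=100, batch_size=64)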
def main():
    expert_demo = pickle.load(open('./Ree1_expert.p', "rb"))
    # Ree1 : action 1
    # Ree2 : action 100
    # Ree3 : action 50
    # Ree4 : action 10
    # Ree5 : action 4
    # Ree6 : action 0.5

    # print('expert_demo_shape : ', np.array(expert_demo).shape)
    expert_x = int(expert_demo[1][0])
    expert_y = int(expert_demo[1][1])

    env = Env(expert_x, expert_y)
    # env = Env(0,0)
    # env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = 2
    num_actions = 8

    running_state = ZFilter((num_inputs, ), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations
    # expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo[0])
    # print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)

    # accuracy histories plotted below; initialized here as an assumption,
    # since they are referenced but never defined in this snippet
    temp_learner = []
    temp_expert = []

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model',
                                       str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0

            state = running_state(state)

            for _ in range(1000):
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action2 = np.argmax(get_action(mu, std)[0])
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action2)
                # next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes,
                                                       score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()

        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory,
                                                    discrim_optim,
                                                    demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" %
                  (expert_acc * 100, learner_acc * 100))
            temp_learner.append(learner_acc * 100)
            temp_expert.append(expert_acc * 100)

            if ((expert_acc > args.suspend_accu_exp
                 and learner_acc > args.suspend_accu_gen and iter % 55 == 0)
                    or iter % 50 == 0):
                # train_discrim_flag = False
                plt.plot(temp_learner, label='learner')
                plt.plot(temp_expert, label='expert')
                plt.xlabel('Episode')
                plt.ylabel('Accuracy')
                plt.xticks([])
                plt.legend()
                plt.savefig('accuracy{}.png'.format(iter))
                # plt.show()

                model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail'
                ckpt_path = os.path.join(model_path,
                                         'ckpt_' + str(score_avg) + '.pth.tar')
                print("check path", ckpt_path)

                save_checkpoint(
                    {
                        'actor': actor.state_dict(),
                        'critic': critic.state_dict(),
                        'discrim': discrim.state_dict(),
                        'z_filter_n': running_state.rs.n,
                        'z_filter_m': running_state.rs.mean,
                        'z_filter_s': running_state.rs.sum_square,
                        'args': args,
                        'score': score_avg
                    },
                    filename=ckpt_path)

        train_actor_critic(actor, critic, memory, actor_optim, critic_optim,
                           args)

        # original condition was `if iter % 100:`, which fires on every
        # iteration *except* multiples of 100; save on multiples instead
        if iter % 100 == 0:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail'

            ckpt_path = os.path.join(model_path,
                                     'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint(
                {
                    'actor': actor.state_dict(),
                    'critic': critic.state_dict(),
                    'discrim': discrim.state_dict(),
                    'z_filter_n': running_state.rs.n,
                    'z_filter_m': running_state.rs.mean,
                    'z_filter_s': running_state.rs.sum_square,
                    'args': args,
                    'score': score_avg
                },
                filename=ckpt_path)

    plt.plot(temp_learner)
    plt.plot(temp_expert)
    plt.xlabel('Episode')
    plt.ylabel('Accuracy')
    plt.xticks([])
    plt.savefig('accuracy.png')
class DDPGAgent:
    def __init__(self, plot=True, seed=1, env: gym.Env = None, batch_size=128,
                 learning_rate_actor=0.001, learning_rate_critic=0.001,
                 weight_decay=0.01, gamma=0.999):
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = batch_size
        self.learning_rate_actor = learning_rate_actor
        self.learning_rate_critic = learning_rate_critic
        self.weight_decay = weight_decay
        self.gamma = gamma
        self.tau = 0.001
        self._to_tensor = util.to_tensor
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        self.actor = Actor(self.state_dim, self.action_dim).to(self.device)
        self.target_actor = Actor(self.state_dim,
                                  self.action_dim).to(self.device)
        self.actor_optimizer = torch.optim.Adam(
            self.actor.parameters(), self.learning_rate_actor,
            weight_decay=self.weight_decay)

        self.critic = Critic(self.state_dim, self.action_dim).to(self.device)
        self.target_critic = Critic(self.state_dim,
                                    self.action_dim).to(self.device)
        self.critic_optimizer = torch.optim.Adam(
            self.critic.parameters(), self.learning_rate_critic,
            weight_decay=self.weight_decay)

        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.t = 0

    def _learn_from_memory(self, memory):
        """Learn from memory and update the parameters of both networks."""
        # Sample a random batch of Transitions from memory
        trans_pieces = memory.sample(self.batch_size)
        s0 = np.vstack([x.state for x in trans_pieces])
        a0 = np.vstack([x.action for x in trans_pieces])
        r1 = np.vstack([x.reward for x in trans_pieces])
        s1 = np.vstack([x.next_state for x in trans_pieces])
        terminal_batch = np.vstack([x.is_done for x in trans_pieces])

        # Optimize the critic network's parameters
        s1 = self._to_tensor(s1, device=self.device)
        s0 = self._to_tensor(s0, device=self.device)
        next_q_values = self.target_critic.forward(
            state=s1, action=self.target_actor.forward(s1)).detach()
        # note: `terminal_batch` multiplies the bootstrap term directly, so
        # it is treated as a continuation mask here; np.float is deprecated,
        # so cast with the builtin float instead
        target_q_batch = self._to_tensor(r1, device=self.device) + \
            self.gamma * self._to_tensor(terminal_batch.astype(float),
                                         device=self.device) * next_q_values
        q_batch = self.critic.forward(s0,
                                      self._to_tensor(a0, device=self.device))

        # Compute the critic loss and update the critic's parameters
        loss_critic = F.mse_loss(q_batch, target_q_batch)
        # self.critic_optimizer.zero_grad()
        self.critic.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        # Backpropagate, using the value estimate of the state as the policy
        # objective (gradient ascent on Q)
        loss_actor = -self.critic.forward(s0, self.actor.forward(s0))
        loss_actor = loss_actor.mean()
        self.actor.zero_grad()
        # self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        # Soft-update the target network parameters
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)
        return (loss_critic.item(), loss_actor.item())

    def learning(self, memory):
        self.actor.train()
        return self._learn_from_memory(memory)

    def save_models(self, episode_count):
        torch.save(self.target_actor.state_dict(),
                   './Models/' + str(episode_count) + '_actor.pt')
        torch.save(self.target_critic.state_dict(),
                   './Models/' + str(episode_count) + '_critic.pt')

    def load_models(self, episode):
        self.actor.load_state_dict(
            torch.load('./Models/' + str(episode) + '_actor.pt'))
        self.critic.load_state_dict(
            torch.load('./Models/' + str(episode) + '_critic.pt'))
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        print('Models loaded successfully')
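# DDPGAgent calls hard_update()/soft_update() helpers that are not shown in
# this snippet. A minimal sketch under the (target, source) argument order
# used above; that order is an assumption inferred from how the class
# invokes them.
def hard_update(target, source):
    """Copy source parameters into target (equivalent to tau = 1)."""
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(s.data)

def soft_update(target, source, tau):
    """Polyak averaging: theta_target <- tau*theta_source + (1-tau)*theta_target."""
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(tau * s.data + (1.0 - tau) * t.data)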
env = gym.make(args.env_name)
env.seed(500)
torch.manual_seed(500)

state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
print('state size:', state_size)
print('action size:', action_size)

actor = Actor(state_size, action_size, args)

if args.load_model is not None:
    pretrained_model_path = os.path.join(os.getcwd(), 'save_model',
                                         str(args.load_model))
    pretrained_model = torch.load(pretrained_model_path)
    actor.load_state_dict(pretrained_model)

ou_noise = OUNoise(action_size, args.theta, args.mu, args.sigma)

steps = 0

for episode in range(args.iter):
    done = False
    score = 0

    state = env.reset()
    state = np.reshape(state, [1, state_size])

    while not done:
        if args.render:
            env.render()
def main(args):
    with open(args.data_dir + '/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)

    # required to map between integer-value sentences and real sentences
    w2i, i2w = vocab['w2i'], vocab['i2w']

    # make sure our models for the VAE and Actor exist
    if not os.path.exists(args.load_vae):
        raise FileNotFoundError(args.load_vae)

    model = SentenceVAE(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
    )

    model.load_state_dict(
        torch.load(args.load_vae, map_location=lambda storage, loc: storage))
    model.eval()
    print("vae model loaded from %s" % (args.load_vae))

    # to run in constraint mode, we need the trained generator
    if args.constraint_mode:
        if not os.path.exists(args.load_actor):
            raise FileNotFoundError(args.load_actor)

        actor = Actor(dim_z=args.latent_size, dim_model=2048,
                      num_labels=args.n_tags)
        actor.load_state_dict(
            torch.load(args.load_actor,
                       map_location=lambda storage, loc: storage))
        actor.eval()
        print("actor model loaded from %s" % (args.load_actor))

    if torch.cuda.is_available():
        model = model.cuda()
        if args.constraint_mode:
            actor = actor.cuda()  # TODO: to(self.devices)

    if args.sample:
        print('*** SAMPLE Z: ***')

        # get samples from the prior
        sample_sents, z = model.inference(n=args.num_samples)
        sample_sents, sample_tags = get_sents_and_tags(sample_sents, i2w, w2i)
        pickle_it(z.cpu().numpy(),
                  'samples/z_sample_n{}.pkl'.format(args.num_samples))
        pickle_it(sample_sents,
                  'samples/sents_sample_n{}.pkl'.format(args.num_samples))
        pickle_it(sample_tags,
                  'samples/tags_sample_n{}.pkl'.format(args.num_samples))
        print(sample_sents, sep='\n')

        if args.constraint_mode:
            print('*** SAMPLE Z_PRIME: ***')

            # get samples from the prior, conditioned via the actor
            all_tags_sample_prime = []
            all_sents_sample_prime = {}
            all_z_sample_prime = {}

            for i, condition in enumerate(LABELS):
                # binary vector denoting each of the PHRASE_TAGS
                labels = torch.Tensor(condition).repeat(args.num_samples,
                                                        1).cuda()

                # take z and manipulate using the actor to generate z_prime
                z_prime = actor.forward(z, labels)
                sample_sents_prime, z_prime = model.inference(
                    z=z_prime, n=args.num_samples)
                sample_sents_prime, sample_tags_prime = get_sents_and_tags(
                    sample_sents_prime, i2w, w2i)

                print('conditioned on: {}'.format(condition))
                print(sample_sents_prime, sep='\n')

                all_tags_sample_prime.append(sample_tags_prime)
                all_sents_sample_prime[LABEL_NAMES[i]] = sample_sents_prime
                all_z_sample_prime[LABEL_NAMES[i]] = z_prime.data.cpu().numpy()

            pickle_it(all_tags_sample_prime,
                      'samples/tags_sample_prime_n{}.pkl'.format(args.num_samples))
            pickle_it(all_sents_sample_prime,
                      'samples/sents_sample_prime_n{}.pkl'.format(args.num_samples))
            pickle_it(all_z_sample_prime,
                      'samples/z_sample_prime_n{}.pkl'.format(args.num_samples))

    if args.interpolate:
        # get random samples from the latent space
        z1 = torch.randn([args.latent_size]).numpy()
        z2 = torch.randn([args.latent_size]).numpy()
        z = to_var(torch.from_numpy(
            interpolate(start=z1, end=z2,
                        steps=args.num_samples - 2)).float())

        print('*** INTERP Z: ***')
        interp_sents, _ = model.inference(z=z)
        interp_sents, interp_tags = get_sents_and_tags(interp_sents, i2w, w2i)
        pickle_it(z.cpu().numpy(),
                  'samples/z_interp_n{}.pkl'.format(args.num_samples))
        pickle_it(interp_sents,
                  'samples/sents_interp_n{}.pkl'.format(args.num_samples))
        pickle_it(interp_tags,
                  'samples/tags_interp_n{}.pkl'.format(args.num_samples))
        print(interp_sents, sep='\n')

        if args.constraint_mode:
            print('*** INTERP Z_PRIME: ***')
            all_tags_interp_prime = []
            all_sents_interp_prime = {}
            all_z_interp_prime = {}

            for i, condition in enumerate(LABELS):
                # binary vector denoting each of the PHRASE_TAGS
                labels = torch.Tensor(condition).repeat(args.num_samples,
                                                        1).cuda()

                # z prime conditioned on this particular binary variable
                z_prime = actor.forward(z, labels)
                interp_sents_prime, z_prime = model.inference(
                    z=z_prime, n=args.num_samples)
                interp_sents_prime, interp_tags_prime = get_sents_and_tags(
                    interp_sents_prime, i2w, w2i)

                print('conditioned on: {}'.format(condition))
                print(interp_sents_prime, sep='\n')

                all_tags_interp_prime.append(interp_tags_prime)
                all_sents_interp_prime[LABEL_NAMES[i]] = interp_sents_prime
                all_z_interp_prime[LABEL_NAMES[i]] = z_prime.data.cpu().numpy()

            pickle_it(all_tags_interp_prime,
                      'samples/tags_interp_prime_n{}.pkl'.format(args.num_samples))
            pickle_it(all_sents_interp_prime,
                      'samples/sents_interp_prime_n{}.pkl'.format(args.num_samples))
            pickle_it(all_z_interp_prime,
                      'samples/z_interp_prime_n{}.pkl'.format(args.num_samples))

    import IPython; IPython.embed()
# Import the Actor model
from model import Actor

# Create instance of Reacher environment
env = UnityEnvironment(file_name='Reacher-20.app')  # Update the app name/location if not using macOS

# Get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Load Actor model weights
actor = Actor(state_size=33, action_size=4, seed=0)
actor.load_state_dict(torch.load('checkpoint_actor.pth'))


# Testing
def test(state, agents, action_size):
    """Testing the Reacher agent for a single agent

    Params
    ======
        state (numpy.ndarray): Current state that the agents are experiencing
        agents (int): The number of agents (= 20 in this case)
        action_size (int): Number of possible actions an agent can take
    """
    global actor
class Agent(): """ Interacts with and learns from the environment """ def __init__(self, state_size, action_size, num_agents, seed): """ Initialize an Agent object Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents to run seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(seed) # Actor network (with target network) self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic network (with target network) self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise((num_agents, action_size), seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) self.steps_counter = 0 self.train_counter = 0 def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for i in range(self.num_agents): self.memory.add(state[i, :], action[i, :], reward[i], next_state[i, :], done[i]) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def add(self, state, action, reward, next_state, done): """ Save experience to replay memory """ # Save experience / reward self.memory.add(state, action, reward, next_state, done) def learn_from_buffer(self, train_counter): for i in range(train_counter): if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, states, add_noise=True): """ Returns actions for given state as per current policy """ states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): for agent_num, state in enumerate(states): action = self.actor_local(state).cpu().data.numpy() actions[agent_num, :] = action self.actor_local.train() if add_noise: actions += self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """ Update policy/value parameters using batch of experience tuples Params ====== experiences (Tuple[torch.Tensor]): tuple of (s,a,r,s',done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ############# Update Critic ############# # Get predicted next-state actions and Q-values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) ?? Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # ?? 
# Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() ############# Update Actor ############# # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() ############# Update Target Networks ############# self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """ Soft update model parameters θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (pyTorch model): where weights come from target_model (pyTorch model): where weights will go tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def set_target_model(self, actor, critic): # ?? self.actor_target = actor self.critic_target = critic def get_target_model(self): return self.actor_target, self.critic_target def load_weights(self, actor_path, critic_path): # Actor self.actor_local.load_state_dict(torch.load(actor_path)) self.actor_target.load_state_dict(torch.load(actor_path)) # Critic self.critic_local.load_state_dict(torch.load(critic_path)) self.critic_target.load_state_dict(torch.load(critic_path))
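# To make the soft-update interpolation concrete (illustrative, not from the
# original source): with tau = 0.001 each update moves a target weight only
# 0.1% of the way toward its local counterpart.
import torch

def _soft_update_demo():
    tau = 0.001
    target = torch.tensor([1.0])
    local = torch.tensor([2.0])
    target = tau * local + (1.0 - tau) * target
    print(target)  # tensor([1.0010]), a small step toward the local value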
def val(train_models, n_episodes=100000, max_t=1000, print_every=10, eps=0,
        temperature=1, actor_state_dict=None):
    log_str = "Start Validating !!!"
    epf.write(log_str)
    epf.flush()
    print(log_str)

    train_env = None
    for train_model in train_models:
        start = datetime.now()
        start_date = "{}".format(start.date())
        start_time = "{}".format(start.time())

        val_actor = Actor(dim_size=dim_size, resource_size=2,
                          n_action_steps=2, action_size=12, h_size=128,
                          num_steps=opt.num_steps).to(device)
        val_actor.load_state_dict(actor_state_dict)
        val_agent = Agent(val_actor, h_size=128, device=device)
        val_agent.set_fitness(opt.fitness)
        val_agent.reset()

        best_score = float("-Inf")
        best_reward_whole = float("-Inf")
        best_constraint = 0.
        best_episode = 0
        best_sol_whole = None
        scores_window = deque(maxlen=print_every)
        scores = []
        episodes = 0
        has_succeed_history = False

        train_m_file = os.path.join(m_file_path, train_model + ".csv")
        train_df = pd.read_csv(train_m_file)
        train_model_defs = train_df.to_numpy()

        if train_env is None:
            train_env = MaestroEnvironment(model_defs=train_model_defs,
                                           dim_size=dim_size, resource_size=2,
                                           n_action_steps=2, dataflow=opt.df)
        else:
            train_env.reset_dimension(model_defs=train_model_defs)

        train_env.set_fitness(opt.fitness)
        train_env.set_constraint(opt.cstr)
        constraint_temp = [
            train_env.get_ref_constraint([action_bound[0], action_bound[1]]),
            train_env.get_ref_constraint([action_bottom[0], action_bottom[1]]),
            train_env.get_ref_constraint([action_bound[0], action_bottom[1]]),
            train_env.get_ref_constraint([action_bottom[0], action_bound[1]])
        ]
        max_constraint, min_constraint = max(constraint_temp), min(constraint_temp)
        # epf.write("Max constraint: {}\n".format(max_constraint))
        # epf.flush()
        # epf.write("Min constraint: {}\n".format(min_constraint))
        # epf.flush()
        set_constraint = min_constraint + \
            (max_constraint - min_constraint) * ratio
        train_env.set_constraint_value(max_constraint, min_constraint,
                                       set_constraint)

        theta = {}
        for name, param in val_actor.named_parameters():
            theta[name] = param

        for i_adapt in range(n_episodes):
            score = 0
            # env.shuffle_model()
            state, infos = train_env.reset()
            for _ in range(max_t):
                action, log_prob = val_agent.act(state, infos, eps,
                                                 temperature, theta)
                # print(log_prob)
                # pdb.set_trace()
                next_state, reward, done, infos, sig, impt = train_env.step(action)
                val_agent.step(state, action, log_prob, reward, next_state,
                               done, sig, impt, infos)
                state = next_state
                score += reward
                if done:
                    break

            # inner-gradient
            learned_loss = compute_learned_loss(val_agent, theta)
            if learned_loss is None:
                continue
            inner_gradient = torch.autograd.grad(
                learned_loss,
                [v for _, v in theta.items()],
                create_graph=not first_order,
                retain_graph=not first_order,
                allow_unused=True,
            )
            theta = SGD_step(theta, inner_gradient, INNER_LR)

        env_chkpt = train_env.get_chkpt()
        if env_chkpt["best_sol"] is not None:
            best_sol = env_chkpt["best_sol"]["sol"]
            set_constraint = env_chkpt["ctrs_info"]["value"]
            max_constraint = env_chkpt["ctrs_info"]["max"]
            min_constraint = env_chkpt["ctrs_info"]["min"]
            episode_reward, episode_constraint = train_env.exterior_search(best_sol)
            if episode_reward > best_reward_whole:
                best_reward_whole = episode_reward
                best_constraint = episode_constraint
                best_episode = i_adapt
                best_sol_whole = best_sol

        val_agent.reset()
        log_str = (f'RL result: {train_model} at {best_episode} '
                   f'reward: {best_reward_whole:.4e}, Used Constraint: '
                   f'{best_constraint/train_env.constraint_value * 100:.1f}%\n')
        print(log_str)
        epf.write(log_str)
        epf.flush()

        end = datetime.now()
        end_date = "{}".format(end.date())
        end_time = "{}".format(end.time())
        log_str = 'Running time: from {} to {}'.format(start_time, end_time)
        print(log_str)
        epf.write(log_str)
        epf.flush()

        if (model2sol[train_model]['best_reward'] is None
                or best_reward_whole > model2sol[train_model]['best_reward']):
            model2sol[train_model]['best_reward'] = best_reward_whole
            model2sol[train_model]['best_sol'] = best_sol_whole

    for f in glob.glob("*.m"):
        os.remove(f)
    for f in glob.glob("*.csv"):
        os.remove(f)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size).to(device) self.actor_target = Actor(state_size, action_size).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic1_local = Critic(state_size, action_size).to(device) self.critic1_target = Critic(state_size, action_size).to(device) self.critic1_optimizer = optim.Adam(self.critic1_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) self.critic2_local = Critic(state_size, action_size).to(device) self.critic2_target = Critic(state_size, action_size).to(device) self.critic2_optimizer = optim.Adam(self.critic2_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size) # Replay memory self.memory = PER(BUFFER_SIZE) def step(self, state, action, reward, next_state, done): """Save experience in replay memory.""" # Set reward as initial priority, see: # https://jaromiru.com/2016/11/07/lets-make-a-dqn-double-learning-and-prioritized-experience-replay/ self.memory.add((state, action, reward, next_state, done), reward) def act(self, state): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() action += self.noise.sample() return np.clip(action, -1., 1.) def reset(self): self.noise.reset() def mse(self, expected, targets, is_weights): """Custom loss function that takes into account the importance-sampling weights.""" td_error = expected - targets weighted_squared_error = is_weights * td_error * td_error return torch.sum(weighted_squared_error) / torch.numel( weighted_squared_error) def learn(self): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value """ for i in range(1, LEARN_BATCH + 1): idxs, experiences, is_weights = self.memory.sample(BATCH_SIZE) states = torch.from_numpy( np.vstack([e[0] for e in experiences if e is not None])).float().to(device) actions = torch.from_numpy( np.vstack([e[1] for e in experiences if e is not None])).float().to(device) rewards = torch.from_numpy( np.vstack([e[2] for e in experiences if e is not None])).float().to(device) next_states = torch.from_numpy( np.vstack([e[3] for e in experiences if e is not None])).float().to(device) dones = torch.from_numpy( np.vstack([e[4] for e in experiences if e is not None ]).astype(np.uint8)).float().to(device) is_weights = torch.from_numpy(is_weights).float().to(device) # ---------------------------- update critic ---------------------------- # # Target Policy Smoothing Regularization: add a small amount of clipped random noises to the selected action if POLICY_NOISE > 0.0: noise = torch.empty_like(actions).data.normal_( 0, POLICY_NOISE).to(device) noise = noise.clamp(-POLICY_NOISE_CLIP, POLICY_NOISE_CLIP) # Get predicted next-state actions and Q values from target models actions_next = (self.actor_target(next_states) + noise).clamp( -1., 1.) else: # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) # Error Mitigation Q_targets_next = torch.min(\ self.critic1_target(next_states, actions_next), \ self.critic2_target(next_states, actions_next)).detach() # Compute Q targets for current states (y_i) Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones)) # Compute critic1 loss Q_expected = self.critic1_local(states, actions) errors1 = np.abs((Q_expected - Q_targets).detach().cpu().numpy()) critic1_loss = self.mse(Q_expected, Q_targets, is_weights) # Minimize the loss self.critic1_optimizer.zero_grad() critic1_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic1_local.parameters(), 1) self.critic1_optimizer.step() # Update priorities in the replay buffer self.memory.batch_update(idxs, errors1) # Compute critic2 loss Q_expected = self.critic2_local(states, actions) critic2_loss = self.mse(Q_expected, Q_targets, is_weights) # Minimize the loss self.critic2_optimizer.zero_grad() critic2_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic2_local.parameters(), 1) self.critic2_optimizer.step() # Delayed Policy Updates if i % UPDATE_ACTOR_EVERY == 0: # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic1_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic1_local, self.critic1_target, TAU) self.soft_update(self.critic2_local, self.critic2_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def save_weights(self): torch.save(self.actor_local.state_dict(), actor_weights_file) torch.save(self.critic1_local.state_dict(), critic1_weights_file) torch.save(self.critic2_local.state_dict(), critic2_weights_file) def load_weights(self): self.actor_local.load_state_dict(torch.load(actor_weights_file)) self.critic1_local.load_state_dict(torch.load(critic1_weights_file)) self.critic2_local.load_state_dict(torch.load(critic2_weights_file))
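# Sanity check (illustrative, not from the original source): the
# importance-sampling weighted loss in mse() reduces to a plain MSE when
# every weight is 1.
import torch
import torch.nn.functional as F

def _weighted_mse_check():
    expected = torch.tensor([[1.0], [2.0], [3.0]])
    targets = torch.tensor([[1.5], [2.0], [2.0]])
    uniform_w = torch.ones_like(expected)
    td = expected - targets
    weighted = torch.sum(uniform_w * td * td) / torch.numel(td)
    assert torch.isclose(weighted, F.mse_loss(expected, targets))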
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, identity, state_size, action_size, random_seed, memory, noise): # checked """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) self.checkpoint_file_path = './checkpoint_agent_' + str( identity) + '.pth' if os.path.isfile(self.checkpoint_file_path): self.actor_local.load_state_dict( torch.load(self.checkpoint_file_path)) self.actor_target.load_state_dict( torch.load(self.checkpoint_file_path)) # Replay memory self.memory = memory # Noise process self.noise = noise def act(self, state, add_noise=True): # checked """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def step(self, state, action, reward, next_state, done): # checked """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) def reset(self): # checked self.noise.reset()
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.writer = writer
        self.select_time = 0

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_method': args.init_method
        }
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon
        # must start defined, since select_action() decays it
        self.epsilon = 1.0

        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        # if self.use_cuda: self.cuda()

    def update_policy(self, train_actor=True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch; no gradients flow through the
        # targets (replaces the removed Variable(volatile=True) idiom)
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])
        # print('batch of picture is ok')
        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(1 - terminal_batch.astype(np.float32)) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float32))
        value_loss = nn.MSELoss()(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])
        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(),
                                           float(self.clip_actor_grad))

        if self.writer is not None:
            mean_policy_grad = np.array(np.mean(
                [np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                 for p in self.actor.parameters()]))
            # print(mean_policy_grad)
            self.writer.add_scalar('train/mean_policy_grad',
                                   mean_policy_grad, self.select_time)

        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False,
                      noise_level=0):
        self.eval()
        # print(s_t.shape)
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        self.train()

        # blend the deterministic action with exploration noise, scaled by
        # the decaying epsilon
        noise_level = noise_level * max(self.epsilon, 0)
        action = action * (1 - noise_level) + \
            (self.random_process.sample() * noise_level)
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        if output is None:
            return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(),
                   '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(),
                   '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
class DDPG():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed, num_agents,
                 agent_id):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.num_agents = num_agents
        self.agent_id = agent_id
        self.eps = EPS_START
        self.eps_decay = 1 / (EPS_EP_END * EPOCHS)
        self.timestep = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size * 2, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size * 2, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size * 2, action_size * 2,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size * 2, action_size * 2,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.timestep += 1
        priority = (abs(reward) + PRIORITY_EPS) ** PRIORITY_ALPHA
        self.memory.add(state, action, reward, next_state, done, priority)

        if self.timestep % UPDATE_EVERY != 0:
            return

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            for i in range(EPOCHS):
                experiences = self.memory.sample(device)
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise):
        """Returns actions for both agents as per current policy, given their respective states."""
        state = torch.from_numpy(state).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            # get action for each agent and concatenate them
            actions = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        # add noise to actions
        if add_noise:
            actions += self.eps * self.noise.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)

        # Construct next actions vector relative to the agent
        if self.agent_id == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)

        # Compute Q targets for current states (y_i)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        actions_pred = self.actor_local(states)

        # Construct action prediction vector relative to each agent
        if self.agent_id == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)

        # Compute actor loss
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # update noise decay parameter
        self.eps -= self.eps_decay
        self.eps = max(self.eps, EPS_FINAL)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save(self):
        torch.save(self.actor_local.state_dict(),
                   'actor{}.pth'.format(self.agent_id))
        torch.save(self.critic_local.state_dict(),
                   'critic{}.pth'.format(self.agent_id))

    def load(self):
        self.actor_local.load_state_dict(
            torch.load('actor{}.pth'.format(self.agent_id)))
        self.critic_local.load_state_dict(
            torch.load('critic{}.pth'.format(self.agent_id)))
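# Toy illustration (not from the original source) of the agent-relative
# action splicing in learn(): two agents with 2-D actions, so the joint
# action tensor has four columns and agent 0 owns columns 0:2.
import torch

def _joint_action_demo():
    batch_actions = torch.tensor([[0.1, 0.2, 0.3, 0.4]])  # joint actions
    my_next = torch.tensor([[0.9, 0.8]])  # this agent's target-actor output
    agent_id = 0
    if agent_id == 0:
        joint = torch.cat((my_next, batch_actions[:, 2:]), dim=1)
    else:
        joint = torch.cat((batch_actions[:, :2], my_next), dim=1)
    print(joint)  # tensor([[0.9000, 0.8000, 0.3000, 0.4000]])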
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model',
                                       str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0

            state = running_state(state)

            for _ in range(10000):
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes,
                                                       score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()

        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory,
                                                    discrim_optim,
                                                    demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" %
                  (expert_acc * 100, learner_acc * 100))
            if (expert_acc > args.suspend_accu_exp
                    and learner_acc > args.suspend_accu_gen):
                train_discrim_flag = False

        train_actor_critic(actor, critic, memory, actor_optim, critic_optim,
                           args)

        # original condition was `if iter % 100:`, which saves on every
        # iteration *except* multiples of 100; save on multiples instead
        if iter % 100 == 0:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path,
                                     'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'z_filter_n': running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
def test(n_episodes=100000, max_t=1000, print_every=10, eps=0, temperature=1,
         actor_state_dict=None):
    log_str = "Start Testing !!!"
    epf.write(log_str)
    epf.flush()
    print(log_str)

    Test_LR_ACTOR = 1e-3
    test_actor = Actor(dim_size=dim_size, resource_size=2, n_action_steps=2,
                       action_size=12, h_size=128,
                       num_steps=opt.num_steps).to(device)
    test_actor.load_state_dict(actor_state_dict)
    test_actor_optimizer = optim.Adam(test_actor.parameters(),
                                      lr=Test_LR_ACTOR)
    test_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        test_actor_optimizer, factor=0.9, min_lr=1e-6)
    test_agent = Agent(test_actor, h_size=128, device=device)
    test_agent.set_fitness(opt.fitness)
    test_agent.reset()

    best_score = float("-Inf")
    best_reward_whole = float("-Inf")
    best_constraint = 0.
    best_episode = 0
    scores_window = deque(maxlen=print_every)
    scores = []
    episodes = 0
    has_succeed_history = False

    model = test_model
    m_file = os.path.join(m_file_path, model + ".csv")
    df = pd.read_csv(m_file)
    model_defs = df.to_numpy()

    env = MaestroEnvironment(model_defs=model_defs, dim_size=dim_size,
                             resource_size=2, n_action_steps=2,
                             dataflow=opt.df)
    state = env.reset()
    env.set_fitness(opt.fitness)
    env.set_constraint(opt.cstr)
    constraint_temp = [
        env.get_ref_constraint([action_bound[0], action_bound[1]]),
        env.get_ref_constraint([action_bottom[0], action_bottom[1]]),
        env.get_ref_constraint([action_bound[0], action_bottom[1]]),
        env.get_ref_constraint([action_bottom[0], action_bound[1]])
    ]
    max_constraint, min_constraint = max(constraint_temp), min(constraint_temp)
    # epf.write("Max constraint: {}\n".format(max_constraint))
    # epf.flush()
    # epf.write("Min constraint: {}\n".format(min_constraint))
    # epf.flush()
    set_constraint = min_constraint + (max_constraint - min_constraint) * ratio
    env.set_constraint_value(max_constraint, min_constraint, set_constraint)
    # epf.write("Set constraint: {}\n".format(set_constraint))
    # epf.flush()

    params_list = [get_params(test_actor, device=device)]
    temp_params = params_list[-1]
    # print(state, infos, eps, temperature)

    for i_adapt in range(n_episodes):
        score = 0
        # env.shuffle_model()
        state, infos = env.reset()
        for _ in range(max_t):
            action, log_prob = test_agent.act(state, infos, eps, temperature,
                                              temp_params)
            # print(log_prob)
            # pdb.set_trace()
            next_state, reward, done, infos, sig, impt = env.step(action)
            test_agent.step(state, action, log_prob, reward, next_state, done,
                            sig, impt, infos)
            state = next_state
            score += reward
            if done:
                break

        # inner-gradient
        # learned_loss = compute_loss(test_agent, GAMMA, impt, infos)
        learned_loss = compute_learned_loss(test_agent, temp_params)
        inner_gradient = torch.autograd.grad(
            learned_loss,
            [v for _, v in params_list[i_adapt].items()],
            create_graph=not first_order,
            retain_graph=not first_order,
            allow_unused=True,
        )
        params_list.append(
            SGD_step(params_list[i_adapt], inner_gradient, INNER_LR,
                     testing=True))
        temp_params = params_list[-1]

        # for i_episode in range(0, n_episodes):
        #     if (i_episode + 1) % 100 == 0 and has_succeed_history:
        #         eps /= 1.2
        #         temperature /= 1.01
        #         temperature = max(temperature, 1)
        #         ajust_lr(ratio=0.8, actor_optimizer=test_actor_optimizer,
        #                  min_lr=1e-6)
        #     # print(state, infos, eps, temperature)
        #     temp_params = get_params(test_actor, device=device)
        #     score = 0
        #     # env.shuffle_model()
        #     state, infos = env.reset()
        #     for t in range(max_t):
        #         action, log_prob = test_agent.act(state, infos, eps,
        #                                           temperature, temp_params)
        #         # print(log_prob)
        #         # pdb.set_trace()
        #         next_state, reward, done, infos, sig, impt = env.step(action)
        #         test_agent.step(state, action, log_prob, reward, next_state,
        #                         done, sig, impt, infos)
        #         state = next_state
        #         score += reward
        #         if done:
        #             break
        #     if done and sig:
        #         loss = compute_loss(test_agent, GAMMA, impt, infos)
        #         test_actor_optimizer.zero_grad()
        #         # policy_loss.backward()
        #         meta_gradient = torch.autograd.grad(
        #             loss,
        #             [v for _, v in temp_params.items()],
        #             allow_unused=True,
        #         )
        #         transfer_gradient_to_shared(meta_gradient, test_actor, device)
        #         torch.nn.utils.clip_grad_norm_(test_actor.parameters(),
        #                                        CLIPPING_MODEL)
        #         torch.nn.utils.clip_grad_norm_(test_actor.lstm.parameters(),
        #                                        CLIPPING_LSTM)
        #         test_actor_optimizer.step()
        #     test_agent.reset()

        scores_window.append(score)
        scores.append(score)
        if np.mean(scores_window) > best_score:
            best_score = np.mean(scores_window)

        env_chkpt = env.get_chkpt()
        if infos["succeed"]:
            has_succeed_history = True
        if env_chkpt["best_sol"] is not None:
            best_sol = env_chkpt["best_sol"]["sol"]
            set_constraint = env_chkpt["ctrs_info"]["value"]
            max_constraint = env_chkpt["ctrs_info"]["max"]
            min_constraint = env_chkpt["ctrs_info"]["min"]
            episode_reward, episode_constraint = env.exterior_search(best_sol)
        else:
            episode_reward = float("-Inf")
            episode_constraint = 0.
        if episode_reward > best_reward_whole:
            best_reward_whole = episode_reward
            best_constraint = episode_constraint
            best_episode = i_adapt

    # for f in glob.glob("*.m"):
    #     os.remove(f)
    # for f in glob.glob("*.csv"):
    #     os.remove(f)

    print(best_episode)
    return best_reward_whole, best_constraint / env.constraint_value * 100
print('state size:', num_inputs)
print('action size:', num_actions)

writer = SummaryWriter(args.logdir)

actor = Actor(num_inputs, num_actions)
critic = Critic(num_inputs)

running_state = ZFilter((num_inputs, ), clip=5)

if args.load_model is not None:
    saved_ckpt_path = os.path.join(os.getcwd(), 'save_model',
                                   str(args.load_model))
    ckpt = torch.load(saved_ckpt_path)

    actor.load_state_dict(ckpt['actor'])
    critic.load_state_dict(ckpt['critic'])

    running_state.rs.n = ckpt['z_filter_n']
    running_state.rs.mean = ckpt['z_filter_m']
    running_state.rs.sum_square = ckpt['z_filter_s']

    print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

actor_optim = optim.Adam(actor.parameters(), lr=hp.actor_lr)
critic_optim = optim.Adam(critic.parameters(), lr=hp.critic_lr,
                          weight_decay=hp.l2_rate)

episodes = 0
for iter in range(15000):
class Agent(): def __init__(self, state_size, action_size, num_agents, device, gamma=GAMMA, tau=TAU, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC, random_seed=0): """ Initialize an Agent object. :param state_size: size of state :param action_size: size of action :param num_agents: number of agents :param gamma: discount factor :param tau: factor for soft update of target parameters :param lr_actor: Learning rate of actor :param lr_critic: Learning rate of critic :param random_seed: Random seed :param device: cuda or cpu """ self.device = device self.gamma = gamma self.tau = tau self.num_agents = num_agents self.state_size = state_size self.action_size = action_size self.full_state_size = state_size * num_agents self.full_action_size = action_size * num_agents self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, device, random_seed).to(device) self.actor_target = Actor(state_size, action_size, device, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(self.full_state_size, self.full_action_size, device=device, random_seed=random_seed).to(device) self.critic_target = Critic(self.full_state_size, self.full_action_size, device=device, random_seed=random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=0) self.noise = OUNoise(action_size, random_seed) def save_model(self, agent_number): torch.save(self.actor_local.state_dict(), f'models/checkpoint_actor_{agent_number}.pth') torch.save(self.critic_local.state_dict(), f'models/checkpoint_critic_{agent_number}.pth') def load_model(self, agent_number): checkpoint = torch.load(f'models/checkpoint_actor_{agent_number}.pth', map_location=torch.device('cpu')) self.actor_local.load_state_dict(checkpoint) checkpoint = torch.load(f'models/checkpoint_critic_{agent_number}.pth', map_location=torch.device('cpu')) self.critic_local.load_state_dict(checkpoint) def act(self, state, noise=0., train=False): """Returns actions for given state as per current policy. 
:param state: state as seen from single agent
        """
        if train:
            self.actor_local.train()
        else:
            self.actor_local.eval()
        action = self.actor_local(state)
        if noise > 0:
            noise = torch.tensor(noise * self.noise.sample(), dtype=state.dtype, device=state.device)
        return action + noise

    def target_act(self, state, noise=0.):
        self.actor_target.eval()
        # the action is moved to cpu(), so the noise must be created on cpu() too,
        # not on state.device (which may be cuda)
        action = self.actor_target(state).cpu()
        if noise > 0.:
            noise = torch.tensor(noise * self.noise.sample(), dtype=action.dtype, device=action.device)
        return action + noise

    def update_critic(self, rewards, dones, all_states, all_actions, all_next_states, all_next_actions):
        with torch.no_grad():
            Q_targets_next = self.critic_target(all_next_states, all_next_actions)
        # Compute Q targets for current states (y_i)
        q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        q_expected = self.critic_local(all_states, all_actions)
        # critic_loss = F.mse_loss(q_expected, q_targets)
        critic_loss = ((q_expected - q_targets.detach()) ** 2).mean()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

    def update_actor(self, all_states, all_predicted_actions):
        """Update actor network

        :param all_states: all states
        :param all_predicted_actions: all predicted actions
        """
        actor_loss = -self.critic_local(all_states, all_predicted_actions).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor_optimizer.step()

    def update_targets(self):
        self.soft_update(self.actor_local, self.actor_target, self.tau)
        self.soft_update(self.critic_local, self.critic_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def reset(self):
        self.noise.reset()
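# For the centralized critic above, each agent's learn step flattens the joint
# observations and actions so they match full_state_size / full_action_size from
# __init__. A small illustrative helper (names are not from the original file):
import torch

def to_full(states, actions):
    """(batch, num_agents, state_size), (batch, num_agents, action_size) ->
    (batch, num_agents * state_size), (batch, num_agents * action_size)."""
    batch = states.shape[0]
    return states.reshape(batch, -1), actions.reshape(batch, -1)

# usage: q = agent.critic_local(*to_full(states, actions))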
class Agent: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, buffer_size=int(1e5), batch_size=256, learn_every=1, update_every=1, gamma=0.99, tau=0.02, lr_actor=2e-4, lr_critic=2e-3, random_seed=None, use_asn=True, asn_kwargs={}, use_psn=False, psn_kwargs={}, use_per=False, restore=None): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.update_every = update_every self.learn_every = learn_every self.batch_size = batch_size self.gamma = gamma self.tau = tau # Keep track of how many times we've updated weights self.i_updates = 0 self.i_step = 0 self.use_asn = use_asn self.use_psn = use_psn self.use_per = use_per if random_seed is not None: random.seed(random_seed) self.actor_local = Actor(state_size, action_size).to(device) self.actor_target = Actor(state_size, action_size).to(device) if self.use_psn: self.actor_perturbed = Actor(state_size, action_size).to(device) self.critic_local = Critic(state_size, action_size).to(device) self.critic_target = Critic(state_size, action_size).to(device) # restore networks if needed if restore is not None: checkpoint = torch.load(restore, map_location=device) self.actor_local.load_state_dict(checkpoint[0]['actor']) self.actor_target.load_state_dict(checkpoint[0]['actor']) if self.use_psn: self.actor_perturbed.load_state_dict(checkpoint[0]['actor']) self.critic_local.load_state_dict(checkpoint[0]['critic']) self.critic_target.load_state_dict(checkpoint[0]['critic']) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic) # Hard copy weights from local to target networks policy_update(self.actor_local, self.actor_target, 1.0) policy_update(self.critic_local, self.critic_target, 1.0) # Noise process if self.use_asn: self.action_noise = OUNoise(action_size, **asn_kwargs) if self.use_psn: self.param_noise = ParameterSpaceNoise(**psn_kwargs) if self.use_per: self.buffer = PrioritizedExperienceReplay(buffer_size, batch_size, random_seed) else: self.buffer = ExperienceReplay(buffer_size, batch_size, random_seed) def act(self, states, perturb_mode=True, train_mode=True): """Returns actions for given state as per current policy.""" if not train_mode: self.actor_local.eval() if self.use_psn: self.actor_perturbed.eval() with torch.no_grad(): states = torch.from_numpy(states).float().to(device) actor = self.actor_perturbed if ( self.use_psn and perturb_mode) else self.actor_local actions = actor(states).cpu().numpy()[0] if train_mode: actions += self.action_noise.sample() self.actor_local.train() if self.use_psn: self.actor_perturbed.train() return np.clip(actions, -1, 1) def perturb_actor_parameters(self): """Apply parameter space noise to actor model, for exploration""" policy_update(self.actor_local, self.actor_perturbed, 1.0) params = self.actor_perturbed.state_dict() for name in params: if 'ln' in name: pass param = params[name] random = torch.randn(param.shape) if use_cuda: random = random.cuda() param += random * self.param_noise.current_stddev def reset(self): self.action_noise.reset() if self.use_psn: self.perturb_actor_parameters() def step(self, experience, priority=0.0): self.buffer.push(experience) self.i_step += 1 if len(self.buffer) > self.batch_size: if self.i_step % self.learn_every == 0: self.learn(priority) if self.i_step 
% self.update_every == 0: self.update( ) # soft update the target network towards the actual networks def learn(self, priority=0.0): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ if self.use_per: (states, actions, rewards, states_next, dones), batch_idx = self.buffer.sample(priority) else: states, actions, rewards, states_next, dones = self.buffer.sample() # Get predicted next-state actions and Q values from target models with torch.no_grad(): actions_next = self.actor_target(states_next) Q_targets_next = self.critic_target(states_next, actions_next) Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # ---------------------------- update critic ---------------------------- # # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.smooth_l1_loss(Q_expected, Q_targets) # Minimize the loss self.critic_local.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_local.zero_grad() actor_loss.backward() self.actor_optimizer.step() if self.use_per: Q_error = Q_expected - Q_targets new_deltas = torch.abs(Q_error.detach().squeeze(1)).numpy() self.buffer.update_deltas(batch_idx, new_deltas) def update(self): """soft update targets""" self.i_updates += 1 policy_update(self.actor_local, self.actor_target, self.tau) policy_update(self.critic_local, self.critic_target, self.tau) def save_model(self, model_dir, session_name, i_episode, best): filename = os.path.join( model_dir, f'ddpg_{session_name}-EP_{i_episode}-score_{best:.3f}.pt') filename_best = os.path.join(model_dir, f'ddpg_{session_name}-best.pt') save_dict_list = [] save_dict = { 'actor': self.actor_local.state_dict(), 'actor_optim_params': self.actor_optimizer.state_dict(), 'critic': self.critic_local.state_dict(), 'critic_optim_params': self.critic_optimizer.state_dict() } save_dict_list.append(save_dict) torch.save(save_dict_list, filename) copyfile(filename, filename_best) def postprocess(self, t_step): if self.use_psn and t_step > 0: perturbed_states, perturbed_actions, _, _, _ = self.buffer.tail( t_step) unperturbed_actions = self.act(np.array(perturbed_states), False, False) diff = np.array(perturbed_actions) - unperturbed_actions mean_diff = np.mean(np.square(diff), axis=0) dist = sqrt(np.mean(mean_diff)) self.param_noise.adapt(dist)
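# Two small hazards in the class above: in perturb_actor_parameters, `if 'ln' in
# name: pass` does not actually skip layer-norm entries (it would need
# `continue`), and `Q_error.detach().squeeze(1).numpy()` needs a .cpu() first
# when training on GPU. PrioritizedExperienceReplay itself is not shown in this
# file; a hedged sketch of the proportional-priority bookkeeping that
# update_deltas() implies, with the alpha/eps values assumed:
import numpy as np

class ProportionalPriorities:
    def __init__(self, capacity, alpha=0.6, eps=1e-5):
        self.alpha, self.eps = alpha, eps
        self.deltas = np.zeros(capacity)
        self.size = 0

    def add(self, idx, delta):
        self.deltas[idx] = abs(delta) + self.eps
        self.size = max(self.size, idx + 1)

    def update_deltas(self, batch_idx, new_deltas):
        self.deltas[batch_idx] = np.abs(new_deltas) + self.eps

    def sample_indices(self, batch_size):
        p = self.deltas[:self.size] ** self.alpha
        p /= p.sum()
        return np.random.choice(self.size, batch_size, p=p)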
class DDPG(object): def __init__(self, nb_states, nb_actions, args): if args.seed > 0: self.seed(args.seed) self.nb_states = nb_states self.nb_actions = nb_actions # Create Actor and Critic Network net_cfg = { "hidden1": args.hidden1, "hidden2": args.hidden2, "init_w": args.init_w, } self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg) self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=args.prate) self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) hard_update( self.actor_target, self.actor ) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) # Create replay buffer self.memory = SequentialMemory( limit=args.rmsize, window_length=args.window_length ) self.random_process = OrnsteinUhlenbeckProcess( size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma ) # Hyper-parameters self.batch_size = args.bsize self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.is_training = True # if USE_CUDA: self.cuda() def update_policy(self): # Sample batch ( state_batch, action_batch, reward_batch, next_state_batch, terminal_batch, ) = self.memory.sample_and_split(self.batch_size) # Prepare for the target q batch next_q_values = self.critic_target( [ to_tensor(next_state_batch, volatile=True), self.actor_target(to_tensor(next_state_batch, volatile=True)), ] ) # next_q_values.volatile = False target_q_batch = ( to_tensor(reward_batch) + self.discount * to_tensor(terminal_batch.astype(np.float)) * next_q_values ) # Critic update self.critic.zero_grad() q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)]) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() # Actor update self.actor.zero_grad() policy_loss = -self.critic( [to_tensor(state_batch), self.actor(to_tensor(state_batch))] ) policy_loss = policy_loss.mean() policy_loss.backward() self.actor_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def cuda(self): self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): if self.is_training: self.memory.append(self.s_t, self.a_t, r_t, done) self.s_t = s_t1 def random_action(self): action = np.random.uniform(-1.0, 1.0, self.nb_actions) self.a_t = action return action def select_action(self, s_t, decay_epsilon=True): action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0) action += self.is_training * max(self.epsilon, 0) * self.random_process.sample() action = np.clip(action, -1.0, 1.0) if decay_epsilon: self.epsilon -= self.depsilon self.a_t = action return action def reset(self, obs): self.s_t = obs self.random_process.reset_states() def load_weights(self, output): if output is None: return self.actor.load_state_dict(torch.load("{}/actor.pkl".format(output))) self.critic.load_state_dict(torch.load("{}/critic.pkl".format(output))) def save_model(self, output): torch.save(self.actor.state_dict(), "{}/actor.pkl".format(output)) torch.save(self.critic.state_dict(), 
"{}/critic.pkl".format(output)) def seed(self, s): torch.manual_seed(s) if USE_CUDA: torch.cuda.manual_seed(s)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_agents, seed, fc1=400, fc2=300, update_times=10): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.num_agents = num_agents self.update_times = update_times self.noise = [] for i in range(num_agents): self.noise.append( rm.OrnsteinUhlenbeckProcess(size=(action_size, ), std=LinearSchedule(0.2))) # critic local and target network (Q-Learning) self.critic_local = Critic(state_size, action_size, fc1, fc2, seed).to(device) self.critic_target = Critic(state_size, action_size, fc1, fc2, seed).to(device) self.critic_target.load_state_dict(self.critic_local.state_dict()) # actor local and target network (Policy gradient) self.actor_local = Actor(state_size, action_size, fc1, fc2, seed).to(device) self.actor_target = Actor(state_size, action_size, fc1, fc2, seed).to(device) self.actor_target.load_state_dict(self.actor_local.state_dict()) # optimizer for critic and actor network self.optimizer_critic = optim.Adam(self.critic_local.parameters(), lr=CRITIC_LR) self.optimizer_actor = optim.Adam(self.actor_local.parameters(), lr=ACTOR_LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 self.a_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory for i in range(self.num_agents): self.memory.add(state[i], action[i], reward[i], next_state[i], done[i]) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: for i in range(self.update_times): experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, training=True): """Returns continous actions values for all action for given state as per current policy. Params ====== state (array_like): current state """ state = torch.from_numpy(state).float().detach().to(device) #print(state.shape,"act") self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(state) self.actor_local.train() noise = [] for i in range(self.num_agents): noise.append(self.noise[i].sample()) return np.clip(actions.cpu().data.numpy() + np.array(noise), -1, 1) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences next_actions = self.actor_target(next_states) with torch.no_grad(): Q_target_next = self.critic_target(next_states, next_actions) Q_targets = rewards + (gamma * Q_target_next * (1 - dones)) Q_expected = self.critic_local(states, actions) #critic loss loss = F.mse_loss(Q_expected, Q_targets.detach()) self.optimizer_critic.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.optimizer_critic.step() #actor loss action_pr = self.actor_local(states) p_loss = -self.critic_local(states, action_pr).mean() self.optimizer_actor.zero_grad() p_loss.backward() self.optimizer_actor.step() # ------------------- update target network ------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def reset_random(self): for i in range(self.num_agents): self.noise[i].reset_states()
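# rm.OrnsteinUhlenbeckProcess and LinearSchedule are imported elsewhere; from the
# call sites above (std=LinearSchedule(0.2), sample(), reset_states()) they look
# like the DeepRL-style random process sketched below. A reconstruction under
# that assumption, not the imported code itself.
import numpy as np

class LinearSchedule:
    def __init__(self, start, end=None, steps=None):
        self.current = start
        self.end = start if end is None else end
        self.inc = 0.0 if steps is None else (self.end - start) / float(steps)

    def __call__(self):
        val = self.current
        nxt = self.current + self.inc
        self.current = min(nxt, self.end) if self.inc >= 0 else max(nxt, self.end)
        return val

class OrnsteinUhlenbeckProcess:
    def __init__(self, size, std, theta=0.15, dt=1e-2, x0=None):
        self.size, self.std = size, std
        self.theta, self.dt, self.x0 = theta, dt, x0
        self.reset_states()

    def sample(self):
        # dx = theta * (mu - x) * dt + std * sqrt(dt) * N(0, 1), with mu = 0
        x = (self.x_prev
             + self.theta * (0.0 - self.x_prev) * self.dt
             + self.std() * np.sqrt(self.dt) * np.random.randn(*self.size))
        self.x_prev = x
        return x

    def reset_states(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros(self.size)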
class Agent(): def __init__(self, test=False): # device if torch.cuda.is_available(): self.device = torch.device('cuda') else: self.device = torch.device('cpu') ######################################### """ Some hand tune config(for developing) """ self.discrete = False self.action_dim = 1 self.state_dim = 3 self.batch_size = 100 self.action_low = -2 self.action_high = 2 ########################################## self.P_online = Actor(state_dim=self.state_dim, action_size=self.action_dim).to(self.device) self.P_target = Actor(state_dim=self.state_dim, action_size=self.action_dim).to(self.device) self.P_target.load_state_dict(self.P_online.state_dict()) self.Q_online = Critic(state_size=self.state_dim, action_size=self.action_dim).to(self.device) self.Q_target = Critic(state_size=self.state_dim, action_size=self.action_dim).to(self.device) self.Q_target.load_state_dict(self.Q_online.state_dict()) # discounted reward self.gamma = 0.99 self.eps = 0.25 # optimizer self.q_optimizer = torch.optim.Adam(self.Q_online.parameters(), lr=1e-3) self.p_optimizer = torch.optim.Adam(self.P_online.parameters(), lr=1e-3) # saved rewards and actions self.replay_buffer = ReplayBuffer() # noise self.noise = Noise(DELTA, SIGMA, OU_A, OU_MU) # Initialize noise self.ou_level = 0. self.ep_step = 0 def act(self, state, test=False): if not test: with torch.no_grad(): # boring type casting state = ((torch.from_numpy(state)).unsqueeze(0)).float().to( self.device) action = self.P_online(state) # continuous output a = action.data.cpu().numpy() # if self.ep_step < 200: # self.ou_level = self.noise.ornstein_uhlenbeck_level(self.ou_level) # a = a + self.ou_level if self.discrete: action = np.argmax(a) return a, action else: if self.ep_step < 200: self.ou_level = self.noise.ornstein_uhlenbeck_level( self.ou_level) action = np.clip(a + self.ou_level, self.action_low, self.action_high) return action, action def collect_data(self, state, action, reward, next_state, done): self.replay_buffer.push( torch.from_numpy(state).float().unsqueeze(0), torch.from_numpy(action).float(), torch.tensor([reward]).float().unsqueeze(0), torch.from_numpy(next_state).float().unsqueeze(0), torch.tensor([done]).float().unsqueeze(0)) def clear_data(self): raise NotImplementedError("Circular Queue don't need this function") def update(self): if len(self.replay_buffer) < self.batch_size: return states, actions, rewards, next_states, dones = self.replay_buffer.sample( batch_size=self.batch_size, device=self.device) # discounted rewards # rewards = torch.from_numpy(discount((rewards.view(rewards.shape[0])).cpu().numpy())).float().to(self.device) ### debug shape : ok #===============================Critic Update=============================== self.Q_online.train() Q = self.Q_online((states, actions)) with torch.no_grad(): # don't need backprop for target value self.Q_target.eval() self.P_target.eval() target = rewards + self.gamma * (1 - dones) * self.Q_target( (next_states, self.P_target(next_states))) critic_loss_fn = torch.nn.MSELoss() critic_loss = critic_loss_fn(Q, target).mean() # update self.q_optimizer.zero_grad() critic_loss.backward() self.q_optimizer.step() # print("critic loss", critic_loss.item()) #===============================Actor Update=============================== # fix online_critic , update online_actor self.Q_online.eval() for p in self.Q_online.parameters(): p.requires_grad = False for p in self.P_online.parameters(): p.requires_grad = True policy_loss = -self.Q_online((states, self.P_online(states))) policy_loss = 
policy_loss.mean()
        self.p_optimizer.zero_grad()
        policy_loss.backward()
        self.p_optimizer.step()
        # print("policy loss", policy_loss.item())
        for p in self.Q_online.parameters():
            p.requires_grad = True
        #=============================== Target Update ===============================
        soft_update(self.Q_target, self.Q_online, tau=1e-3)
        soft_update(self.P_target, self.P_online, tau=1e-3)
        self.eps = max(self.eps - EPSILON_DECAY, 0.0)  # decay, clamped at zero
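# The actor update above manually flips requires_grad on the online critic and
# restores it afterwards; a small context manager makes the pattern harder to
# get wrong. This is a suggested refactor, not code from the original file.
from contextlib import contextmanager

@contextmanager
def frozen(module):
    """Temporarily disable gradients for all parameters of `module`."""
    for p in module.parameters():
        p.requires_grad = False
    try:
        yield module
    finally:
        for p in module.parameters():
            p.requires_grad = True

# usage inside update():
#     with frozen(self.Q_online):
#         policy_loss = -self.Q_online((states, self.P_online(states))).mean()
#         self.p_optimizer.zero_grad()
#         policy_loss.backward()
#         self.p_optimizer.step()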
class DDPG(object): def __init__(self, nb_status, nb_actions, args, writer): self.clip_actor_grad = args.clip_actor_grad self.nb_status = nb_status * args.window_length self.nb_actions = nb_actions self.discrete = args.discrete self.pic = args.pic self.writer = writer self.select_time = 0 if self.pic: self.nb_status = args.pic_status # Create Actor and Critic Network net_cfg = { 'hidden1':args.hidden1, 'hidden2':args.hidden2, 'use_bn':args.bn, 'init_method':args.init_method } if args.pic: self.cnn = CNN(1, args.pic_status) self.cnn_target = CNN(1, args.pic_status) self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate) self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=args.prate) self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) if args.pic: hard_update(self.cnn_target, self.cnn) #Create replay buffer self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length) self.random_process = Myrandom(size=nb_actions) # Hyper-parameters self.batch_size = args.batch_size self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.use_cuda = args.cuda # if self.use_cuda: self.cuda() def normalize(self, pic): pic = pic.swapaxes(0, 2).swapaxes(1, 2) return pic def update_policy(self): # Sample batch state_batch, action_batch, reward_batch, \ next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size) # Prepare for the target q batch if self.pic: state_batch = np.array([self.normalize(x) for x in state_batch]) state_batch = to_tensor(state_batch, volatile=True) state_batch = self.cnn(state_batch) next_state_batch = np.array([self.normalize(x) for x in next_state_batch]) next_state_batch = to_tensor(next_state_batch, volatile=True) next_state_batch = self.cnn_target(next_state_batch) next_q_values = self.critic_target([ next_state_batch, self.actor_target(next_state_batch) ]) else: next_q_values = self.critic_target([ to_tensor(next_state_batch, volatile=True), self.actor_target(to_tensor(next_state_batch, volatile=True)), ]) # print('batch of picture is ok') next_q_values.volatile = False target_q_batch = to_tensor(reward_batch) + \ self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values # Critic update self.critic.zero_grad() if self.pic: self.cnn.zero_grad() if self.pic: state_batch.volatile = False q_batch = self.critic([state_batch, to_tensor(action_batch)]) else: q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)]) # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float)) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() if self.pic: self.cnn_optim.step() self.actor.zero_grad() if self.pic: self.cnn.zero_grad() if self.pic: state_batch.volatile = False policy_loss = -self.critic([ state_batch, self.actor(state_batch) ]) else: policy_loss = -self.critic([ to_tensor(state_batch), self.actor(to_tensor(state_batch)) ]) policy_loss = policy_loss.mean() policy_loss.backward() if 
self.clip_actor_grad is not None: torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad)) if self.writer != None: mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()])) #print(mean_policy_grad) self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time) self.actor_optim.step() if self.pic: self.cnn_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) if self.pic: soft_update(self.cnn_target, self.cnn, self.tau) return -policy_loss, value_loss def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() if(self.pic): self.cnn.eval() self.cnn_target.eval() def train(self): self.actor.train() self.actor_target.train() self.critic.train() self.critic_target.train() if(self.pic): self.cnn.train() self.cnn_target.train() def cuda(self): self.cnn.cuda() self.cnn_target.cuda() self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): self.memory.append([self.s_t, self.a_t, r_t, s_t1, done]) self.s_t = s_t1 def random_action(self, fix=False): action = np.random.uniform(-1.,1.,self.nb_actions) self.a_t = action if self.discrete and fix == False: action = action.argmax() # if self.pic: # action = np.concatenate((softmax(action[:16]), softmax(action[16:]))) return action def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0): self.eval() if self.pic: s_t = self.normalize(s_t) s_t = self.cnn(to_tensor(np.array([s_t]))) if self.pic: action = to_numpy( self.actor_target(s_t) ).squeeze(0) else: action = to_numpy( self.actor(to_tensor(np.array([s_t]))) ).squeeze(0) self.train() noise_level = noise_level * max(self.epsilon, 0) if np.random.uniform(0, 1) < noise_level: action = self.random_action(fix=True) # episilon greedy if decay_epsilon: self.epsilon -= self.depsilon self.a_t = action if return_fix: return action if self.discrete: return action.argmax() else: return action def reset(self, obs): self.s_t = obs self.random_process.reset_status() def load_weights(self, output, num=1): if output is None: return self.actor.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num)) ) self.actor_target.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num)) ) self.critic.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) self.critic_target.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) def save_model(self, output, num): if self.use_cuda: self.cnn.cpu() self.actor.cpu() self.critic.cpu() torch.save( self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num) ) torch.save( self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num) ) if self.use_cuda: self.cnn.cuda() self.actor.cuda() self.critic.cuda()
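# The volatile flag used throughout the class above was removed in PyTorch 0.4;
# the modern equivalent is to build target quantities under torch.no_grad().
# A hedged sketch of to_tensor / to_numpy with signatures inferred from the call
# sites, not taken from the original utils module:
import numpy as np
import torch

def to_tensor(ndarray, volatile=False, device='cpu', dtype=torch.float32):
    t = torch.as_tensor(np.asarray(ndarray), dtype=dtype, device=device)
    # detach() stands in for the old volatile=True; for whole target-value
    # computations, prefer wrapping the block in `with torch.no_grad():`
    return t.detach() if volatile else t

def to_numpy(tensor):
    return tensor.detach().cpu().numpy()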
class ddpg_Agent(): """Interacts with and learns from the environment.""" def __init__(self, env, config): """Initialize an Agent object. Params ====== env : environment to be handled config : configuration given a variety of parameters """ self.env = env self.config = config # set parameter for ML self.set_parameters(config) # Q-Network self.create_networks() # Noise process self.noise = OUNoise(self.action_size, self.seed) # Replay memory self.memory = ReplayBuffer(self.action_size, self.buffer_size, self.batch_size, self.seed) def set_parameters(self, config): # Base agent parameters self.gamma = config['gamma'] # discount factor self.tau = config['tau'] self.max_episodes = config['max_episodes'] # max numbers of episdoes to train self.env_file_name = config['env_file_name'] # name and path for env app self.brain_name = config['brain_name'] # name for env brain used in step self.num_agents = config['num_agents'] self.state_size = config['state_size'] self.action_size = config['action_size'] self.hidden_size = config['hidden_size'] self.buffer_size = config['buffer_size'] self.batch_size = config['batch_size'] self.dropout = config['dropout'] self.critic_learning_rate = config['critic_learning_rate'] self.actor_learning_rate = config['actor_learning_rate'] self.seed = (config['seed']) self.noise_scale = 1 self.noise_sigma = 0.1 # Some debug flags self.DoDebugEpisodeLists = False def create_networks(self): # Actor Network (local & Target Network) self.actor_local = Actor(self.state_size, self.hidden_size, self.action_size, self.seed).to(device) self.actor_target = Actor(self.state_size, self.hidden_size, self.action_size, self.seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.actor_learning_rate) # Critic Network (local & Target Network) self.critic_local = Critic(self.state_size, self.hidden_size, self.action_size, self.seed, self.dropout).to(device) self.critic_target = Critic(self.state_size, self.hidden_size, self.action_size, self.seed, self.dropout).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.critic_learning_rate) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward # print('step : Next States : ',next_state.shape) self.memory.add(state, action, reward, next_state, done) # print('New step added to memory, length : ',len(self.memory)) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, self.gamma) def update_noise_scale(self, cur_reward, scale_min = 0.2, scale_noise=False): """ If scale_noise == True the self.noise_scale will be decreased in relation to rewards Currently hand coded as rewards go up noise_scale will go down from 1 to scale_min""" if scale_noise: rewlow = 2 # below rewlow noise_scale is 1 from there on it increases linearly down to scale_min + 0.5*(1 - scale_min) until rewhigh is reached rewhigh = 10 # above rewhigh noise_scale falls linearly down to scale_min until rewrd = 30 is reached. 
Beyond 30 it stays at scale_min.
        if cur_reward > rewlow:
            if cur_reward < rewhigh:
                self.noise_scale = (1 - scale_min) * (0.5 * (rewhigh - cur_reward) / (rewhigh - rewlow) + 0.5) + scale_min
            else:
                # clamp at scale_min from reward 30 onwards; max(..., 0) is the intended
                # element clamp (np.min(x, 0) would treat 0 as an axis argument)
                self.noise_scale = (1 - scale_min) * max(0.5 * (30 - cur_reward) / (30 - rewhigh), 0.0) + scale_min
            print('Updated noise scale to : ', self.noise_scale)
        return

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = ten(state)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise_scale * self.noise.sample()  # ToDo check if tanh works better
        return np.clip(action, -1, 1)

    def train(self):
        if False:
            filename = 'trained_reacher_a_e100.pth'
            self.load_agent(filename)
        all_rewards = []
        reward_window = deque(maxlen=100)
        print('Running on device : ', device)
        for i_episode in range(self.max_episodes):
            tic = time.time()
            # Reset the environment
            env_info = self.env.reset(train_mode=True)[self.brain_name]
            state = env_info.vector_observations
            total_reward = np.zeros(self.num_agents)
            t = 0
            done = np.zeros(self.num_agents, dtype=bool)
            # loop over episode time steps
            while not np.any(done):  # t < self.tmax:
                # act and collect data
                action = self.act(state)
                env_info = self.env.step(action)[self.brain_name]
                next_state = env_info.vector_observations
                reward = np.asarray(env_info.rewards)
                done = np.asarray(env_info.local_done)
                # np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
                # print('Episode {} step {} taken action {} reward {} and done is {}'.format(i_episode,t,action,reward,done))
                # increment stuff
                t += 1
                total_reward += reward
                # Proceed agent step
                self.step(state, action, reward, next_state, done)
                # prepare for next round
                state = next_state
            # while not done
            # keep track of rewards:
            all_rewards.append(np.mean(total_reward))
            reward_window.append(np.mean(total_reward))
            # Output Episode info :
            toc = time.time()
            if i_episode == 100:
                self.stable_update()
            self.update_noise_scale(np.mean(reward_window))
            if not (i_episode % 25 == 0):
                print('Episode {} || Total Reward : {:6.3f} || average reward : {:6.3f} || Used {:5.3f} seconds, mem : {}'.format(i_episode, np.mean(total_reward), np.mean(reward_window), toc - tic, len(self.memory)))
            else:
                print(Back.RED + 'Episode {} || Total Reward : {:6.3f} || average reward : {:6.3f}'.format(i_episode, np.mean(total_reward), np.mean(reward_window)))
                print(Style.RESET_ALL)
            if i_episode % 50 == 0:
                self.save_agent(i_episode)
        # for i_episode
        return all_rewards

    def reset(self):
        self.noise.reset()

    def stable_update(self):
        """Update hyperparameters which proved more stable."""
        self.buffer_size = 400000
        self.memory.enlarge(self.buffer_size)
        self.noise_sigma = 0.05
        self.noise.sigma = 0.05

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples self.gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models # print('learn : Next States : ',next_states.shape) actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # print('learn : Actions : ',actions_next.shape) # print('learn : Q_target_next : ',Q_targets_next.shape) # Compute Q targets for current states (y_i) Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) def save_agent(self,i_episode): filename = 'trained_reacher_e'+str(i_episode)+'.pth' torch.save({ 'critic_local': self.critic_local.state_dict(), 'critic_target': self.critic_target.state_dict(), 'actor_local': self.actor_local.state_dict(), 'actor_target': self.actor_target.state_dict(), }, filename) print('Saved Networks in ',filename) return def load_agent(self,filename): savedata = torch.load(filename) self.critic_local.load_state_dict(savedata['critic_local']) self.critic_target.load_state_dict(savedata['critic_target']) self.actor_local.load_state_dict(savedata['actor_local']) self.actor_target.load_state_dict(savedata['actor_target']) return
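# save_agent above stores only the four networks; resuming training exactly also
# requires the optimizer states. A hedged extension using the same attribute
# names as the class above (the filename pattern is illustrative):
import torch

def save_agent_full(agent, i_episode, filename=None):
    filename = filename or 'trained_reacher_full_e{}.pth'.format(i_episode)
    torch.save({
        'critic_local': agent.critic_local.state_dict(),
        'critic_target': agent.critic_target.state_dict(),
        'actor_local': agent.actor_local.state_dict(),
        'actor_target': agent.actor_target.state_dict(),
        'critic_optimizer': agent.critic_optimizer.state_dict(),
        'actor_optimizer': agent.actor_optimizer.state_dict(),
        'episode': i_episode,
    }, filename)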
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed, memory=None, buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE, gamma=GAMMA, tau=TAU, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC, weigth_decay=WEIGHT_DECAY, pretrained_actor_weights=None, pretrained_critic_weights=None): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.lr_actor = lr_actor self.lr_critic = lr_critic self.weight_decay = weigth_decay # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay) if pretrained_actor_weights: actor_weights = torch.load(pretrained_actor_weights) self.actor_local.load_state_dict(actor_weights) self.actor_target.load_state_dict(actor_weights) if pretrained_critic_weights: critic_weights = torch.load(pretrained_critic_weights) self.critic_local.load_state_dict(critic_weights) self.critic_target.load_state_dict(critic_weights) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory if memory: self.memory = memory else: self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, random_seed) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device).unsqueeze(0) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): "Single agent no learning algorithm" def __init__(self, state_size, action_size, random_seed, lr_actor=1e-4, lr_critic=1e-3, weight_decay=0): """Initialize an Agent object Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed lr_actor (float) : learning rate actor network lr_critic (float) : learning rate critic network weight_decay (float) : weight decay regularizer """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.noise = OUNoise(action_size, random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=weight_decay) def act(self, state, add_noise=True): "Returns actions for given state as per current policy" if not isinstance(state, torch.Tensor): state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def load(self, filename, map_location=None): "Load weights for actor and critic" weights = torch.load(filename, map_location=map_location) self.actor_local.load_state_dict(weights['actor']) if 'critic' in weights: self.critic_local.load_state_dict(weights['critic']) def reset(self): self.noise.reset() def save(self, filename='checkpoint.pth'): "Serialize actor and critic weights" checkpoint = { 'actor': self.actor_local.state_dict(), 'critic': self.critic_local.state_dict() } torch.save(checkpoint, filename)
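# Example driver for the inference-only Agent above, assuming an old-style Gym
# environment with actions already scaled to [-1, 1]; the environment id and
# checkpoint name are placeholders, not from the original file.
import gym

env = gym.make('Pendulum-v0')
agent = Agent(env.observation_space.shape[0], env.action_space.shape[0], random_seed=0)
agent.load('checkpoint.pth', map_location='cpu')

state, done, score = env.reset(), False, 0.0
while not done:
    action = agent.act(state, add_noise=False)  # deterministic policy at test time
    state, reward, done, _ = env.step(action)
    score += reward
print('episode return:', score)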
class DDPG(object): def __init__(self, nb_states, nb_actions, args, discrete, use_cuda=False): if args.seed > 0: self.seed(args.seed) self.nb_states = nb_states self.nb_actions = nb_actions self.discrete = discrete # Create Actor and Critic Network net_cfg = { 'hidden1':args.hidden1, 'hidden2':args.hidden2, 'init_w':args.init_w } self.actor = Actor(self.nb_states * args.window_length, self.nb_actions, **net_cfg) self.actor_target = Actor(self.nb_states * args.window_length, self.nb_actions, **net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=args.prate) self.critic = Critic(self.nb_states * args.window_length, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_states * args.window_length, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) #Create replay buffer self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length) self.random_process = Myrandom(size=nb_actions) # Hyper-parameters self.batch_size = args.bsize self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.use_cuda = use_cuda # if self.use_cuda: self.cuda() def update_policy(self, train_actor = True): # Sample batch state_batch, action_batch, reward_batch, \ next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size) # state_batch, action_batch, reward_batch, \ # next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size) # Prepare for the target q batch next_q_values = self.critic_target([ to_tensor(next_state_batch, volatile=True), self.actor_target(to_tensor(next_state_batch, volatile=True)), ]) next_q_values.volatile = False target_q_batch = to_tensor(reward_batch) + \ self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values # Critic update self.critic.zero_grad() q_batch = self.critic([ to_tensor(state_batch), to_tensor(action_batch) ]) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() self.actor.zero_grad() policy_loss = -self.critic([ to_tensor(state_batch), self.actor(to_tensor(state_batch)) ]) policy_loss = policy_loss.mean() policy_loss.backward() if train_actor == True: self.actor_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) return -policy_loss, value_loss def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def cuda(self): print("use cuda") self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): self.memory.append([self.s_t, self.a_t, r_t, s_t1, done]) self.s_t = s_t1 def random_action(self): action = np.random.uniform(-1.,1.,self.nb_actions) self.a_t = action if self.discrete: return action.argmax() else: return action def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=1): action = to_numpy( self.actor(to_tensor(np.array([s_t]))) ).squeeze(0) # print(self.random_process.sample(), action) noise_level = noise_level * max(self.epsilon, 0) action = action * (1 - noise_level) + (self.random_process.sample() * noise_level) # print(max(self.epsilon, 0) * self.random_process.sample() * noise_level, noise_level) action = 
np.clip(action, -1., 1.)
        # print(action)
        if decay_epsilon:
            self.epsilon -= self.depsilon
        self.a_t = action
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None:
            return
        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))
        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()

    def seed(self, s):
        torch.manual_seed(s)
        if self.use_cuda:
            torch.cuda.manual_seed(s)
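# Myrandom is used by both DDPG variants above but never defined here. From its
# call sites (size=nb_actions, sample(), and both the reset_states() and
# reset_status() spellings) it behaves like a stateless Gaussian exploration
# process; a sketch under that assumption, with both method names aliased:
import numpy as np

class Myrandom:
    def __init__(self, size, sigma=1.0):
        self.size, self.sigma = size, sigma

    def sample(self):
        return np.random.randn(self.size) * self.sigma

    def reset_states(self):
        pass  # stateless, unlike an Ornstein-Uhlenbeck process

    reset_status = reset_states  # both spellings appear at call sites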
class Multi_Agents(): """ Implements interactions and learning on environments for a set of agents """ def __init__(self, agents_count, state_size, action_size, random_seed, buffer_size, batch_size, gamma, fc1_units, fc2_units, noise, lr_actor, lr_critic): """Initialize a Multi_Agent. Params ====== agents_count (int): the number of agents state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed buffer_size(int): replay buffer size gamma(float): discount factor fc1_units (int): Number of nodes in first hidden layer fc2_units (int): Number of nodes in second hidden layer noise(Object): The noise applied to the actions selection lr_actor(float) : learning rates of the actor lr_critic(float) : learning rates of the critic """ self.agents_count = agents_count self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.gamma = gamma self.batch_size = batch_size # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed, fc1_units, fc2_units).to(device) self.actor_target = Actor(state_size, action_size, random_seed, fc1_units, fc2_units).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed, fc1_units, fc2_units).to(device) self.critic_target = Critic(state_size, action_size, random_seed, fc1_units, fc2_units).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=WEIGHT_DECAY) # after reading implementating of ShangtongZhang as suggested in the course, # It seems relevant to initialize the weights of the target networks # with the same values as the local network : self.actor_target.load_state_dict(self.actor_local.state_dict()) self.critic_target.load_state_dict(self.critic_local.state_dict()) # Noise process self.noise = noise # Replay memory self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed) def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn.""" for a in range(self.agents_count): # save for each agent self.memory.add(states[a], actions[a], rewards[a], next_states[a], dones[a]) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) def act(self, states, add_noise=True): """Returns actions for each given state of each agent as per current policy.""" states = torch.from_numpy(states).float().to(device) actions = np.empty([self.agents_count, self.action_size]) self.actor_local.eval() with torch.no_grad(): for a in range(self.agents_count): actions[a] = self.actor_local(states[a]).cpu().data.numpy() self.actor_local.train() if add_noise: actions += self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # as suggested in the "Benchmak implementation" section of the course" torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
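# act() above runs one forward pass per agent; since all agents share a single
# policy network, the batch dimension can do that work in one pass. A suggested
# drop-in method, assuming the module-level imports (torch, np, device) used by
# the class above and an Actor that accepts a (num_agents, state_size) batch:
def act_batched(self, states, add_noise=True):
    states = torch.from_numpy(states).float().to(device)
    self.actor_local.eval()
    with torch.no_grad():
        actions = self.actor_local(states).cpu().numpy()  # single forward pass
    self.actor_local.train()
    if add_noise:
        actions += self.noise.sample()
    return np.clip(actions, -1, 1)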
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, memory, batch_size, lr_actor, lr_critic, clip_critic, gamma, tau, weight_decay, update_network_steps, sgd_epoch, checkpoint_prefix): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed memory (ReplayBuffer): The replay buffer for storing xperiences batch_size (int): Number of experiences to sample from the memory lr_actor (float): The learning rate for the actor lr_critic (float): The learning rate critic clip_critic (float): The clip value for updating grads gamma (float): The reward discount factor tau (float): For soft update of target parameters weight_decay (float): The weight decay update_network_steps (int): How often to update the network sgd_epoch (int): Number of iterations for each network update checkpoint_prefix (string): The string prefix for saving checkpoint files """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.memory = memory self.batch_size = batch_size self.lr_actor = lr_actor self.lr_critic = lr_critic self.clip_critic = clip_critic self.gamma = gamma self.tau = tau self.weight_decay = weight_decay self.update_network_steps = update_network_steps self.sgd_epoch = sgd_epoch self.n_step = 0 # checkpoint self.checkpoint_prefix = checkpoint_prefix self.actor_loss_episodes = [] self.critic_loss_episodes = [] self.actor_loss = 0 self.critic_loss = 0 # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=weight_decay) # Noise process self.noise = OUNoise(action_size, seed) def step(self, state, action, action_prob, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for i in range(len(state)): self.memory.add(state[i], action[i], action_prob[i], reward[i], next_state[i], done[i]) # learn every n steps self.n_step = (self.n_step + 1) % self.update_network_steps if self.n_step == 0: # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: for i in range(self.sgd_epoch): experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1), np.zeros_like(action) # N/A action prob for DDPG def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, action_probs, rewards, next_states, dones = experiences # normalize rewards rewards = utils.normalize_rewards(rewards) # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) self.critic_loss = critic_loss # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() if self.clip_critic > 0: torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), self.clip_critic) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() self.actor_loss = actor_loss # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) def checkpoint(self): """Save internal information in memory for later checkpointing""" self.actor_loss_episodes.append(self.actor_loss) self.critic_loss_episodes.append(self.critic_loss) def save_checkpoint(self): """Persist checkpoint information""" # the history loss utils.plot_scores(self.checkpoint_prefix + "_actor_loss.png", self.actor_loss_episodes, label="loss") utils.plot_scores(self.checkpoint_prefix + "_critic_loss.png", self.critic_loss_episodes, label="loss") # network torch.save(self.actor_local.state_dict(), self.checkpoint_prefix + "_actor.pth") torch.save(self.critic_local.state_dict(), self.checkpoint_prefix + "_critic.pth") def load_checkpoint(self): """Restore checkpoint information""" self.actor_local.load_state_dict(torch.load(self.checkpoint_prefix + "_actor.pth")) self.critic_local.load_state_dict(torch.load(self.checkpoint_prefix + "_critic.pth"))
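# utils.normalize_rewards is called in learn() above but not shown; the common
# batch-standardizing implementation is sketched below (the 1e-8 floor is an
# assumption to avoid dividing by zero on constant-reward batches). Note that
# standardizing rewards inside the TD target changes the scale of the learned
# Q-values, so it is a design choice rather than a free transformation.
def normalize_rewards(rewards):
    return (rewards - rewards.mean()) / (rewards.std() + 1e-8)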
class Agent: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed, agents=2, every=4, updates=4): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size random.seed(np.random.seed(random_seed)) self.agents = agents self.every = every self.updates = updates self.steps = 0 # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noises = OUNoise((agents, action_size)) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device) def load_actor(self, model_file: str): self.actor_local.load_state_dict( torch.load(model_file, map_location=device)) self.actor_local.to(device) def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.steps += 1 for i in range(self.agents): self.memory.add(states[i], actions[i], rewards[i], next_states[i], dones[i]) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE and self.steps % self.every == 0: self.steps = 0 for _ in range(self.updates): experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, states, add_noise=True): """Returns actions for given state as per current policy.""" states = torch.from_numpy(states).float().to(device) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(states).cpu().data.numpy() self.actor_local.train() if add_noise: actions += self.noises.sample() return np.clip(actions, -1, 1) def reset(self): self.noises.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) # Gradient clipping self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
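# NOTE: both agents above depend on an OUNoise class that is not defined in
# this section. The following is a minimal sketch of a standard
# Ornstein-Uhlenbeck process; the mu/theta/sigma defaults are assumptions, and
# the optional seed argument is there only to match both call sites
# (OUNoise(action_size, seed) and OUNoise((agents, action_size))).
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, seed=None, mu=0.0, theta=0.15, sigma=0.2):
        if seed is not None:
            np.random.seed(seed)
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Advance the process: mean reversion plus Gaussian perturbation."""
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state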
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0
        if self.pic:
            self.nb_status = args.pic_status

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn,
            'init_method': args.init_method
        }
        if args.pic:
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        # Make sure the targets start with the same weights
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)

        # Create replay buffer
        self.memory = rpm(args.rmsize)  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0  # decayed in select_action(); must be initialized here

        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        # if self.use_cuda: self.cuda()

    def normalize(self, pic):
        # HWC -> CHW
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self):
        # Sample batch
        (state_batch, action_batch, reward_batch,
         next_state_batch, terminal_batch) = self.memory.sample_batch(self.batch_size)

        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            next_state_batch = np.array([self.normalize(x) for x in next_state_batch])

        # Prepare the target Q batch; no gradients flow through the targets
        with torch.no_grad():
            if self.pic:
                next_features = self.cnn_target(to_tensor(next_state_batch))
            else:
                next_features = to_tensor(next_state_batch)
            next_q_values = self.critic_target([next_features,
                                                self.actor_target(next_features)])
            target_q_batch = to_tensor(reward_batch) + \
                self.discount * to_tensor(1.0 - terminal_batch.astype(float)) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic:
            self.cnn.zero_grad()
            features = self.cnn(to_tensor(state_batch))
        else:
            features = to_tensor(state_batch)
        q_batch = self.critic([features, to_tensor(action_batch)])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic:
            self.cnn_optim.step()

        # Actor update (recompute the features so the graph is fresh)
        self.actor.zero_grad()
        if self.pic:
            self.cnn.zero_grad()
            features = self.cnn(to_tensor(state_batch))
        else:
            features = to_tensor(state_batch)
        policy_loss = -self.critic([features, self.actor(features)]).mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(),
                                           float(self.clip_actor_grad))
            if self.writer is not None:
                mean_policy_grad = np.mean([
                    np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                    for p in self.actor.parameters()
                ])
                self.writer.add_scalar('train/mean_policy_grad',
                                       mean_policy_grad, self.select_time)

        self.actor_optim.step()
        if self.pic:
            self.cnn_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        if self.pic:
            soft_update(self.cnn_target, self.cnn, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()
        if self.pic:
            self.cnn.eval()
            self.cnn_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()
        if self.pic:
            self.cnn.train()
            self.cnn_target.train()

    def cuda(self):
        if self.pic:
            self.cnn.cuda()
            self.cnn_target.cuda()
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self, fix=False):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete and not fix:
            action = action.argmax()
        if self.pic:
            action = np.concatenate((softmax(action[:16]), softmax(action[16:])))
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        self.eval()
        if self.pic:
            s_t = self.normalize(s_t)
            s_t = self.cnn(to_tensor(np.array([s_t])))
            action = to_numpy(self.actor_target(s_t)).squeeze(0)
        else:
            action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        self.train()

        # epsilon-greedy style exploration: blend in a random action with a
        # probability proportional to the (decaying) noise level
        noise_level = noise_level * max(self.epsilon, 0)
        if np.random.uniform(0, 1) < noise_level:
            action = (action + self.random_action(fix=True)) / 2.

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        if output is None:
            return
        self.actor.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        if self.use_cuda:
            if self.pic:
                self.cnn.cpu()
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            if self.pic:
                self.cnn.cuda()
            self.actor.cuda()
            self.critic.cuda()
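# NOTE: hard_update and soft_update are called by the DDPG class above but
# defined elsewhere. A minimal sketch, assuming the (target, source) argument
# order implied by the call sites:
def hard_update(target, source):
    """Copy source parameters into target verbatim."""
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(s.data)


def soft_update(target, source, tau):
    """Polyak averaging: θ_target = τ*θ_source + (1 - τ)*θ_target."""
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(tau * s.data + (1.0 - tau) * t.data)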
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)

    writer = SummaryWriter(comment="-ppo_iter-" + str(args.max_iter_num))

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded checkpoint; ZFilter n = {}".format(running_state.rs.n))

    episodes = 0
    for iter_num in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0
            state = running_state(state)

            for _ in range(10000):
                if args.render:
                    env.render()

                steps += 1
                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)

                mask = 0 if done else 1
                memory.append([state, action, reward, mask])

                next_state = running_state(next_state)
                state = next_state
                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter_num, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter_num)

        actor.train(), critic.train()
        train_model(actor, critic, memory, actor_optim, critic_optim, args)

        if iter_num % 100 == 0:  # save a checkpoint every 100 iterations
            score_avg = int(score_avg)
            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'z_filter_n': running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
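# NOTE: main() calls a get_action helper that is not shown in this section.
# A minimal sketch, assuming the usual PPO setup where the actor outputs the
# mean and standard deviation of a diagonal Gaussian policy; the name and
# signature come from the call site above, the body is an assumption.
import torch


def get_action(mu, std):
    """Sample an action a ~ N(mu, std^2) and return it as a NumPy array."""
    action = torch.normal(mu, std)
    return action.detach().numpy()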