def coordinator(rank, args, share_model, exp_queues, model_params): assert len(exp_queues) == args.num_processes # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # print(device) model = ActorCritic() model.train() # model.load_state_dict(share_model.state_dict()) for i in range(args.num_processes): model_params[i].put(model.state_dict()) # if args.cuda: # model = model.cuda() optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-5) entropy_coef = args.entropy_coef count = 0 while True: count += 1 if count >= 14000: entropy_coef = 1 if count >= 17000: entropy_coef = 0.5 if count >= 19000: entropy_coef = 0.1 # assemble experiences from the agents for i in range(args.num_processes): s_batch, a_batch, r_batch, done = exp_queues[i].get() loss = compute_loss(args, s_batch, a_batch, r_batch, done, model, entropy_coef) optimizer.zero_grad() loss.backward(retain_graph=True) if torch.isnan(loss): torch.save(s_batch, 's_batch-coor.pt') torch.save(loss, 'loss.pt') print('s_batch', s_batch) print('loss: ', loss) break torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) # for param in model.parameters(): # param.grad.data.clamp_(-1, 1) optimizer.step() print('update model parameters ', count) if torch.isnan(loss): break # model.zero_grad() # if args.cuda: # model = model.cpu() for i in range(args.num_processes): model_params[i].put(model.state_dict()) share_model.load_state_dict(model.state_dict())
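# Only the coordinator half of the queue protocol appears above: it publishes
# parameters on model_params[i] and consumes (s_batch, a_batch, r_batch, done)
# tuples from exp_queues[i]. A minimal sketch of the matching worker side is
# given below; run_episode is a hypothetical helper standing in for whatever
# rollout code the workers actually use.
def agent(rank, args, exp_queue, model_param_queue):
    model = ActorCritic()
    model.eval()
    while True:
        # Block until the coordinator publishes fresh parameters.
        model.load_state_dict(model_param_queue.get())
        # run_episode (hypothetical) rolls out one batch of experience.
        s_batch, a_batch, r_batch, done = run_episode(args, model)
        # Hand the batch back; the coordinator's exp_queues[rank].get() receives it.
        exp_queue.put((s_batch, a_batch, r_batch, done))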
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = ActorCritic(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    net.to(device)
    net.train()
    running_score = 0

    for e in range(3000):
        done = False
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            if args.render:
                env.render()

            policy, value = net(state)
            action = get_action(policy, num_actions)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            transition = [state, next_state, action, reward, mask]
            train_model(net, optimizer, transition, policy, value)

            score += reward
            state = next_state

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % args.log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))
            writer.add_scalar('log/score', float(score), e)  # log against the episode index

        if running_score > args.goal_score:
            ckpt_path = args.save_path + 'model.pth'
            torch.save(net.state_dict(), ckpt_path)
            print('running score exceeds goal score so end')
            break
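# The loop above hands each transition to a train_model helper that is not
# shown in this section. Below is a minimal sketch of a one-step actor-critic
# update that matches the [state, next_state, action, reward, mask] format
# used above; the 0.5 / 0.01 loss weights are assumptions, not taken from the
# source, and torch / torch.nn.functional as F are assumed imported as in the
# surrounding snippets.
def train_model(net, optimizer, transition, policy, value, gamma=0.99):
    state, next_state, action, reward, mask = transition

    # Bootstrapped one-step target: r + gamma * V(s') * mask (mask = 0 at episode end).
    with torch.no_grad():
        _, next_value = net(next_state)
    target = reward + gamma * mask * next_value.squeeze()
    advantage = target - value.squeeze()

    log_policy = torch.log(policy.squeeze(0)[action] + 1e-8)
    actor_loss = -log_policy * advantage.detach()
    critic_loss = F.mse_loss(value.squeeze(), target)
    entropy = -(policy * torch.log(policy + 1e-8)).sum()

    loss = actor_loss + 0.5 * critic_loss - 0.01 * entropy
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()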
def run(args): device = torch.device("cpu") env = gym.make('SpaceInvaders-v0') state_size = env.observation_space.shape action_size = env.action_space.n model = ActorCritic([1, 4, 84, 84], action_size).to(device) opt = SharedRMSprop(model.parameters(), lr=args.lr, alpha=args.alpha, eps=1e-8, weight_decay=args.weight_decay, momentum=args.momentum, centered=False) opt_lock = mp.Lock() scheduler = LRScheduler(args) if args.load_fp: checkpoint = torch.load(args.load_fp) model.load_state_dict(checkpoint['model_state_dict']) opt.load_state_dict(checkpoint['optimizer_state_dict']) if args.train: start = time.time() model.share_memory() model.train() step_counter, max_reward, ma_reward, ma_loss = [ mp.Value('d', 0.0) for _ in range(4) ] processes = [] if args.num_procs == -1: args.num_procs = mp.cpu_count() for rank in range(args.num_procs): p = mp.Process(target=train, args=(rank, args, device, model, opt, opt_lock, scheduler, step_counter, max_reward, ma_reward, ma_loss)) p.start() processes.append(p) for p in processes: p.join() if args.verbose > 0: print(f"Seconds taken: {time.time() - start}") if args.save_fp: torch.save( { 'model_state_dict': model.state_dict(), # 'optimizer_state_dict': opt.state_dict(), }, args.save_fp) if args.test: model.eval() test(args, device, model)
class PPO():
    def __init__(self, state_dim, action_dim, lr, betas, gamma, K_epochs,
                 eps_clip, device):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.device = device

        self.policy = ActorCritic(state_dim, action_dim).to(self.device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(),
                                          lr=lr,
                                          betas=betas)
        self.policy_old = ActorCritic(state_dim, action_dim).to(self.device)
        # Start the old policy from the same weights as the current policy.
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def update(self, memory):
        # Monte Carlo estimate of the returns
        rewards = []
        discount_reward = 0
        for reward in reversed(memory.rewards):
            discount_reward = reward + (self.gamma * discount_reward)
            rewards.insert(0, discount_reward)

        # Normalizing the rewards:
        rewards = torch.tensor(rewards).to(self.device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # convert lists to tensors
        old_states = torch.stack(memory.states).to(self.device).detach()
        old_actions = torch.stack(memory.actions).to(self.device).detach()
        old_logprobs = torch.stack(memory.logprobs).to(self.device).detach()

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):
            # Evaluating old actions and values:
            logprobs, state_values, dist_entropy = self.policy.evaluate(
                old_states, old_actions)

            # Finding the ratio (pi_theta / pi_theta_old)
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding Surrogate Loss:
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip,
                                1 + self.eps_clip) * advantages
            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(
                state_values, rewards) - 0.01 * dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())
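# The core of the update above is the clipped surrogate objective from PPO.
# Below is a self-contained illustration of just that computation on dummy
# tensors (the sizes and values are made up purely for demonstration).
import torch

logprobs_new = torch.randn(8, requires_grad=True)   # log pi_theta(a|s) for a minibatch
logprobs_old = torch.randn(8)                        # log pi_theta_old(a|s), fixed
advantages = torch.randn(8)
eps_clip = 0.2

ratios = torch.exp(logprobs_new - logprobs_old)      # pi_theta / pi_theta_old
surr1 = ratios * advantages
surr2 = torch.clamp(ratios, 1 - eps_clip, 1 + eps_clip) * advantages
policy_loss = -torch.min(surr1, surr2).mean()        # pessimistic (clipped) objective
policy_loss.backward()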
def train(): # Defaults parameters: # gamma = 0.99 # lr = 0.02 # betas = (0.9, 0.999) # random_seed = 543 render = False gamma = 0.99 lr = 0.02 betas = (0.9, 0.999) random_seed = 543 torch.manual_seed(random_seed) env = gym.make('LunarLander-v2') env.seed(random_seed) policy = ActorCritic() optimizer = optim.Adam(policy.parameters(), lr=lr, betas=betas) print(lr,betas) running_reward = 0 for i_episode in range(0, 10000): state = env.reset() for t in range(10000): action = policy(state) state, reward, done, _ = env.step(action) policy.rewards.append(reward) running_reward += reward if render and i_episode > 1000: env.render() if done: break # Updating the policy : optimizer.zero_grad() loss = policy.calculateLoss(gamma) loss.backward() optimizer.step() policy.clearMemory() # saving the model if episodes > 999 OR avg reward > 200 #if i_episode > 999: # torch.save(policy.state_dict(), './preTrained/LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1])) if running_reward > 4000: torch.save(policy.state_dict(), './preTrained/LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1])) print("########## Solved! ##########") test(name='LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1])) break if i_episode % 20 == 0: running_reward = running_reward/20 print('Episode {}\tlength: {}\treward: {}'.format(i_episode, t, running_reward)) running_reward = 0
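# policy.calculateLoss(gamma) is defined on the model and not shown in this
# section. Implementations of this pattern usually discount the stored rewards
# and combine an actor term with a critic term; a hedged sketch of such a
# method follows, assuming the model stores self.logprobs, self.state_values
# and self.rewards while acting (those attribute names are assumptions), and
# that torch / torch.nn.functional as F are imported as elsewhere in this file.
def calculateLoss(self, gamma=0.99):
    # Discounted returns, computed backwards over the stored episode.
    returns = []
    discounted = 0.0
    for reward in reversed(self.rewards):
        discounted = reward + gamma * discounted
        returns.insert(0, discounted)
    returns = torch.tensor(returns, dtype=torch.float32)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)

    loss = 0
    for logprob, value, ret in zip(self.logprobs, self.state_values, returns):
        advantage = ret - value.item()
        action_loss = -logprob * advantage                   # actor term
        value_loss = F.smooth_l1_loss(value.squeeze(), ret)  # critic term
        loss += action_loss + value_loss
    return loss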
class A2C:
    def __init__(self, state_dim, action_dim, cfg):
        self.gamma = cfg.gamma
        self.model = ActorCritic(state_dim, action_dim,
                                 hidden_dim=cfg.hidden_dim).to(cfg.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=cfg.lr)
        self.device = cfg.device
        self.loss = 0
        self.env = cfg.env

    def choose_action(self, state):
        state = torch.tensor([state], device=self.device, dtype=torch.float32)
        dist, value = self.model(state)
        action = dist.sample().item()
        return action, value, dist

    def update(self, values, next_values, step_rewards, log_probs, mask_dones,
               entropy):
        # Update using the data collected over one episode
        expected_values = []
        advantages = []
        actor_losses = []
        critic_losses = []
        for step in range(len(step_rewards)):
            # One-step TD target: r + gamma * V(s') * (1 - done)
            expected_values.append(step_rewards[step].item() +
                                   self.gamma *
                                   next_values[step].squeeze().item() *
                                   mask_dones[step].squeeze().item())
            advantages.append(expected_values[step] - values[step].item())
            # Keep log_probs and values as tensors so gradients can flow through the loss
            actor_losses.append(-advantages[step] * log_probs[step])
            critic_losses.append(nn.MSELoss()(
                values[step].squeeze(),
                torch.tensor(expected_values[step]).to(self.device)))
        actor_loss = torch.stack(actor_losses).mean()
        critic_loss = torch.stack(critic_losses).mean()
        self.loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()

    def save(self, path):
        model_checkpoint = os.path.join(path, self.env + 'actor_critic.pt')
        torch.save(self.model.state_dict(), model_checkpoint)
        print('Model Saved!')

    def load(self, path):
        model_checkpoint = os.path.join(path, self.env + 'actor_critic.pt')
        self.model.load_state_dict(torch.load(model_checkpoint))
        print('Model Loaded!')
def train(rank, shared_model, optimizer): """ :param rank: worker-ID :param shared_model: model to sync between workers :param optimizer: :return: """ # torch.manual_seed(SEED + rank) ac_steps = 20 # The amount of steps before you review max_episode_length = 10000 # The game will stop after this amount of time and maybe re run the game? gamma = 0.99 tau = 1.0 max_grad_norm = 50.0 # Limit the direction of gradient travel within the queue. Anything outside the queue is cut checkpoint_n = 20 # To see the model after this many n. Can increase this number if have a shit comp env = create_atari_env( romname ) # enage game. romname is depending on the game of your choice. env.seed(SEED + rank) # For the problem to occur again? LOOK THIS UP state = env.reset() # Allow torch to handle pixel data. Don't understrand squeeze. FloatTensor - Tensor is an array, therefore array of float. state = Variable(torch.from_numpy(state).unsqueeze(0).type(FloatTensor), requires_grad=False) # Selecting model, with this size of input and that kind of output model = ActorCritic(env.observation_space.shape[0], env.action_space) t = 0 done = True # Starting from a state when gameover is true! episodes = 0 reward_sum = 0 reward_sum1 = 0 start_time = time.time() best_reward = -999 isbest = 0 cx = hx = None while True: model.load_state_dict(shared_model.state_dict( )) # Pull the up to date model from the shared model if done: # need to reset LSTM cell's input # the LSTM units need their own output to feed into next step # input (hence the name of the kind: recurrent neural nets). # At the beginning of an episode, to get things started, # we need to allocate some initial values in the required format, # i.e. the same size as the output of the layer. # # see http://pytorch.org/docs/master/_modules/torch/nn/modules/rnn.html#LSTM # for details # # Optionally, you can remove LSTM to simplify the code # Think: what is the possible loss? cx = Variable(torch.zeros(1, 256)).type( FloatTensor ) # torch.zeros - setting the values to all zeros since there's nothing there yet hx = Variable(torch.zeros(1, 256)).type(FloatTensor) else: cx = Variable( cx.data) # takes the last computed value for the next input hx = Variable( hx.data ) # basically this is to detach from previous comp graph states = [] values = [] log_probs = [] rewards = [] entropies = [] for i in range(ac_steps): # Running through the 20 steps t += 1 v, logit, (hx, cx) = model( (state, (hx, cx)) ) # When you run model, it will return you 4 values -> store those 4 values in v, logit, etc. states.append(state) prob = F.softmax(logit) # The gradient descent thing log_prob = F.log_softmax( logit) # Do it again, a lot to make sure its correct entropy = -(log_prob * prob).sum( 1, keepdim=True ) # To increase diversity of our choice (part of e-greedy?) entropies.append(entropy) # detach - anything compute with pytorch will drag a trail behind it. When get gradient descent, the calculation will race with the result. We do not want the descent to chase it randomly, so we just detach it. !Do not need to modify this function when modify the code. action = prob.multinomial().detach( ) # detach -- so the backprob will NOT go through multinomial() # use the current action as an index to get the # corresponding log probability log_prob = log_prob.gather( 1, action ) # allow you to simultenously take probability of many actions. action = action.data[ 0, 0] # Extract the variables out of the integer. 
Turning it from a torch integer to a "normal" integer # Accept what was given by the action, does it things? and the env will return the 4 following; state, reward, done # _ is something that we don't care about but since env.step is returning 4 values so we just have to have something to take it. state, reward, done, _ = env.step(action) reward_sum += reward reward_sum1 += reward # reason why store reward sum twice just for re-assurance done = (done or t >= max_episode_length) if done: t_ = t t = 0 state = env.reset() episodes += 1 if episodes % 10 == 0: time_str = time.strftime( "%Hh %Mm %Ss", time.gmtime(time.time() - start_time)) print("Time {}, worker-{} episode {} " "mean episode reward {}, " "episode length {}".format(time_str, rank, episodes, reward_sum / 10.0, t_)) reward_sum = 0.0 if episodes % checkpoint_n == 0: ave_reward = reward_sum1 / checkpoint_n if best_reward < ave_reward: isbest = 1 best_reward = ave_reward print("Saving checkpoint Time {}, worker-{} episode {} " "mean episode reward {}, " "episode length {} best_reward {}".format( get_elapsed_time_str(), rank, episodes, ave_reward, t_, best_reward)) checkpoint_fname = os.path.join( args.savedir, args.rom + '_worker' + str(rank) + '_' + str(episodes)) save_checkpoint( { 'epoch': episodes, 'average_reward': ave_reward, 'time': time.time(), 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), }, isbest, checkpoint_fname) reward_sum1 = 0.0 state = Variable( torch.from_numpy(state).unsqueeze(0).type(FloatTensor), requires_grad=False) reward = max(min(reward, 1), -1) values.append(v) log_probs.append(log_prob) # Keep record rewards.append(reward) if done: break # We reach here because either # i) an episode ends, such as game over # ii) we have explored certain steps into the future and now it is # time to look-back and summerise the if done: R = torch.zeros(1, 1).type( FloatTensor ) # If game over, the game over stage receive a reward of 0 else: value, _, _ = model( (state, (hx, cx)) ) # if its not game over, then we will use the model to evaluate the reward R = value.data values.append(Variable(R)) critic_loss = 0 actor_loss = 0 R = Variable(R) gae = 0 for i in reversed(range(len(rewards))): R = gamma * R + rewards[i] # R - longterm reward advantage = R - values[ i] # type: Variable, advantage against the average # Compare the actual long-term reward. Note: we are reversing the # experience of a complete trajectory. If the full length is 100 # (time indexes are among 0, 1, 2, ..., 99), and now i=50, that means # we have processed all information in steps, 51, 52, ..., 99 # and R will contain the actual long term reward at time step 51 at # the beginning of this step. The above computation injects the reward # information in step 50 to R. Now R is the long-term reward at this # step. # # So-called advantage is then the "unexpected gain/loss". It forms the base # of evaluating the action taken at this step (50). # # critic_loss accumulates those "exceptional gain/loss" so that later we will # adjust our expectation for each state and reduce future exceptions (to better # evaluate actions, say, the advantage agains expectation is only meaningful # when the expectation itself is meaningful). critic_loss += 0.5 * advantage.pow(2) # Generalized Advantage Estimation # see https://arxiv.org/abs/1506.02438 # we can use advantage in the computation of the direction to adjust policy, # but the manipulation here improves stability (as claims by the paper). 
# # Note advantage implicitly contributes to GAE, since it helps # achieve a good estimation of state-values. td_error = rewards[i] + gamma * values[i + 1].data - values[i].data gae = gae * gamma * tau + td_error # log_probs[i] is the log-probability(action-taken). If GAE is great, that # means the choice we had made was great, and we want to make the same # action decision in future -- make log_probs[i] large. Otherwise, # we add log_probs to our regret and will be less likely to take the same # action in future. # # entropy means the variety in a probabilistic distribution, # to encourage big entropies is to make more exploration. actor_loss -= (Variable(gae) * log_probs[i] + 0.01 * entropies[i]) optimizer.zero_grad( ) # Applied the gradient to the parameter (back-propagation will get you good stuff from gradient) total_loss = actor_loss + critic_loss * 0.5 # type: Variable total_loss.backward() # error occur, back propagation # this is to improve stability torch.nn.utils.clip_grad_norm(model.parameters(), max_grad_norm) ensure_shared_grads( model, shared_model) # Push each updated model to the shared model optimizer.step()
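# The loop above interleaves GAE with the loss accumulation. The estimator on
# its own is short; the helper below computes the same per-step advantages
# over plain Python lists (values must hold one extra bootstrap entry, as in
# the code above), with a made-up example at the end.
import torch

def compute_gae(rewards, values, gamma=0.99, tau=1.0):
    gae = 0.0
    advantages = []
    for i in reversed(range(len(rewards))):
        td_error = rewards[i] + gamma * values[i + 1] - values[i]
        gae = gae * gamma * tau + td_error
        advantages.insert(0, gae)
    return torch.tensor(advantages)

# Example: three rewards plus a bootstrap value for the state after the last step.
print(compute_gae([1.0, 0.0, 1.0], [0.5, 0.4, 0.6, 0.2]))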
rewards = trajectory_collector.scores_by_episode[n_episodes : ] # record the number of "dones" per trajectory writer.add_scalar("episodes_per_trajectory", len(rewards), step) step += 1 end_time = time.time() for idx_r, reward in enumerate(rewards): mean_reward = reward_tracker.reward(reward, n_episodes + idx_r, end_time - start if start is not None else 0) # we switch LR to 1e-4 in the middle scheduler.step() # keep current spectacular scores if n_episodes > 0 and (reward > max_score or (n_episodes + idx_r) % SAVE_EVERY == 0): torch.save(policy.state_dict(), os.path.join(ckpt_path, f'checkpoint_actor_{reward:.03f}.pth')) max_score = reward if mean_reward is not None and mean_reward >= SOLVED_SCORE: torch.save(policy.state_dict(), os.path.join(ckpt_path, f'checkpoint_actor_{mean_reward:.03f}.pth')) solved_episode = n_episodes + idx_r - AVG_WIN - 1 print(f"Solved in {solved_episode if solved_episode > 0 else n_episodes + idx_r} episodes") solved = True break if solved: break start = time.time() # train agents in a round-robin for the number of epochs for epoch in range(EPOCHS):
def main():
    # Pick the device used for network computation
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Build the actor-critic network
    net = ActorCritic()
    net = net.to(device)
    # Set up the optimizer
    optimizer = torch.optim.Adam(net.parameters(), lr=3e-4)
    # Prepare the environments
    envs = Envs(NUM_WORKERS, gamma=GAMMA)
    # Start training
    for episode in range(EPISODES):
        # Collect one episode of data from the parallel environments
        net.eval()
        with torch.no_grad():
            states = envs.reset()
            done = False
            while not done:
                states = states.to(device)
                _, policys = net(states)
                policys = policys.cpu()  # easier to post-process on the CPU
                # Set the probability of illegal moves to 0
                for i in range(NUM_WORKERS):
                    if envs.reversis[i].next != 0:
                        for y, x in itertools.product(range(SIZE), repeat=2):
                            if not envs.reversis[i].good[y][x]:
                                policys[i][y * SIZE + x] = 0.
                            else:
                                policys[i][y * SIZE + x] += 1e-8  # avoid an all-zero distribution
                actions = Categorical(probs=policys).sample()
                done, states = envs.step(actions)
            envs.setReturn()

        data = EpisodeData(envs.readHistory())
        loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True,
                            num_workers=2)

        # Train the network
        net.train()
        # Metrics to track
        value_loss_total = 0.
        entropy_total = 0.
        for states, actions, Returns in loader:
            states, actions, Returns = states.to(device), actions.to(
                device), Returns.to(device)
            values, policys = net(states)
            dist = Categorical(probs=policys)
            action_log_probs = dist.log_prob(actions).view(-1, 1)
            dist_entropy = dist.entropy().mean()  # we want a larger entropy to keep the model exploring

            advantages = Returns.view(-1, 1) - values

            value_loss = advantages.pow(2).mean()
            action_loss = -(advantages.detach() * action_log_probs).mean()

            optimizer.zero_grad()
            (VALUE_LOSS_COEF * value_loss + action_loss -
             ENTROPY_LOSS_COEF * dist_entropy).backward()
            optimizer.step()

            value_loss_total += value_loss.item()
            entropy_total += dist_entropy.item()

        print('Episode: {:>10d}, Value Loss: {:g}, Entropy: {:g}'.format(
            episode, value_loss_total / len(loader),
            entropy_total / len(loader)), flush=True)

        if episode != 0 and episode % SAVE_INTERVAL == 0:
            if not os.path.isdir('models'):
                os.mkdir('models')
            torch.save(net.state_dict(),
                       'models/{}.pt'.format(episode // SAVE_INTERVAL))
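# The per-square masking above can also be written as a single batched
# operation. A small illustration with a made-up legality mask follows; the
# batch size and the 64-square board are chosen only for the example.
import torch
from torch.distributions import Categorical

probs = torch.softmax(torch.randn(4, 64), dim=-1)   # 4 boards, 64 candidate squares
legal = torch.rand(4, 64) > 0.5                      # hypothetical legal-move mask

masked = (probs + 1e-8) * legal.float()              # zero illegal squares, avoid all-zero rows
actions = Categorical(probs=masked).sample()         # Categorical renormalises each row
print(actions)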
advantages = returns - values actor_loss = -(log_probs * advantages.detach()).mean() critic_loss = advantages.pow(2).mean() entropy_loss = entropies.mean() loss = args.actor_loss_coefficient * actor_loss + \ args.critic_loss_coefficient * critic_loss - \ args.entropy_loss_coefficient * entropy_loss optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(actor_critic.parameters(), args.max_grad_norm) optimizer.step() if len(rewards) > 1: end = time.time() total_num_steps = (episode_n + 1) * args.num_episodes * args.num_steps print("********************************************************") print("Episode: {0}, total steps: {1}".format( episode_n, total_num_steps)) print("Episode rewards: {:.1f}".format(np.sum(rewards))) print("Actor loss: {:.5f}, Critic loss: {:.5f}, Entropy: {:.5f}". format(actor_loss.item(), critic_loss.item(), entropy_loss.item())) print("********************************************************") if episode_n % args.storage_freq == 0: torch.save(actor_critic.state_dict(), args.storage_path + 'a2c_model.pt')
episode_length = 1 while True: if episode_length % steps == 0: model.low_lr(rate) if (episode_length % 1000 == 0) and (episode_length > 20000): if dataset == 'cifar': model.eval() map = test_util.test(Dtest, model, batch_size, bit_len) file = open(logpath, "a") file.write('#### map=' + str(map) + '\n') file.close() path = checkpoint_path + '/' + str(episode_length) + '.model' torch.save(model.state_dict(), path) model.train() if dataset == 'cifar': ori, pos, neg = traintest.get_batch_cifar_nus(batch_size) else: ori, pos, neg = traintest.get_batch_flk_nus(batch_size) ori = Variable(ori).cuda() pos = Variable(pos).cuda() neg = Variable(neg).cuda() hash_o = Variable(torch.zeros(batch_size, 1).cuda()) hash_p = Variable(torch.zeros(batch_size, 1).cuda()) hash_n = Variable(torch.zeros(batch_size, 1).cuda())
def train(args, scorer, summary_writer=None): device = args.device env = create_crop_env(args, scorer) model = ActorCritic(args).to(device) model.train() optimizer = optim.Adam(model.parameters(), lr=args.lr) # import pdb; pdb.set_trace(); training_log_file = open(os.path.join( args.model_save_path, 'training.log'), 'w') validation_log_file = open(os.path.join( args.model_save_path, 'validation.log'), 'w') training_log_file.write('Epoch,Cost\n') validation_log_file.write('Epoch,Cost\n') for train_iter in range(args.n_epochs): episode = BatchEpisodes(batch_size=args.batch_size, gamma=args.gamma, device=device) for _ in range(args.batch_size): done = True observation_np = env.reset() observations_np, rewards_np, actions_np, hs_ts, cs_ts = [], [], [], [], [] cx = torch.zeros(1, args.hidden_dim).to(device) hx = torch.zeros(1, args.hidden_dim).to(device) for step in range(args.num_steps): observations_np.append(observation_np[0]) hs_ts.append(hx) cs_ts.append(cx) with torch.no_grad(): observation_ts = torch.from_numpy(observation_np).to(device) value_ts, logit_ts, (hx, cx) = model((observation_ts, (hx, cx))) prob = F.softmax(logit_ts, dim=-1) action_ts = prob.multinomial(num_samples=1).detach() action_np = action_ts.cpu().numpy() actions_np.append(action_np[0][0]) observation_np, reward_num, done, _ = env.step(action_np) if step == args.num_steps - 1: reward_num = 0 if done else value_ts.item() rewards_np.append(reward_num) if done: break observations_np, actions_np, rewards_np = \ map(lambda x: np.array(x).astype(np.float32), [observations_np, actions_np, rewards_np]) episode.append(observations_np, actions_np, rewards_np, hs_ts, cs_ts) log_probs = [] values = [] entropys = [] for i in range(len(episode)): (hs_ts, cs_ts) = episode.hiddens[0][i], episode.hiddens[1][i] value_ts, logit_ts, (_, _) = model((episode.observations[i], (hs_ts, cs_ts))) prob = F.softmax(logit_ts, dim=-1) log_prob = F.log_softmax(logit_ts, dim=-1) entropy = -(log_prob * prob).sum(1) log_prob = log_prob.gather(1, episode.actions[i].unsqueeze(1).long()) log_probs.append(log_prob) entropys.append(entropy) values.append(value_ts) log_probs_ts = torch.stack(log_probs).squeeze(2) values_ts = torch.stack(values).squeeze(2) entropys_ts = torch.stack(entropys) advantages_ts = episode.gae(values_ts) advantages_ts = weighted_normalize(advantages_ts, weights=episode.mask) policy_loss = - weighted_mean(log_probs_ts * advantages_ts, dim=0, weights=episode.mask) # import pdb; pdb.set_trace(); value_loss = weighted_mean((values_ts - episode.returns).pow(2), dim=0, weights = episode.mask) entropy_loss = - weighted_mean(entropys_ts, dim=0, weights = episode.mask) optimizer.zero_grad() tot_loss = policy_loss + entropy_loss + args.value_loss_coef * value_loss tot_loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() print("Epoch [%2d/%2d] : Tot Loss: %5.5f, Policy Loss: %5.5f, Value Loss: %5.5f, Entropy Loss: %5.5f" % (train_iter, args.n_epochs, tot_loss.item(), policy_loss.item(), value_loss.item(), entropy_loss.item())) # print("Train_iter: ", train_iter, " Total Loss: ", tot_loss.item(), " Value Loss: ", value_loss.item(), " Policy Loss: ", policy_loss.item(), "Entropy Loss: ", entropy_loss.item()) if summary_writer: summary_writer.add_scalar('loss_policy', policy_loss.item(), train_iter) summary_writer.add_scalar('loss_value', value_loss.item(), train_iter) summary_writer.add_scalar('loss_entropy', entropy_loss.item(), train_iter) summary_writer.add_scalar('loss_tot', tot_loss.item(), 
train_iter) train_iter += 1 if (train_iter + 1) % args.save_per_epoch == 0: torch.save(model.state_dict(), os.path.join(args.model_save_path, 'model_{}_{}.pth').format(train_iter, tot_loss.item())) training_log_file.write('{},{}\n'.format(train_iter, tot_loss.item())) validation_log_file.write('{},{}\n'.format(train_iter, 0)) training_log_file.flush() validation_log_file.flush() training_log_file.close() validation_log_file.close()
class A3C(): '''Implementation of N-step Asychronous Advantage Actor Critic''' def __init__(self, args, env, train=True): self.args = args self.set_random_seeds() self.device = torch.device( 'cuda' if torch.cuda.is_available() else 'cpu') # Create the environment. self.env = gym.make(env) self.environment_name = env # Setup model. self.policy = ActorCritic(4, self.env.action_space.n) self.policy.apply(self.initialize_weights) # Setup critic model. self.critic = ActorCritic(4, self.env.action_space.n) self.critic.apply(self.initialize_weights) # Setup optimizer. self.eps = 1e-10 # To avoid divide-by-zero error. self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=args.policy_lr) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=args.critic_lr) # Model weights path. self.timestamp = datetime.now().strftime( 'a2c-breakout-%Y-%m-%d_%H-%M-%S') self.weights_path = 'models/%s/%s' % (self.environment_name, self.timestamp) # Load pretrained weights. if args.weights_path: self.load_model() self.policy.to(self.device) self.critic.to(self.device) # Video render mode. if args.render: self.policy.eval() self.generate_episode(render=True) self.plot() return # Data for plotting. self.rewards_data = [] # n * [epoch, mean(returns), std(returns)] # Network training mode. if train: # Tensorboard logging. self.logdir = 'logs/%s/%s' % (self.environment_name, self.timestamp) self.summary_writer = SummaryWriter(self.logdir) # Save hyperparameters. with open(self.logdir + '/training_parameters.json', 'w') as f: json.dump(vars(self.args), f, indent=4) def initialize_weights(self, layer): if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d): nn.init.xavier_uniform_(layer.weight) nn.init.zeros_(layer.bias) def set_random_seeds(self): torch.manual_seed(self.args.random_seed) np.random.seed(self.args.random_seed) torch.backends.cudnn.benchmark = True def save_model(self, epoch): '''Helper function to save model state and weights.''' if not os.path.exists(self.weights_path): os.makedirs(self.weights_path) torch.save( { 'policy_state_dict': self.policy.state_dict(), 'policy_optimizer': self.policy_optimizer.state_dict(), 'critic_state_dict': self.critic.state_dict(), 'critic_optimizer': self.critic_optimizer.state_dict(), 'rewards_data': self.rewards_data, 'epoch': epoch }, os.path.join(self.weights_path, 'model_%d.h5' % epoch)) def load_model(self): '''Helper function to load model state and weights. ''' if os.path.isfile(self.args.weights_path): print('=> Loading checkpoint', self.args.weights_path) self.checkpoint = torch.load(self.args.weights_path) self.policy.load_state_dict(self.checkpoint['policy_state_dict']) self.policy_optimizer.load_state_dict( self.checkpoint['policy_optimizer']) self.critic.load_state_dict(self.checkpoint['critic_state_dict']) self.critic_optimizer.load_state_dict( self.checkpoint['critic_optimizer']) self.rewards_data = self.checkpoint['rewards_data'] else: raise Exception('No checkpoint found at %s' % self.args.weights_path) def train(self): '''Trains the model on a single episode using REINFORCE.''' for epoch in range(self.args.num_episodes): # Generate epsiode data. returns, log_probs, value_function, train_rewards = self.generate_episode( ) self.summary_writer.add_scalar('train/cumulative_rewards', train_rewards, epoch) self.summary_writer.add_scalar('train/trajectory_length', returns.size()[0], epoch) # Compute loss and policy gradient. 
self.policy_optimizer.zero_grad() policy_loss = ((returns - value_function.detach()) * -log_probs).mean() policy_loss.backward() self.policy_optimizer.step() self.critic_optimizer.zero_grad() critic_loss = F.mse_loss(returns, value_function) critic_loss.backward() self.critic_optimizer.step() # Test the model. if epoch % self.args.test_interval == 0: self.policy.eval() print('\nTesting') rewards = [ self.generate_episode(test=True) for epoch in range(self.args.test_episodes) ] rewards_mean, rewards_std = np.mean(rewards), np.std(rewards) print( 'Test Rewards (Mean): %.3f | Test Rewards (Std): %.3f\n' % (rewards_mean, rewards_std)) self.rewards_data.append([epoch, rewards_mean, rewards_std]) self.summary_writer.add_scalar('test/rewards_mean', rewards_mean, epoch) self.summary_writer.add_scalar('test/rewards_std', rewards_std, epoch) self.policy.train() # Logging. if epoch % self.args.log_interval == 0: print( 'Epoch: {0:05d}/{1:05d} | Policy Loss: {2:.3f} | Value Loss: {3:.3f}' .format(epoch, self.args.num_episodes, policy_loss, critic_loss)) self.summary_writer.add_scalar('train/policy_loss', policy_loss, epoch) self.summary_writer.add_scalar('train/critic_loss', critic_loss, epoch) # Save the model. if epoch % self.args.save_interval == 0: self.save_model(epoch) self.save_model(epoch) self.summary_writer.close() def generate_episode(self, gamma=0.99, test=False, render=False, max_iters=10000): ''' Generates an episode by executing the current policy in the given env. Returns: - a list of states, indexed by time epoch - a list of actions, indexed by time epoch - a list of cumulative discounted returns, indexed by time epoch ''' iters = 0 done = False state = self.env.reset() # Set video save path if render enabled. if render: save_path = 'videos/%s/epoch-%s' % (self.environment_name, self.checkpoint['epoch']) if not os.path.exists(save_path): os.makedirs(save_path) monitor = gym.wrappers.Monitor(self.env, save_path, force=True) batches = [] states = [torch.zeros(84, 84, device=self.device).float()] * 3 rewards, returns = [], [] actions, log_probs = [], [] while not done: # Run policy on current state to log probabilities of actions. states.append( torch.tensor(preprocess(state), device=self.device).float().squeeze(0)) batches.append(torch.stack(states[-4:])) action_probs = self.policy.forward( batches[-1].unsqueeze(0)).squeeze(0) # Sample action from the log probabilities. if test and self.args.det_eval: action = torch.argmax(action_probs) else: action = torch.argmax( torch.distributions.Multinomial( logits=action_probs).sample()) actions.append(action) log_probs.append(action_probs[action]) # Run simulation with current action to get new state and reward. if render: monitor.render() state, reward, done, _ = self.env.step(action.cpu().numpy()) rewards.append(reward) # Break if the episode takes too long. iters += 1 if iters > max_iters: break # Save video and close rendering. cum_rewards = np.sum(rewards) if render: monitor.close() print('\nCumulative Rewards:', cum_rewards) return # Return cumulative rewards for test mode. if test: return cum_rewards # Flip rewards from T-1 to 0. rewards = np.array(rewards) / self.args.reward_normalizer # Compute value. values = [] minibatches = torch.split(torch.stack(batches), 256) for minibatch in minibatches: values.append( self.critic.forward(minibatch, action=False).squeeze(1)) values = torch.cat(values) discounted_values = values * gamma**self.args.n # Compute the cumulative discounted returns. 
n_step_rewards = np.zeros((1, self.args.n)) for i in reversed(range(rewards.shape[0])): if i + self.args.n >= rewards.shape[0]: V_end = 0 else: V_end = discounted_values[i + self.args.n] n_step_rewards[0, :-1] = n_step_rewards[0, 1:] * gamma n_step_rewards[0, -1] = rewards[i] n_step_return = torch.tensor( n_step_rewards.sum(), device=self.device).unsqueeze(0) + V_end returns.append(n_step_return) # Normalize returns. # returns = torch.stack(returns) # mean_return, std_return = returns.mean(), returns.std() # returns = (returns - mean_return) / (std_return + self.eps) return torch.stack(returns[::-1]).detach().squeeze(1), torch.stack( log_probs), values.squeeze(), cum_rewards def plot(self): # Save the plot. filename = os.path.join( 'plots', *self.args.weights_path.split('/')[-2:]).replace('.h5', '.png') if not os.path.exists(os.path.dirname(filename)): os.makedirs(os.path.dirname(filename)) # Make error plot with mean, std of rewards. data = np.asarray(self.rewards_data) plt.errorbar(data[:, 0], data[:, 1], data[:, 2], lw=2.5, elinewidth=1.5, ecolor='grey', barsabove=True, capthick=2, capsize=3) plt.title('Cumulative Rewards (Mean/Std) Plot for A3C Algorithm') plt.xlabel('Number of Episodes') plt.ylabel('Cumulative Rewards') plt.grid() plt.savefig(filename, dpi=300) plt.show()
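# The buffer-shifting loop above accumulates n-step returns in reverse. The
# quantity it approximates for a single index t is easier to read written out
# directly; the helper below is purely illustrative and uses made-up numbers.
def n_step_return(rewards, values, t, n, gamma=0.99):
    # R_t = sum_{k<n} gamma^k * r_{t+k} + gamma^n * V(s_{t+n}), truncated at episode end.
    T = len(rewards)
    ret = 0.0
    for k in range(n):
        if t + k >= T:
            break
        ret += (gamma ** k) * rewards[t + k]
    if t + n < T:
        ret += (gamma ** n) * values[t + n]
    return ret

rewards = [1.0, 0.0, 0.0, 1.0, 0.0]
values = [0.9, 0.8, 0.7, 0.6, 0.5]
print(n_step_return(rewards, values, t=1, n=3))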
def train(rank, shared_model, optimizer): """ :param rank: worker-ID :param shared_model: model to sync between workers :param optimizer: :return: """ # torch.manual_seed(SEED + rank) ac_steps = 20 max_episode_length = 10000 gamma = 0.99 tau = 1.0 max_grad_norm = 50.0 checkpoint_n = 20 env = create_atari_env(romname) env.seed(SEED + rank) state = env.reset() state = Variable(torch.from_numpy(state).unsqueeze(0).type(FloatTensor), requires_grad=False) model = ActorCritic(env.observation_space.shape[0], env.action_space) t = 0 done = True episodes = 0 reward_sum = 0 reward_sum1 = 0 start_time = time.time() best_reward = -999 isbest = 0 cx = hx = None while True: model.load_state_dict(shared_model.state_dict()) if done: # need to reset LSTM cell's input cx = Variable(torch.zeros(1, 256)).type(FloatTensor) hx = Variable(torch.zeros(1, 256)).type(FloatTensor) else: cx = Variable(cx.data) hx = Variable(hx.data) # basically this is to detach from previous comp graph states = [] values = [] log_probs = [] rewards = [] entropies = [] for i in range(ac_steps): t += 1 v, logit, (hx, cx) = model((state, (hx, cx))) states.append(state) prob = F.softmax(logit) log_prob = F.log_softmax(logit) entropy = -(log_prob * prob).sum(1, keepdim=True) entropies.append(entropy) action = prob.multinomial().detach() # detach -- so the backprob will NOT go through multinomial() log_prob = log_prob.gather(1, action) action = action.data[0, 0] state, reward, done, _ = env.step(action) reward_sum += reward reward_sum1 += reward done = done or t >= max_episode_length if done: t_ = t t = 0 state = env.reset() episodes += 1 if episodes % 10 == 0: time_str = time.strftime( "%Hh %Mm %Ss", time.gmtime(time.time() - start_time)) print("Time {}, worker-{} episode {} " "mean episode reward {}, " "episode length {}". format(time_str, rank, episodes, reward_sum / 10.0, t_)) reward_sum = 0.0 if episodes % checkpoint_n == 0: ave_reward = reward_sum1 / checkpoint_n if best_reward < ave_reward: isbest = 1 best_reward = ave_reward print("Saving checkpoint Time {}, worker-{} episode {} " "mean episode reward {}, " "episode length {} best_reward {}". 
format(get_elapsed_time_str(), rank, episodes, ave_reward, t_, best_reward)) checkpoint_fname = os.path.join( args.savedir, args.rom + '_worker' + str(rank) + '_' + str(episodes)) save_checkpoint({'epoch': episodes, 'average_reward': ave_reward, 'time': time.time(), 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), }, isbest, checkpoint_fname) reward_sum1 = 0.0 state = Variable(torch.from_numpy(state).unsqueeze(0).type(FloatTensor), requires_grad=False) reward = max(min(reward, 1), -1) values.append(v) log_probs.append(log_prob) rewards.append(reward) if done: break # We reach here because either # i) an episode ends, such as game over # ii) we have explored certain steps into the future and now it is # time to look-back and summerise the if done: R = torch.zeros(1, 1).type(FloatTensor) else: value, _, _ = model((state, (hx, cx))) R = value.data values.append(Variable(R)) critic_loss = 0 actor_loss = 0 R = Variable(R) gae = 0 for i in reversed(range(len(rewards))): R = gamma * R + rewards[i] advantage = R - values[i] # type: Variable critic_loss += 0.5 * advantage.pow(2) td_error = rewards[i] + gamma * values[i + 1].data - values[i].data gae = gae * gamma * tau + td_error actor_loss -= (Variable(gae) * log_probs[i] + 0.01 * entropies[i]) optimizer.zero_grad() total_loss = actor_loss + critic_loss * 0.5 # type: Variable total_loss.backward() # error occur torch.nn.utils.clip_grad_norm(model.parameters(), max_grad_norm) ensure_shared_grads(model, shared_model) optimizer.step()
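# Both A3C workers above finish by calling ensure_shared_grads, which is not
# defined in this section. In A3C-style code it typically copies the worker's
# freshly computed gradients onto the shared model before the shared optimizer
# steps; a common sketch of that helper:
def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        # If the shared gradients are already populated, leave them alone.
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad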
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()

    (f, ckpt_path), (log_dir, ckpt_dir) = setup(args)
    if args.task == 'eval':
        env = wrappers.Monitor(env,
                               '/tmp/{}-experiment'.format(args.env_name),
                               force=True)

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    if args.task == 'eval':
        reward_list = []
    done = True
    #env = wrappers.Monitor(env, '/tmp/{}-experiment'.format(args.env_name), force=True)

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_i = 0
    episode_length = 0
    try:
        while True:
            episode_length += 1
            # Sync with the shared model
            if done:
                model.load_state_dict(shared_model.state_dict())
                cx = Variable(torch.zeros(1, 128), volatile=True)
                hx = Variable(torch.zeros(1, 128), volatile=True)
            else:
                cx = Variable(cx.data, volatile=True)
                hx = Variable(hx.data, volatile=True)

            # for mujoco, env returns DoubleTensor
            value, mu, sigma_sq, (hx, cx) = model(
                (Variable(state.float().unsqueeze(0).float()), (hx, cx)))
            sigma_sq = F.softplus(sigma_sq)
            eps = torch.randn(mu.size())
            # calculate the probability
            action = (mu + sigma_sq.sqrt() * Variable(eps)).data
            state, reward, done, _ = env.step(action[0, 0])
            if args.display:
                env.render()
            done = done or episode_length >= args.max_episode_length
            reward_sum += reward

            # a quick hack to prevent the agent from getting stuck
            actions.append(action[0, 0])
            if actions.count(actions[0]) == actions.maxlen:
                done = True

            if done:
                episode_i += 1
                if args.task == 'eval':
                    reward_list.append(reward_sum)
                if args.task == 'eval' and episode_i >= 100:
                    print("Testing over %d episodes, Average reward = %f" %
                          (episode_i, sum(reward_list) / episode_i))
                    break
                if episode_i % args.save_freq == 0:
                    torch.save(model.state_dict(),
                               os.path.join(ckpt_dir, args.env_name + "." +
                                            args.model_name + "." +
                                            str(episode_i) + ".pkl"))
                info_str = "Time {}, episode reward {}, episode length {}".format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, episode_length)
                print(info_str)
                f.write(info_str + '\n')
                reward_sum = 0
                episode_length = 0
                actions.clear()
                state = env.reset()
                if args.task == 'train':
                    time.sleep(60)

            state = torch.from_numpy(state)
    except KeyboardInterrupt:
        env.close()
        f.close()
        torch.save(model.state_dict(), ckpt_path)
def main(args): current_dir = os.path.abspath('.') exp_dir = current_dir + '/results/exp/' model_dir = current_dir + '/results/model/' os.makedirs(exp_dir, exist_ok=True) os.makedirs(model_dir, exist_ok=True) writer = SummaryWriter(exp_dir) torch.manual_seed(args.seed) np.random.seed(args.seed) if args.device == 'cuda': torch.cuda.manual_seed(args.seed) sampler = MemorySampler(args) num_inputs, num_actions = sampler.get_space network = ActorCritic(num_inputs, num_actions, layer_norm=args.layer_norm) optimizer = opt.Adam(network.parameters(), lr=args.lr) clip_now = args.clip for i_episode in range(args.num_episode): # step1: perform current policy to collect trajectories # this is an on-policy method! memory = sampler.sample(network) # step2: extract variables from trajectories batch = memory.sample() batch_size = len(memory) rewards = torch.Tensor(batch.reward) values = torch.Tensor(batch.value) masks = torch.Tensor(batch.mask) actions = torch.Tensor(batch.action) observations = torch.Tensor(batch.observation) oldlogproba = torch.Tensor(batch.logproba) returns = torch.Tensor(batch_size) deltas = torch.Tensor(batch_size) advantages = torch.Tensor(batch_size) prev_return = 0 prev_value = 0 prev_advantage = 0 for i in reversed(range(batch_size)): returns[i] = rewards[i] + args.gamma * prev_return * masks[i] deltas[i] = rewards[ i] + args.gamma * prev_value * masks[i] - values[i] # ref: https://arxiv.org/pdf/1506.02438.pdf (generalization advantage estimate) advantages[i] = deltas[ i] + args.gamma * args.lamda * prev_advantage * masks[i] prev_return = returns[i] prev_value = values[i] prev_advantage = advantages[i] if args.advantage_norm: advantages = (advantages - advantages.mean()) / (advantages.std() + args.EPS) observations = observations.to(args.device) actions = actions.to(args.device) oldlogproba = oldlogproba.to(args.device) advantages = advantages.to(args.device) returns = returns.to(args.device) for i_epoch in range( int(args.num_epoch * batch_size / args.minibatch_size)): # sample from current batch minibatch_ind = np.random.choice(batch_size, args.minibatch_size, replace=False) minibatch_observations = observations[minibatch_ind] minibatch_actions = actions[minibatch_ind] minibatch_oldlogproba = oldlogproba[minibatch_ind] minibatch_newlogproba, entropy = network.get_logproba( minibatch_observations, minibatch_actions) minibatch_advantages = advantages[minibatch_ind] minibatch_returns = returns[minibatch_ind] minibatch_newvalues = network._forward_critic( minibatch_observations).flatten() assert minibatch_oldlogproba.shape == minibatch_newlogproba.shape ratio = torch.exp(minibatch_newlogproba - minibatch_oldlogproba) assert ratio.shape == minibatch_advantages.shape surr1 = ratio * minibatch_advantages surr2 = ratio.clamp(1 - clip_now, 1 + clip_now) * minibatch_advantages loss_surr = -torch.mean(torch.min(surr1, surr2)) # not sure the value loss should be clipped as well # clip example: https://github.com/Jiankai-Sun/Proximal-Policy-Optimization-in-Pytorch/blob/master/ppo.py # however, it does not make sense to clip score-like value by a dimensionless clipping parameter # moreover, original paper does not mention clipped value if args.lossvalue_norm: minibatch_return_6std = 6 * minibatch_returns.std() loss_value = torch.mean( (minibatch_newvalues - minibatch_returns).pow(2)) / minibatch_return_6std else: loss_value = torch.mean( (minibatch_newvalues - minibatch_returns).pow(2)) # loss_entropy = torch.mean(torch.exp(minibatch_newlogproba) * minibatch_newlogproba) loss_entropy = 
-torch.mean(entropy) total_loss = loss_surr + args.loss_coeff_value * loss_value + args.loss_coeff_entropy * loss_entropy optimizer.zero_grad() total_loss.backward() optimizer.step() if args.schedule_clip == 'linear': ep_ratio = 1 - (i_episode / args.num_episode) clip_now = args.clip * ep_ratio if args.schedule_adam == 'linear': ep_ratio = 1 - (i_episode / args.num_episode) lr_now = args.lr * ep_ratio # set learning rate # ref: https://stackoverflow.com/questions/48324152/ for g in optimizer.param_groups: g['lr'] = lr_now if i_episode % args.log_num_episode == 0: mean_reward = (torch.sum(rewards) / memory.num_episode).data mean_step = len(memory) // memory.num_episode print('Finished episode: {} | Reward: {:.4f} | total_loss = {:.4f} = {:.4f} + {} * {:.4f} + {} * {:.4f}' \ .format(i_episode, mean_reward, total_loss.cpu().data, loss_surr.cpu().data, args.loss_coeff_value, loss_value.cpu().data, args.loss_coeff_entropy, loss_entropy.cpu().data), end=' | ') print('Step: {:d}'.format(mean_step)) writer.add_scalar('reward', mean_reward, i_episode) writer.add_scalar('total_loss', total_loss.cpu().data, i_episode) torch.save(network.state_dict(), model_dir + 'network_{}.pth'.format(i_episode)) sampler.close()
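# The linear schedules above shrink both the clipping range and the learning
# rate as training proceeds. The same idea as a tiny standalone helper (the
# name and the example numbers are illustrative only):
def linear_anneal(initial_value, i_episode, num_episode):
    # Decays linearly from initial_value at episode 0 to 0 at num_episode.
    return initial_value * (1 - i_episode / num_episode)

# e.g. clip_now = linear_anneal(args.clip, i_episode, args.num_episode)
# for g in optimizer.param_groups:
#     g['lr'] = linear_anneal(args.lr, i_episode, args.num_episode)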
def test(rank, args, shared_model): torch.manual_seed(args.seed + rank) env = create_atari_env(args.env_name) env.seed(args.seed + rank) model = ActorCritic(env.observation_space.shape[0], env.action_space.n, args.lstm_size) model.eval() state = env.reset() state = torch.from_numpy(state) reward_sum = 0 done = True start_time = time.time() #actions=deque(maxlen=100) episode_length = 0 currentPath = os.getcwd() File = open(currentPath + '/record.txt', 'a+') print("\n\n\n\n------------------------------\n\n\n\n\n") File.write("\n\n\n\n------------------------------\n\n\n\n\n") File.close() cnt = 0 episode_number = 0 while True: env.render() cnt = cnt + 1 episode_length += 1 if done: model.load_state_dict(shared_model.state_dict()) hx = Variable(torch.zeros(1, args.lstm_size), volatile=True) cx = Variable(torch.zeros(1, args.lstm_size), volatile=True) else: hx = Variable(hx.data, volatile=True) cx = Variable(cx.data, volatile=True) #print(state) value, logit, (hx, cx) = model((Variable(state.unsqueeze(0), volatile=True), (hx, cx))) prob = F.softmax(logit) #action=prob.max(1)[1].data.numpy() action = prob.multinomial().data #if(args.env_name=='Breakout-v3'): # state,reward,done,_=env.step(1) # reward_sum+=reward #state,reward,done,_ =env.step(action[0,0]) state, reward, done, _ = env.step(action.numpy()) done = done #or episode_length >= args.max_episode_length if episode_length >= args.max_episode_length: done = True reward_sum -= 30 reward_sum += reward #actions.append(action[0,0]) #if actions.count(actions[0])==actions.maxlen: # done=True #if reward!=0: # print("ep %d : game finished,reward: %d " %(episode_number,reward))+('' if reward == #-1 else ' !!!!!!!!') if done: hour = int( time.strftime("%H", time.gmtime(time.time() - start_time))) _min = int( time.strftime("%M", time.gmtime(time.time() - start_time))) print("Time {},episode reward {}, episode length {} ".format( hour * 60 + _min + args.starttime, reward_sum, episode_length)) File = open(currentPath + '/record.txt', 'a+') File.write( "Time {},episode reward {}, episode length {} \n".format( hour * 60 + _min + args.starttime, reward_sum, episode_length)) File.close() reward_sum = 0 episode_length = 0 #actions.clear() state = env.reset() torch.save(model.state_dict(), currentPath + '/A3C.t7') episode_number += 1 time.sleep(60) state = torch.from_numpy(state)
if args.checkpoint_path and os.path.isfile(args.checkpoint_path): checkpoint = torch.load(args.checkpoint_path) counter.value = checkpoint['episodes'] shared_model.load_state_dict(checkpoint['model']) shared_model.share_memory() optimizer.load_state_dict(checkpoint['optimizer']) optimizer.share_memory() else: checkpoint = {} processes = [] logging = build_logger( lambda: dict(episodes=counter.value, model=shared_model.state_dict(), optimizer=optimizer.state_dict()), checkpoint, args.run, args.visdom_port) p = mp.Process(target=test, args=(args.num_processes, args, shared_model, (counter, steps, args.max_test_episodes), logging, kill)) p.start() processes.append(p) for rank in range(0, args.num_processes): p = mp.Process(target=train, args=(rank, args, shared_model, (counter, steps), lock, optimizer, logging, kill)) p.start()
def train(training_scene, train_object, rank, shared_model, scheduler, counter, lock, config, arguments=dict(), optimizer=None): torch.manual_seed(arguments['seed'] + rank) # To prevent out of memory if (arguments['train_cnn'] and rank < 10): arguments.update({"gpu_ids": [-1]}) gpu_id = arguments['gpu_ids'][rank % len(arguments['gpu_ids'])] if gpu_id >= 0: torch.cuda.manual_seed(arguments['seed'] + rank) if optimizer is None: optimizer = optim.RMSprop(shared_model.parameters(), lr=arguments['lr'], alpha=0.99, eps=0.1) env = AI2ThorDumpEnv(training_scene, train_object, config, arguments, seed=arguments['seed'] + rank) state, score, target = env.reset() starting = env.current_state_id done = True print("Done initalizing process {}. Now find {} in {}! Use gpu: {}".format( rank, env.target, env.scene, 'yes' if gpu_id >= 0 else 'no')) model = ActorCritic(config, arguments, gpu_id) if gpu_id >= 0: with torch.cuda.device(gpu_id): model = model.cuda() dtype = torch.cuda.FloatTensor else: dtype = torch.FloatTensor model.train() # monitoring total_reward_for_num_steps_list = [] redundancies = [] success = [] avg_entropies = [] learning_rates = [] dist_to_goal = [] start = time.time() episode_length = 0 for epoch in range(arguments['num_epochs']): # Sync with the shared model if gpu_id >= 0: with torch.cuda.device(gpu_id): model.load_state_dict(shared_model.state_dict()) else: model.load_state_dict(shared_model.state_dict()) if arguments['lstm']: if done: cx = torch.zeros(1, 512).type(dtype) hx = torch.zeros(1, 512).type(dtype) else: cx = cx.detach() hx = hx.detach() if scheduler is not None: scheduler.step() learning_rates.append(optimizer.param_groups[0]['lr']) values = [] log_probs = [] rewards = [] entropies = [] starting = env.current_state_id dist_to_goal.append( min([env.shortest[starting][t] for t in env.target_ids])) for step in range(arguments['num_iters']): episode_length += 1 if arguments['lstm']: value, logit, (hx, cx) = model((state, (hx, cx)), score, target) else: value, logit = model(state, score, target) prob = F.softmax(logit, dim=-1) log_prob = F.log_softmax(logit, dim=-1) entropy = -(log_prob * prob).sum(1, keepdim=True) entropies.append(entropy) action = prob.multinomial(num_samples=1).detach() log_prob = log_prob.gather(1, action) action_int = action.cpu().numpy()[0][0].item() state, score, reward, done = env.step(action_int) if done: success.append(1) elif episode_length >= arguments['max_episode_length']: success.append(0) done = done or episode_length >= arguments['max_episode_length'] with lock: counter.value += 1 values.append(value) log_probs.append(log_prob) rewards.append(reward) ending = env.current_state_id if done: state, score, target = env.reset() print('[P-{}] Epoch: {}. Episode length: {}. Total reward: {:.3f}. Time elapsed: {:.3f}'\ .format(rank, epoch + 1, episode_length, sum(rewards), (time.time() - start) / 3600)) episode_length = 0 break if not done: success.append(0) # No interaction with environment below. # Monitoring total_reward_for_num_steps_list.append(sum(rewards)) redundancies.append(step + 1 - env.shortest[ending, starting]) avg_entropies.append(torch.tensor(entropies).numpy().mean()) # Backprop and optimisation R = torch.zeros(1, 1) if not done: # to change last reward to predicted value to .... 
if arguments['lstm']: value, _, (hx, cx) = model((state, (hx, cx)), score, target) else: value, _ = model(state, score, target) R = value.detach() if gpu_id >= 0: with torch.cuda.device(gpu_id): R = R.cuda() values.append(R) policy_loss = 0 value_loss = 0 gae = torch.zeros(1, 1) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = gae.cuda() for i in reversed(range(len(rewards))): R = arguments['gamma'] * R + rewards[i] advantage = R - values[i] value_loss = value_loss + 0.5 * advantage.pow(2) if arguments['use_gae']: # Generalized Advantage Estimation delta_t = rewards[i] + arguments['gamma'] * values[ i + 1] - values[i] gae = gae * arguments['gamma'] * arguments['tau'] + delta_t policy_loss = policy_loss - log_probs[i] * gae.detach() - \ arguments['ec'] * entropies[i] optimizer.zero_grad() (policy_loss + arguments['vc'] * value_loss).backward() torch.nn.utils.clip_grad_norm_(model.parameters(), arguments['max_grad_norm']) ensure_shared_grads(model, shared_model, gpu=gpu_id >= 0) optimizer.step() if (epoch + 1) % 1000 == 0 and np.mean(success[-500:]) >= 0.8 and \ not os.path.isfile("training-history/{}/net_good.pth".format(arguments['about'])): torch.save( model.state_dict(), "training-history/{}/net_good.pth".format(arguments['about'])) if (epoch + 1) % 2000 == 0: with open( 'training-history/{}/{}_{}_{}.pkl'.format( arguments['about'], training_scene, train_object, rank), 'wb') as f: pickle.dump( { "rewards": total_reward_for_num_steps_list, "dist_to_goal": dist_to_goal, "success_rate": success, 'redundancies': redundancies, "entropies": avg_entropies, 'lrs': learning_rates }, f, pickle.HIGHEST_PROTOCOL) torch.save( model.state_dict(), "training-history/{}/net_{}.pth".format(arguments['about'], train_object))
def __init__(self, model: ActorCritic, shared_model: ActorCritic): self.model = model self.shared_model = shared_model self.model.load_state_dict(shared_model.state_dict())
def test(rank, args, T, shared_model): torch.manual_seed(args.seed + rank) env = gym.make(args.env) env.seed(args.seed + rank) model = ActorCritic(env.observation_space, env.action_space, args.hidden_size) model.eval() save_dir = os.path.join('results', args.name) can_test = True # Test flag t_start = 1 # Test step counter to check against global counter rewards, steps = [], [] # Rewards and steps for plotting l = str(len(str(args.T_max))) # Max num. of digits for logging steps done = True # Start new episode # stores step, reward, avg_steps and time results_dict = {'t': [], 'reward': [], 'avg_steps': [], 'time': []} while T.value() <= args.T_max: if can_test: t_start = T.value() # Reset counter # Evaluate over several episodes and average results avg_rewards, avg_episode_lengths = [], [] for _ in range(args.evaluation_episodes): while True: # Reset or pass on hidden state if done: # Sync with shared model every episode model.load_state_dict(shared_model.state_dict()) hx = torch.zeros(1, args.hidden_size) cx = torch.zeros(1, args.hidden_size) # Reset environment and done flag state = state_to_tensor(env.reset()) done, episode_length = False, 0 reward_sum = 0 # Optionally render validation states if args.render: env.render() # Calculate policy with torch.no_grad(): policy, _, _, (hx, cx), _ = model(state, (hx, cx)) # Choose action greedily action = policy.max(1)[1][0] # Step state, reward, done, _ = env.step(action.item()) state = state_to_tensor(state) reward_sum += reward done = done or episode_length >= args.max_episode_length # Stop episodes at a max length episode_length += 1 # Increase episode counter # Log and reset statistics at the end of every episode if done: avg_rewards.append(reward_sum) avg_episode_lengths.append(episode_length) break print(('[{}] Step: {:<' + l + '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format( datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3], t_start, sum(avg_rewards) / args.evaluation_episodes, sum(avg_episode_lengths) / args.evaluation_episodes)) fields = [ t_start, sum(avg_rewards) / args.evaluation_episodes, sum(avg_episode_lengths) / args.evaluation_episodes, str(datetime.now()) ] # storing data in the dictionary. results_dict['t'].append(t_start) results_dict['reward'].append( sum(avg_rewards) / args.evaluation_episodes) results_dict['avg_steps'].append( sum(avg_episode_lengths) / args.evaluation_episodes) results_dict['time'].append(str(datetime.now())) # Dumping the results in pickle format with open(os.path.join(save_dir, 'results.pck'), 'wb') as f: pickle.dump(results_dict, f) # Saving the data in csv format with open(os.path.join(save_dir, 'results.csv'), 'a') as f: writer = csv.writer(f) writer.writerow(fields) if args.evaluate: return rewards.append(avg_rewards) # Keep all evaluations steps.append(t_start) plot_line(steps, rewards, save_dir) # Plot rewards torch.save(model.state_dict(), os.path.join(save_dir, 'model.pth')) # Save model params # torch.save(model.state_dict(), os.path.join(save_dir, 'model_{}.pth'.format(t_start))) # Save model params can_test = False # Finish testing else: if T.value() - t_start >= args.evaluation_interval: can_test = True time.sleep(0.001) # Check if available to test every millisecond # Dumping the results in pickle format with open(os.path.join(save_dir, 'results.pck'), 'wb') as f: pickle.dump(results_dict, f) env.close()
def test(rank, args, shared_model, shared_curiosity, counter, pids, optimizer, train_policy_losses, train_value_losses, train_rewards): models_dir = os.path.join(args.sum_base_dir, 'models') if not os.path.exists(models_dir): logging.info("Created models dir") os.makedirs(models_dir) recordings_dir = os.path.join(args.sum_base_dir, 'recordings') if (not os.path.exists(recordings_dir)) and (args.game == 'doom'): logging.info("Created recordings dir") os.makedirs(recordings_dir) videos_dir = os.path.join(args.sum_base_dir, 'videos') if (not os.path.exists(videos_dir)) and (args.game == 'atari'): logging.info("Created videos dir") os.makedirs(videos_dir) torch.manual_seed(args.seed + rank) if args.game == 'doom': env = create_doom_env(args.env_name, rank, num_skip=args.num_skip, num_stack=args.num_stack) env.set_recordings_dir(recordings_dir) logging.info("Set recordings dir") env.seed(args.seed + rank) elif args.game == 'atari': env_to_wrap = create_atari_env(args.env_name) env_to_wrap.seed(args.seed + rank) env = env_to_wrap elif args.game == 'picolmaze': env_to_wrap = create_picolmaze_env(args.num_rooms) env_to_wrap.seed(args.seed + rank) env = env_to_wrap env.step(0) model = ActorCritic( # env.observation_space.shape[0], args.num_stack, env.action_space) curiosity = IntrinsicCuriosityModule( # ICM # env.observation_space.shape[0], args.num_stack, env.action_space) model.eval() curiosity.eval() # ICM external_reward_sum = 0 curiosity_reward_sum = 0 # ICM curiosity_reward_sum_clipped = 0 # ICM inv_loss = torch.tensor(0.0) # ICM forw_loss = torch.tensor(0.0) # ICM curiosity_loss = 0 # ICM done = True count_done = 0 start_time = time.time() passed_time = 0 current_counter = 0 # a quick hack to prevent the agent from stucking # actions = deque(maxlen=100) actions = deque(maxlen=args.max_episode_length_test) episode_length = 0 while True: episode_length += 1 if done: passed_time = time.time() - start_time current_counter = counter.value # Sync with the shared model model.load_state_dict(shared_model.state_dict()) curiosity.load_state_dict(shared_curiosity.state_dict()) # ICM cx = torch.zeros(1, 256) hx = torch.zeros(1, 256) if count_done % args.save_video_again_eps == 0: if args.game == 'atari': video_dir = os.path.join( videos_dir, 'video_' + time.strftime('%Y.%m.%d-%H.%M.%S_') + str(current_counter)) if not os.path.exists(video_dir): os.makedirs(video_dir) logging.info("Created new video dir") env = wrappers.Monitor(env_to_wrap, video_dir, force=False) logging.info("Created new wrapper") elif args.game == 'doom': env.set_current_counter(current_counter) env.set_record() logging.info("Set new recording") state = env.reset() state = torch.from_numpy(state) else: cx = cx.detach() hx = hx.detach() with torch.no_grad(): value, logit, (hx, cx) = model(state.unsqueeze(0), hx, cx) prob = F.softmax(logit, dim=-1) action = prob.max(1, keepdim=True)[1].flatten().detach() state_old = state # ICM state, external_reward, done, _ = env.step(action) state = torch.from_numpy(state) # external reward = 0 if ICM-only mode # external_reward = external_reward * (1 - args.icm_only) external_reward_sum += external_reward # <---ICM--- inv_out, forw_out, curiosity_reward = \ curiosity( state_old.unsqueeze(0), action, state.unsqueeze(0)) # In noreward-rl: # self.invloss = tf.reduce_mean( # tf.nn.sparse_softmax_cross_entropy_with_logits(logits, aindex), # name="invloss") # self.forwardloss = 0.5 * tf.reduce_mean(tf.square(tf.subtract(f, phi2)), name='forwardloss') # self.forwardloss = self.forwardloss * 288.0 # 
# lenFeatures=288. Factored out to make hyperparams not depend on it. current_inv_loss = F.nll_loss(F.log_softmax(inv_out, dim=-1), action) current_forw_loss = curiosity_reward inv_loss += current_inv_loss forw_loss += current_forw_loss curiosity_reward = args.eta * curiosity_reward curiosity_reward_sum += curiosity_reward.detach() curiosity_reward_sum_clipped += \ max(min(curiosity_reward.detach(), args.clip), -args.clip) # ---ICM---> done = done or episode_length >= args.max_episode_length # a quick hack to prevent the agent from stucking actions.append(action) if actions.count(actions[0]) == actions.maxlen: done = True if done: # <---ICM--- inv_loss = inv_loss / episode_length forw_loss = forw_loss * (32 * 3 * 3) * 0.5 / episode_length curiosity_loss = args.lambda_1 * ( (1 - args.beta) * inv_loss + args.beta * forw_loss) # ---ICM---> train_policy_loss_mean = sum(train_policy_losses) / \ len(train_policy_losses) train_value_loss_mean = sum(train_value_losses) / \ len(train_value_losses) train_rewards_mean = sum(train_rewards) / \ len(train_rewards) logging.info( "\n\nEp {:3d}: time {}, num steps {}, FPS {:.0f}, len {},\n" " total R {:.6f}, train policy loss {:.6f}, train value loss {:.6f},\n" " train mean R {:.6f}, curiosity R {:.3f}, curiosity R clipped {:.3f},\n" " inv loss {:.3f}, forw loss {:.3f}, curiosity loss {:.3f}.\n" "".format( count_done, time.strftime("%Hh %Mm %Ss", time.gmtime(passed_time)), current_counter, current_counter / passed_time, episode_length, external_reward_sum, train_policy_loss_mean, train_value_loss_mean, train_rewards_mean, curiosity_reward_sum, curiosity_reward_sum_clipped, inv_loss, forw_loss, curiosity_loss)) if ((count_done % args.save_model_again_eps == 0) and (optimizer is not None)): torch.save( model.state_dict(), models_dir + '/model_' + time.strftime('%Y.%m.%d-%H.%M.%S') + f'_{current_counter}.pth') torch.save( curiosity.state_dict(), models_dir + '/curiosity_' + time.strftime('%Y.%m.%d-%H.%M.%S') + f'_{current_counter}.pth') torch.save( optimizer.state_dict(), models_dir + '/optimizer_' + time.strftime('%Y.%m.%d-%H.%M.%S') + f'_{current_counter}.pth') logging.info("Saved the model") tb.log_value('steps_second', current_counter / passed_time, current_counter) tb.log_value('reward', external_reward_sum, current_counter) tb.log_value('reward_icm', curiosity_reward_sum, current_counter) tb.log_value('reward_icm_clipped', curiosity_reward_sum_clipped, current_counter) tb.log_value('loss_inv', inv_loss, current_counter) tb.log_value('loss_forw', forw_loss, current_counter) tb.log_value('loss_curiosity', curiosity_loss, current_counter) tb.log_value('loss_train_policy_mean', train_policy_loss_mean, current_counter) tb.log_value('loss_train_value_mean', train_value_loss_mean, current_counter) tb.log_value('reward_train_mean', train_rewards_mean, current_counter) if args.game == 'atari': env.close() # Close the window after the rendering session env_to_wrap.close() logging.info("Episode done, close all") episode_length = 0 external_reward_sum = 0 curiosity_reward_sum = 0 # ICM curiosity_reward_sum_clipped = 0 # ICM inv_loss = torch.tensor(0.0) # ICM forw_loss = torch.tensor(0.0) # ICM curiosity_loss = 0 # ICM actions.clear() if count_done >= args.max_episodes: for pid in pids: os.kill(pid, signal.SIGTERM) env.close() os.kill(os.getpid(), signal.SIGKILL) count_done += 1 time.sleep(args.time_sleep)
# actions.append(action) # state = next_state # frame_idx += 1 # next_state = torch.FloatTensor(next_state).to(device) # _, next_value = model(next_state) # use last value final_value = 40 if won else -40 returns = compute_gae(final_value, rewards, masks, values) returns = torch.stack(returns).detach() log_probs = torch.stack(log_probs).detach() values = torch.stack(values).detach() states = torch.stack(states) actions = torch.stack(actions) advantage = returns  # - values (value baseline subtraction disabled) print("log probs {} values {} returns {} advantage {}".format(log_probs.size(), values.size(), returns.size(), advantage.size())) ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage) torch.save(model.state_dict(), "run/weights") if game_idx % 1 == 0: print("Completed game {}/{}, total reward = {}".format(game_idx + 1, N_GAMES, total_reward)) # test_reward = np.mean([test_env() for _ in range(10)]) # test_rewards.append(test_reward) # plot(frame_idx, test_rewards) # if test_reward > threshold_reward: early_stop = True
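compute_gae is used above to turn the rollout's rewards, masks, and value estimates into bootstrapped return targets, but it is not defined in this section. A minimal sketch of standard Generalized Advantage Estimation; the gamma and tau defaults are assumptions and may differ from the actual implementation.

def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    # Walk the rollout backwards, accumulating discounted TD residuals (GAE),
    # and return the advantage-augmented value targets.
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns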
def run_acer(variant): # BLAS setup os.environ['OMP_NUM_THREADS'] = '1' os.environ['MKL_NUM_THREADS'] = '1' # Setup # args = parser.parse_args() # Creating directories. save_dir = os.path.join('results', 'results') if not os.path.exists(save_dir): os.makedirs(save_dir) print(' ' * 26 + 'Options') """ # Saving parameters with open(os.path.join(save_dir, 'params.txt'), 'w') as f: for k, v in vars(args).items(): print(' ' * 26 + k + ': ' + str(v)) f.write(k + ' : ' + str(v) + '\n') """ # args.env = 'CartPole-v1' # TODO: Remove hardcoded environment when code is more adaptable # mp.set_start_method(platform.python_version()[0] == '3' and 'spawn' or 'fork') # Force true spawning (not forking) if available torch.manual_seed(variant['seed']) T = Counter() # Global shared counter # gym.logger.set_level(gym.logger.ERROR) # Disable Gym warnings # Create shared network env = gym.make(variant['env']) shared_model = ActorCritic(env.observation_space, env.action_space, variant['hidden_size']) shared_model.share_memory() """ if args.model and os.path.isfile(args.model): # Load pretrained weights shared_model.load_state_dict(torch.load(args.model)) """ # Create average network shared_average_model = ActorCritic(env.observation_space, env.action_space, variant['hidden_size']) shared_average_model.load_state_dict(shared_model.state_dict()) shared_average_model.share_memory() for param in shared_average_model.parameters(): param.requires_grad = False # Create optimiser for shared network parameters with shared statistics optimiser = SharedRMSprop(shared_model.parameters(), lr=variant['lr'], alpha=0.99) optimiser.share_memory() env.close() fields = ['t', 'rewards', 'avg_steps', 'time'] with open(os.path.join(save_dir, 'test_results.csv'), 'w') as f: writer = csv.writer(f) writer.writerow(fields) # Start validation agent processes = [] p = mp.Process(target=test, args=(0, variant, T, shared_model)) p.start() processes.append(p) if not variant['evaluate']: # Start training agents for rank in range(1, variant['num-processes'] + 1): p = mp.Process(target=train, args=(rank, variant, T, shared_model, shared_average_model, optimiser)) p.start() print('Process ' + str(rank) + ' started') processes.append(p) # Clean up for p in processes: p.join()
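run_acer shares a global step counter T across the validation and training processes via Counter, which is not shown here. A minimal sketch of such a process-safe counter, assuming the torch.multiprocessing primitives already used in this code:

import torch.multiprocessing as mp

class Counter():
    # Process-safe global step counter shared by all actor processes.
    def __init__(self):
        self.val = mp.Value('i', 0)
        self.lock = mp.Lock()

    def increment(self):
        with self.lock:
            self.val.value += 1

    def value(self):
        with self.lock:
            return self.val.value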
def test(rank, args, T, shared_model): torch.manual_seed(args.seed + rank) env = gym.make(args.env) env.seed(args.seed + rank) model = ActorCritic(env.observation_space, env.action_space, args.hidden_size) model.eval() can_test = True # Test flag t_start = 1 # Test step counter to check against global counter rewards, steps = [], [] # Rewards and steps for plotting l = str(len(str(args.T_max))) # Max num. of digits for logging steps done = True # Start new episode while T.value() <= args.T_max: if can_test: t_start = T.value() # Reset counter # Evaluate over several episodes and average results avg_rewards, avg_episode_lengths = [], [] for _ in range(args.evaluation_episodes): while True: # Reset or pass on hidden state if done: # Sync with shared model every episode model.load_state_dict(shared_model.state_dict()) hx = Variable(torch.zeros(1, args.hidden_size), volatile=True) cx = Variable(torch.zeros(1, args.hidden_size), volatile=True) # Reset environment and done flag state = state_to_tensor(env.reset()) done, episode_length = False, 0 reward_sum = 0 # Optionally render validation states if args.render: env.render() # Calculate policy policy, _, _, (hx, cx) = model(Variable(state, volatile=True), (hx.detach(), cx.detach())) # Break graph for memory efficiency # Choose action greedily action = policy.max(1)[1].data[0, 0] # Step state, reward, done, _ = env.step(action) state = state_to_tensor(state) reward_sum += reward done = done or episode_length >= args.max_episode_length # Stop episodes at a max length episode_length += 1 # Increase episode counter # Log and reset statistics at the end of every episode if done: avg_rewards.append(reward_sum) avg_episode_lengths.append(episode_length) break print(('[{}] Step: {:<' + l + '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format( datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3], t_start, sum(avg_rewards) / args.evaluation_episodes, sum(avg_episode_lengths) / args.evaluation_episodes)) if args.evaluate: return rewards.append(avg_rewards) # Keep all evaluations steps.append(t_start) plot_line(steps, rewards) # Plot rewards torch.save(model.state_dict(), 'model.pth') # Save model params can_test = False # Finish testing else: if T.value() - t_start >= args.evaluation_interval: can_test = True time.sleep(0.001) # Check if available to test every millisecond env.close()
def test(rank, args, shared_model): torch.manual_seed(args.seed + rank) env = env_wrapper.create_doom(args.record, outdir=args.outdir) model = ActorCritic(env.observation_space.shape[0], env.action_space) model.eval() state = env.reset() state = torch.from_numpy(state) reward_sum = 0 done = True start_time = time.time() # a quick hack to prevent the agent from stucking actions = deque(maxlen=2100) episode_length = 0 result = [] while True: episode_length += 1 # Sync with the shared model if done: model.load_state_dict(shared_model.state_dict()) cx = Variable(torch.zeros(1, 256), volatile=True) hx = Variable(torch.zeros(1, 256), volatile=True) else: cx = Variable(cx.data, volatile=True) hx = Variable(hx.data, volatile=True) value, logit, (hx, cx) = model( (Variable(state.unsqueeze(0), volatile=True), (hx, cx)), icm=False) prob = F.softmax(logit) action = prob.max(1)[1].data.numpy() state, reward, done, _ = env.step(action[0, 0]) state = torch.from_numpy(state) done = done or episode_length >= args.max_episode_length reward_sum += reward # a quick hack to prevent the agent from stucking actions.append(action[0, 0]) if actions.count(actions[0]) == actions.maxlen: done = True if done: end_time = time.time() print("Time {}, episode reward {}, episode length {}".format( time.strftime("%Hh %Mm %Ss", time.gmtime(end_time - start_time)), reward_sum, episode_length)) result.append((reward_sum, end_time - start_time)) f = open('output/result.pickle', 'w') pickle.dump(result, f) f.close() torch.save(model.state_dict(), 'output/{}.pth'.format( (end_time - start_time))) reward_sum = 0 episode_length = 0 actions.clear() state = env.reset() state = torch.from_numpy(state) time.sleep(60)
returns = torch.cat(returns).detach() log_probs = torch.cat(log_probs).detach() values = torch.cat(values).detach() states = torch.cat(states) actions = torch.cat(actions) advantage = returns - values advantage = normalize(advantage) ppo_update(frame_idx, states, actions, log_probs, returns, advantage) train_epoch += 1 if train_epoch % TEST_EPOCHS == 0: test_reward = np.mean([ test_env(env, model, device, num_outputs) for _ in range(NUM_TESTS) ]) writer.add_scalar("test_rewards", test_reward, frame_idx) print('Frame %s. reward: %s' % (frame_idx, test_reward)) # Save a checkpoint every time we achieve a best reward if best_reward is None or best_reward < test_reward: if best_reward is not None: print("Best reward updated: %.3f -> %.3f" % (best_reward, test_reward)) name = "%s_best_%+.3f_%d.weights" % ( "connectx", test_reward, frame_idx) fname = os.path.join('.', 'checkpoints', name) torch.save(model.state_dict(), fname) best_reward = test_reward if test_reward > TARGET_REWARD: early_stop = True
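The advantage tensor above is passed through normalize before the PPO update. A minimal sketch of the usual zero-mean, unit-variance advantage normalization; the epsilon guard against division by zero is an assumption.

def normalize(x, eps=1e-8):
    # Standardize advantages for a more stable PPO update.
    return (x - x.mean()) / (x.std() + eps)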
args.env = 'CartPole-v1' # TODO: Remove hardcoded environment when code is more adaptable torch.manual_seed(args.seed) T = Counter() # Global shared counter # Create shared network env = gym.make(args.env) shared_model = ActorCritic(env.observation_space, env.action_space, args.hidden_size) shared_model.share_memory() if args.model and os.path.isfile(args.model): # Load pretrained weights shared_model.load_state_dict(torch.load(args.model)) # Create average network shared_average_model = ActorCritic(env.observation_space, env.action_space, args.hidden_size) shared_average_model.load_state_dict(shared_model.state_dict()) shared_average_model.share_memory() for param in shared_average_model.parameters(): param.requires_grad = False # Create optimiser for shared network parameters with shared statistics optimiser = SharedRMSprop(shared_model.parameters(), lr=args.lr, alpha=args.rmsprop_decay) optimiser.share_memory() env.close() # Start validation agent processes = [] p = mp.Process(target=test, args=(0, args, T, shared_model)) p.start() processes.append(p)
def test(args, shared_model): action_map = _set_action_map() env = FixedEnvWrap() # time.sleep(10) model = ActorCritic() model.load_state_dict(shared_model.state_dict()) model.eval() state = env.reset() training_time = 0 vis = visdom.Visdom(env='final') line_plot = vis.line(Y=np.array([0]), opts=dict(xlabel='testing count', ylabel='average reward', title='ali-v1')) start = time.time() vis_count = 0 while True: video_count = 1 reward_all_sum = 0 reward_all = 0 reward_all_ave = 0 reward_gop = 0 action = 3 last_action = 3 # update model before testing all trace files # time.sleep(5) print('load updated model') model.load_state_dict(shared_model.state_dict()) while True: # get the reward for one gop while True: _, done, decision_flag = env.step_gop(action) if decision_flag or done: reward_gop = env.get_reward_gop() state = env.get_state_gop() break else: continue # print('testing') # get action from model last_action = action with torch.no_grad(): state = torch.FloatTensor(state) logit, _ = model( state.view(-1, args.s_gop_info, args.s_gop_len)) prob = F.softmax(logit, dim=1) _, action = torch.max(prob, 1) action = action.data.numpy()[0] bitrate, target_buffer = action_map[last_action] # print('bitrate: %d, target_buffer: %d, reward is %s' % (bitrate, target_buffer, reward_gop)) if done: print("video count %d, reward is %.5f" % (video_count, reward_all)) # reward_all_sum += reward_all / 100 reward_all_sum += reward_all video_count += 1 if reward_all < 0: print('bad model ! just break this loop') reward_all_ave = 0 break if video_count > env.traces_len * 2: reward_all_ave = reward_all_sum / video_count break action = 3 last_action = 3 reward_all = 0 reward_all += reward_gop # update the figure of average reward of all testing files vis_count += 1 reward_all_ave = max(reward_all_ave, 0) vis.line(Y=np.array([reward_all_ave]), X=np.array([vis_count]), win=line_plot, update='append') path = 'ali-v1/actor.pt-' + str(vis_count) torch.save(model.state_dict(), path) end = time.time() hours, rem = divmod(end - start, 3600) minutes, seconds = divmod(rem, 60) print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)) print("average reward of traces are: ", reward_all_ave) print('saved one model in epoch:', vis_count)
def test(rank, args, T, shared_model): torch.manual_seed(args.seed + rank) env = JacoEnv(args.width, args.height, args.frame_skip, args.rewarding_distance, args.control_magnitude, args.reward_continuous) env.seed(args.seed + rank) if args.render: (_, _, obs_rgb_view2) = env.reset() plt.ion() f, ax = plt.subplots() im = ax.imshow(obs_rgb_view2) model = ActorCritic(None, args.non_rgb_state_size, None, args.hidden_size) model.eval() can_test = True # Test flag t_start = 1 # Test step counter to check against global counter rewards, steps = [], [] # Rewards and steps for plotting n_digits = str( len(str(args.T_max))) # Max num. of digits for logging steps done = True # Start new episode while T.value() <= args.T_max: if can_test: t_start = T.value() # Reset counter # Evaluate over several episodes and average results avg_rewards, avg_episode_lengths = [], [] for _ in range(args.evaluation_episodes): while True: # Reset or pass on hidden state if done: # Sync with shared model every episode model.load_state_dict(shared_model.state_dict()) hx = Variable( torch.zeros(1, args.hidden_size), volatile=True) cx = Variable( torch.zeros(1, args.hidden_size), volatile=True) # Reset environment and done flag state = state_to_tensor(env.reset()) action, reward, done, episode_length = (0, 0, 0, 0, 0, 0), 0, False, 0 reward_sum = 0 # Calculate policy policy, _, (hx, cx) = model( Variable( state[0], volatile=True), Variable( state[1], volatile=True), (hx.detach(), cx.detach())) # Break graph for memory efficiency # Choose action greedily action = [p.max(1)[1].data[0, 0] for p in policy] # Step state, reward, done = env.step(action) obs_rgb_view1 = state[1] obs_rgb_view2 = state[2] state = state_to_tensor(state) reward_sum += reward done = done or episode_length >= args.max_episode_length # Stop episodes at a max length episode_length += 1 # Increase episode counter # Optionally render validation states if args.render: # rendering the first camera view im.set_data(obs_rgb_view1) plt.draw() plt.pause(0.05) # rendering mujoco simulation # viewer = mujoco_py.MjViewer(env.sim) # viewer.render() # Log and reset statistics at the end of every episode if done: avg_rewards.append(reward_sum) avg_episode_lengths.append(episode_length) break print(('[{}] Step: {:<' + n_digits + '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format( datetime.utcnow().strftime( '%Y-%m-%d %H:%M:%S,%f')[:-3], t_start, sum(avg_rewards) / args.evaluation_episodes, sum(avg_episode_lengths) / args.evaluation_episodes)) rewards.append(avg_rewards) # Keep all evaluations steps.append(t_start) plot_line(steps, rewards) # Plot rewards torch.save(model.state_dict(), os.path.join('results', str(t_start) + '_model.pth')) # Checkpoint model params can_test = False # Finish testing if args.evaluate: return else: if T.value() - t_start >= args.evaluation_interval: can_test = True time.sleep(0.001) # Check if available to test every millisecond