# Imports consolidated for this excerpt; the repo-local module paths
# (environment, model, player_util, utils) are assumptions.
import os
import time
import logging

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from tensorboardX import SummaryWriter
from setproctitle import setproctitle as ptitle

from environment import create_env
from model import build_model
from player_util import Agent
from utils import setup_logger


# Evaluation loop (excerpt): `player`, `env`, `args`, and `gpu_id` are set up
# earlier in the evaluation script.
all_reward, all_eps_hori_me, all_eps_verti_me, all_eps_hori_st, \
    all_eps_verti_st, all_length, all_success_rate = 0, 0, 0, 0, 0, 0, 0
all_success_rate_single = np.zeros(player.num_agents)
all_success_rate_single_mean = np.zeros(player.num_agents)

for i_episode in range(args.num_episodes):
    print('episode', i_episode)
    # reverse the target's trajectory for the second half of the evaluation runs
    if i_episode >= args.num_episodes // 2:
        player.env.env.env.reverse = True
    else:
        player.env.env.env.reverse = False

    player.state = player.env.reset()
    if 'Unreal' in args.env:
        player.cam_pos = player.env.env.env.env.cam_pose
        player.set_cam_info()
    player.state = torch.from_numpy(player.state).float()
    player.last_gate_ids = [1 for i in range(player.num_agents)]
    player.input_actions = torch.Tensor(np.zeros((player.num_agents, 11)))
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()

    # per-episode accumulators
    player.eps_len = 0
    reward_sum = np.zeros(len(env.observation_space))
    success_rate_sum = 0
    success_rate_singles = np.zeros(player.num_agents)
    reward_mean = 0
    eps_step = 0
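    # A sketch of how the aggregate statistics above are typically reduced once
    # all episodes finish (assumption: `hori_errors` and `verti_errors` are
    # hypothetical per-episode error lists; only the accumulator names come
    # from this script):
    #
    #   all_eps_hori_me, all_eps_hori_st = np.mean(hori_errors), np.std(hori_errors)
    #   all_eps_verti_me, all_eps_verti_st = np.mean(verti_errors), np.std(verti_errors)
    #   print('mean reward {0}, mean length {1}, success rate {2}'.format(
    #       all_reward / args.num_episodes, all_length / args.num_episodes,
    #       all_success_rate / args.num_episodes))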
def train(rank, args, shared_model, optimizer, train_modes, n_iters, device, env=None):
    n_steps = 0
    n_iter = 0
    writer = SummaryWriter(os.path.join(args.log_dir, 'Agent:{}'.format(rank)))
    ptitle('Training Agent: {}'.format(rank))
    torch.manual_seed(args.seed + rank)
    training_mode = args.train_mode
    env_name = args.env

    train_modes.append(training_mode)
    n_iters.append(n_iter)

    if env is None:
        env = create_env(env_name, args)

    params = shared_model.parameters()
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(params, lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(
                filter(lambda p: p.requires_grad, shared_model.parameters()),
                lr=args.lr)

    env.seed(args.seed + rank)
    player = Agent(None, env, args, None, None, device)
    player.model = build_model(
        player.env.observation_space, player.env.action_space, args, device).to(device)
    player.state = player.env.reset()
    if 'Unreal' in args.env:
        player.cam_pos = player.env.env.env.env.cam_pose
        player.collect_state = player.env.env.env.env.current_states
        player.set_cam_info()
    player.state = torch.from_numpy(player.state).float()
    player.state = player.state.to(device)
    player.model = player.model.to(device)
    player.model.train()

    reward_sum = torch.zeros(player.num_agents).to(device)
    count_eps = 0
    cross_entropy_loss = nn.CrossEntropyLoss()

    while True:
        # sync the local model with the shared one before collecting a rollout
        player.model.load_state_dict(shared_model.state_dict())
        player.update_lstm()

        fps_counter = 0
        t0 = time.time()
        for step in range(args.num_steps):
            player.action_train()
            n_steps += 1
            fps_counter += 1
            reward_sum += player.reward
            if player.done:
                break
        update_steps = len(player.rewards)
        fps = fps_counter / (time.time() - t0)

        if player.done:
            for i in range(player.num_agents):
                writer.add_scalar('train/reward_' + str(i), reward_sum[i], n_steps)
            count_eps += 1
            reward_sum = torch.zeros(player.num_agents).to(device)
            player.eps_len = 0
            player.state = player.env.reset()
            player.set_cam_info()
            player.state = torch.from_numpy(player.state).float().to(device)

        # bootstrap the return with the value estimate if the episode did not end
        R = torch.zeros(player.num_agents, 1).to(device)
        if not player.done:
            state = player.state
            value_multi, _, _, _, _, _, _, _, _ = player.model(
                (Variable(state, requires_grad=True),
                 Variable(player.cam_info, requires_grad=True),
                 player.H_multi, player.last_gate_ids, player.gt_gate))
            for i in range(player.num_agents):
                R[i][0] = value_multi[i].data

        # supervised gate loss: predicted visibility logits vs. ground truth
        gates, gt_gates = [], []
        for k1 in range(len(player.rewards)):
            for k2 in range(player.num_agents):
                gates.append(player.gates[k1][k2])
                gt_gates.append(player.gate_gts[k1][k2])
        gate_probs = torch.cat(gates).view(-1, 2).to(device)
        gate_gt_ids = torch.Tensor(gt_gates).view(1, -1).squeeze().long().to(device)
        gate_loss = cross_entropy_loss(gate_probs, gate_gt_ids)

        player.values.append(Variable(R).to(device))
        policy_loss = torch.zeros(player.num_agents, 1).to(device)
        value_loss = torch.zeros(player.num_agents, 1).to(device)
        entropies = torch.zeros(player.num_agents, 1).to(device)
        w_entropies = torch.Tensor(
            [[float(args.entropy)] for i in range(player.num_agents)]).to(device)
        R = Variable(R, requires_grad=True).to(device)
        gae = torch.zeros(1, 1).to(device)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation:
            # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t); gae = gamma * tau * gae + delta_t
            delta_t = player.rewards[i] \
                + args.gamma * player.values[i + 1].data \
                - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss \
                - (player.log_probs[i] * Variable(gae)) \
                - (w_entropies * player.entropies[i])
            entropies += player.entropies[i]

        loss = (policy_loss.sum() / update_steps / player.num_agents
                + 0.5 * value_loss.sum() / update_steps / player.num_agents
                + 5 * gate_loss)
        player.model.zero_grad()
        loss.backward()
        # clip gradients on the local model; they are copied to the shared model below
        torch.nn.utils.clip_grad_norm_(player.model.parameters(), 50)
        ensure_shared_grads(player.model, shared_model, gpu=args.gpu_ids[-1] >= 0)

        writer.add_scalar('train/policy_loss_sum', policy_loss.sum(), n_steps)
        writer.add_scalar('train/value_loss_sum', value_loss.sum(), n_steps)
        writer.add_scalar('train/entropies_sum', entropies.sum(), n_steps)
        writer.add_scalar('train/fps', fps, n_steps)
        writer.add_scalar('train/gate_loss', gate_loss, n_steps)
        n_iter += 1
        n_iters[rank] = n_iter

        optimizer.step()
        player.clear_actions()

        if train_modes[rank] == -100:
            env.close()
            break
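# `ensure_shared_grads` (called in train() above) copies each worker's local
# gradients into the shared model before the shared optimizer steps. A minimal
# sketch of the standard A3C helper follows; the repo's own version (assumed to
# live in its utils module) may differ.
def ensure_shared_grads(model, shared_model, gpu=False):
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            # on CPU, another worker already populated the shared grads
            return
        elif not gpu:
            shared_param._grad = param.grad
        else:
            # grads computed on a GPU worker must be copied to the shared CPU model
            shared_param._grad = param.grad.cpu()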
def test(rank, args, shared_model, train_modes, n_iters, device):
    writer = SummaryWriter(os.path.join(args.log_dir, 'Test Agent:{}'.format(rank)))
    ptitle('Test Agent: {}'.format(rank))
    torch.manual_seed(args.seed + rank)
    n_iter = 0

    log = {}
    setup_logger('{}_log'.format(args.env), r'{0}/logger'.format(args.log_dir))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    env = create_env(args.env, args)
    start_time = time.time()
    num_tests = 1
    n_step = 0

    player = Agent(None, env, args, None, None, device)
    player.model = build_model(
        player.env.observation_space, player.env.action_space, args, device).to(device)
    player.state = player.env.reset()
    if 'Unreal' in args.env:
        player.cam_pos = player.env.env.env.env.cam_pose
        player.collect_state = player.env.env.env.env.current_states
        player.set_cam_info()
    player.state = torch.from_numpy(player.state).float().to(device)
    player.model.eval()

    max_score = -100
    reward_sum = np.zeros(player.num_agents)
    reward_total_sum = np.zeros(player.num_agents)
    reward_sum_ep = np.zeros(player.num_agents)
    success_rate_sum_ep = np.zeros(player.num_agents)
    fps_counter = 0
    t0 = time.time()
    cross_entropy_loss = nn.CrossEntropyLoss()
    len_sum = 0
    seed = args.seed
    count_eps = 0
    eps_length = 0
    rate = 0
    rates = [0, 0]
    step_rates = [0, 0]
    mean_rates = [0, 0]
    visible_steps = 0

    while True:
        if player.done:
            count_eps += 1
            t0 = time.time()
            eps_length = 0
            # pull the latest shared weights at the start of each test episode
            player.model.load_state_dict(shared_model.state_dict())

        player.action_test()
        eps_length += 1
        n_step += 1
        fps_counter += 1
        reward_sum_ep += player.reward
        success_rate_sum_ep += player.success_rate

        # gate-accuracy bookkeeping over the steps collected since the last
        # clear_actions(); the flattened indexing below assumes 4 agents
        gate_ids, gate_probs, gt_gates = [], [], []
        for k1 in range(len(player.rewards)):
            for k2 in range(player.num_agents):
                _, max_id = torch.max(player.gates[k1][k2], 0)
                gate_probs.append(player.gates[k1][k2])
                gate_ids.append(max_id)
                gt_gates.append(player.gate_gts[k1][k2])
        gate_probs = torch.cat(gate_probs).view(-1, 2).to(device)
        gate_gt_ids = torch.Tensor(gt_gates).view(1, -1).squeeze().long().to(device)
        gate_loss = cross_entropy_loss(gate_probs, gate_gt_ids)
        visible_steps += sum(np.array(gt_gates).squeeze()) / 4
        gate_ids = np.array([gate_ids[i].cpu().detach().numpy() for i in range(4)])
        gt_gates = np.array([gt_gates[i].cpu().detach().numpy() for i in range(4)])
        one_step_rate = sum(gate_ids == gt_gates) / player.num_agents
        rate += one_step_rate
        # per-class accuracy: id 0 = target invisible, id 1 = target visible
        for id in range(2):
            right_num = sum(gate_ids[i] == gt_gates[i] == id for i in range(4))
            num = sum(gt_gates[i] == id for i in range(4))
            step_rate = right_num / num if num != 0 else 0
            if step_rate > 0:
                rates[id] += step_rate
                step_rates[id] += 1
                mean_rates[id] = rates[id] / step_rates[id]
        mean_rate = rate / n_step

        if player.done:
            player.state = player.env.reset()
            player.state = torch.from_numpy(player.state).float().to(device)
            player.set_cam_info()
            reward_sum += reward_sum_ep
            len_sum += player.eps_len
            fps = fps_counter / (time.time() - t0)
            n_iter = 0
            for n in n_iters:
                n_iter += n
            for i in range(player.num_agents):
                writer.add_scalar('test/reward' + str(i), reward_sum_ep[i], n_iter)
            writer.add_scalar('test/fps', fps, n_iter)
            writer.add_scalar('test/eps_len', player.eps_len, n_iter)
            writer.add_scalar('test/unvisible_acc', mean_rates[0], n_iter)
            writer.add_scalar('test/visible_acc', mean_rates[1], n_iter)
            writer.add_scalar('test/mean_acc', mean_rate, n_iter)
            writer.add_scalar('test/gate_loss', gate_loss, n_iter)
            player.eps_len = 0
            fps_counter = 0
            reward_sum_ep = np.zeros(player.num_agents)
            t0 = time.time()
            count_eps += 1
            if count_eps % args.test_eps == 0:
                player.max_length = True
            else:
                player.max_length = False

        if player.done and not player.max_length:
            seed += 1
            player.env.seed(seed)
            player.state = player.env.reset()
            player.set_cam_info()
            player.state = torch.from_numpy(player.state).float().to(device)
            player.eps_len += 2
        elif player.done and player.max_length:
            ave_reward_sum = reward_sum / args.test_eps
            reward_total_sum += ave_reward_sum
            reward_mean = reward_total_sum / num_tests
            len_mean = len_sum / args.test_eps
            reward_step = reward_sum / len_sum
            log['{}_log'.format(args.env)].info(
                "Time {0}, ave eps reward {1}, ave eps length {2}, "
                "reward mean {3}, reward step {4}".format(
                    time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                    ave_reward_sum, len_mean, reward_mean, reward_step))

            # checkpoint: save a "best" snapshot when the average reward improves,
            # otherwise overwrite the rolling checkpoint
            if ave_reward_sum.mean() >= max_score:
                print('save best! in %d iters' % n_step)
                max_score = ave_reward_sum.mean()
                model_dir = os.path.join(
                    args.log_dir, '{0}-gate-all-model-best-{1}.dat'.format(args.env, n_step))
            else:
                model_dir = os.path.join(args.log_dir, '{0}-new.dat'.format(args.env))
            if args.gpu_ids[-1] >= 0:
                with torch.cuda.device(args.gpu_ids[-1]):
                    state_to_save = player.model.state_dict()
                    torch.save(state_to_save, model_dir)
            else:
                state_to_save = player.model.state_dict()
                torch.save(state_to_save, model_dir)

            num_tests += 1
            reward_sum = 0
            len_sum = 0
            seed += 1
            player.env.seed(seed)
            player.state = player.env.reset()
            if 'Unreal' in args.env:
                player.cam_pos = player.env.env.env.env.cam_pose
                player.collect_state = player.env.env.env.env.current_states
            player.set_cam_info()
            player.state = torch.from_numpy(player.state).float().to(device)
            player.input_actions = torch.Tensor(np.zeros((player.num_agents, 9)))
            time.sleep(args.sleep_time)
            if n_iter > args.max_step:
                # ask every training worker to shut down
                env.close()
                for id in range(0, args.workers):
                    train_modes[id] = -100
                break
        player.clear_actions()
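# A minimal sketch of how these workers are typically launched (assumption:
# the repo's real entry point builds `args` via argparse and may create a
# shared optimizer; only the train()/test() signatures above are taken from
# this file).
import torch.multiprocessing as mp


def launch(args, device=torch.device('cpu')):
    env = create_env(args.env, args)
    shared_model = build_model(env.observation_space, env.action_space, args, device)
    shared_model.share_memory()  # workers pull weights from / push grads into this copy
    env.close()

    manager = mp.Manager()
    train_modes = manager.list()  # test() sets train_modes[rank] = -100 to stop worker `rank`
    n_iters = manager.list()      # per-worker iteration counters summed by test()

    processes = []
    # one evaluation/checkpointing process
    p = mp.Process(target=test,
                   args=(args.workers, args, shared_model, train_modes, n_iters, device))
    p.start()
    processes.append(p)
    # args.workers asynchronous training processes; passing optimizer=None lets
    # each worker build its own local optimizer as in train() above
    for rank in range(args.workers):
        p = mp.Process(target=train,
                       args=(rank, args, shared_model, None, train_modes, n_iters, device))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()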