def __init__(self, state_dim, action_dim, max_action, args):
    """Set up the agent: cache problem dimensions, build hyperparameters and
    networks from ``args``, and allocate the experience replay buffer."""
    # Problem dimensions and the action magnitude bound.
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.max_action = max_action
    # Hyperparameters must be initialized first: they define buffer_size,
    # learning rates, etc. that the networks and buffer below rely on.
    self._init_parameters(args)
    self._init_nets(args)
    # Replay storage sized from the freshly-initialized hyperparameters.
    self.replay_buffer = ReplayBuffer(
        self.buffer_size, self.state_dim, self.action_dim)
def run(config):
    """Train a controller on one environment, checkpointing periodically and
    plotting the mean episode reward curve at the end.

    Args:
        config: hyperparameter namespace (env_id, seed, n_episodes,
            n_train_steps, batch_size, buffer_size, episode_limit,
            save_interval, n_rollout_threads).
    """
    # Pick the next free 'runN' directory under ./results/<env_id>.
    model_dir = Path('./results') / config.env_id
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1])
                         for folder in model_dir.iterdir()
                         if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    fig_dir = run_dir / 'figures'
    os.makedirs(str(fig_dir))
    # Seed both torch and numpy for reproducibility.
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    assert config.n_rollout_threads == 1, "For simple test, we assume the number of the environment is 1"
    env = make_parallel_env(config.env_id, config.n_rollout_threads, config.seed)
    controller = Controller.init_from_env(env=env, config=config)
    obs_shape, n_actions = controller.obs_shape, controller.n_actions
    buffer = ReplayBuffer(controller.n_agents, obs_shape, n_actions,
                          config.episode_limit, config.buffer_size)
    rolloutworker = RolloutWorker(env, controller, config)
    train_step = 0
    mean_episode_rewards = []
    for ep_i in range(config.n_episodes):
        # Collect one full episode, then run several gradient updates on
        # mini-batches sampled from the replay buffer.
        episode, ep_rew, mean_ep_rew = rolloutworker.generate_episode()
        buffer.push(episode)
        for step in range(config.n_train_steps):
            mini_batch = buffer.sample(min(len(buffer), config.batch_size))
            controller.update(mini_batch, train_step)
            train_step += 1
        # ep_rew = buffer.get_average_rewards(config.episode_limit * config.n_rollout_threads)
        mean_episode_rewards.append(mean_ep_rew)
        print("Episode {} : Total reward {} , Mean reward {}" .format(ep_i + 1, ep_rew, mean_ep_rew))
        # Periodic checkpoint: incremental snapshot plus the rolling 'model.pt'.
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(str(run_dir / 'incremental'), exist_ok=True)
            controller.save(str(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))))
            controller.save(str(run_dir / 'model.pt'))
    # Final save after training finishes.
    controller.save(str(run_dir / 'model.pt'))
    env.close()
    # Learning-curve plot: mean episode reward vs episode index.
    index = list(range(1, len(mean_episode_rewards) + 1))
    plt.plot(index, mean_episode_rewards)
    plt.ylabel("Mean Episode Reward")
    plt.savefig(str(fig_dir) + '/mean_episode_reward.jpg')
    # plt.show()
    plt.close()
def __init__(self, state_size, action_size, action_space, args):
    """DQN-style agent: replay buffer, epsilon schedule, and an online
    Q-network paired with a deep-copied target network."""
    self.device = torch.device("cuda" if args.cuda else "cpu")
    # Core hyperparameters.
    self.action_size = action_size
    self.gamma = args.gamma
    self.tau = args.tau
    # Experience replay and epsilon-greedy exploration schedule.
    self.buffer = ReplayBuffer(args.buffer_size, args.batch_size, self.device)
    self.eps = EpsilonController(e_decays=args.eps_decays, e_min=args.eps_min)
    # Online Q-network, its optimizer, and a target network that starts
    # as an exact copy.
    self.q_local = QNetwork(state_size, action_size, args.hidden_size).to(self.device)
    self.q_optimizer = optim.Adam(self.q_local.parameters(), lr=args.lr)
    self.q_target = copy.deepcopy(self.q_local)
def __init__(self):
    """DDPG agent: frozen target copies of the actor/critic, OU exploration
    noise for continuous actions, and a replay buffer."""
    super(DDPG, self).__init__(
        actor=Actor(),
        critic=Critic(),
    )
    # Target networks begin as exact copies and never run in training mode.
    self.target_actor = deepcopy(self.actor)
    self.target_critic = deepcopy(self.critic)
    for target_net in (self.target_actor, self.target_critic):
        disable_train(target_net)
    # Temporally-correlated exploration noise.
    self.noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(A_DIM))
    self.buffer = ReplayBuffer(BUFFER_SIZE)
    # Step counter.
    self.time = 0
def push_to_replay_buffer(self, buffer: ReplayBuffer):
    """Stitch next-observations onto every stored frame, then push each
    built frame into ``buffer``.

    An agent's next_obs comes from the following frame; if there is no
    following frame, or the agent is absent from it (which requires the
    agent to be done), next_obs is a zero vector of matching length.
    """
    total = len(self.frames)
    for idx, frame in enumerate(self.frames):
        # Successor frame; None for the final frame of the rollout.
        following = self.frames[idx + 1] if idx + 1 < total else None
        for key, agent_frame in frame.items():
            if following is not None and key in following:
                agent_frame.next_obs = following[key].obs
            else:
                # Terminal transition for this agent: agents missing from
                # the successor frame must already be flagged done.
                if following is not None:
                    assert agent_frame.done
                agent_frame.next_obs = [0] * len(agent_frame.obs)
        buffer.push({key: af.build() for key, af in frame.items()})
def test_buffer(self):
    """Exercise push/sample/average-reward behavior of ReplayBuffer using
    one fixed multi-agent frame pushed repeatedly."""
    data = {
        AgentKey(0, '0-1'): AgentReplayFrame([2, 1, 2, 2, 3], [0, 1, 0], 3, False, [3, 1, 1, 2, 3]),
        AgentKey(0, '0-2'): AgentReplayFrame([1, 1, 3, 2, 1], [0, 1, 0], 4, False, [2, 1, 1, 2, 2]),
        AgentKey(1, '0-1'): AgentReplayFrame([2, 0, 3, 1, 2], [0, 1], 5, False, [3, 0, 1, 3, 4]),
    }
    max_steps = 4
    buffer = ReplayBuffer(max_steps)
    # Length grows with each push but is capped at the buffer capacity.
    for push_count in range(5):
        buffer.push(data)
        self.assertEqual(buffer.length(), min(push_count + 1, max_steps))
    # Without normalization, sampled rewards match what was stored.
    raw_sample = buffer.sample(2, norm_rews=False)
    for sampled_frame in raw_sample:
        for key, agent_frame in sampled_frame.items():
            self.assertEqual(agent_frame.reward, data[key].reward)
    # With normalization, identical rewards normalize to zero.
    norm_sample = buffer.sample(2, norm_rews=True)
    for sampled_frame in norm_sample:
        for key, agent_frame in sampled_frame.items():
            self.assertEqual(agent_frame.reward, 0)
    # The running average over recent frames equals the constant reward.
    avg_rewards = buffer.get_average_rewards(3)
    for key, avg in avg_rewards.items():
        self.assertEqual(avg, data[key].reward)
def __init__(self, env, config, logger=None):
    """
    Initialize the Policy Gradient agent.

    Args:
        env: an OpenAI Gym environment
        config: class with hyperparameters
        logger: logger instance from the logging module; when omitted one
            is created from config.log_path

    Exposes self.discrete, self.observation_dim, self.action_dim and
    self.lr for use by the other methods.
    """
    # Make sure the directory for training outputs exists.
    if not os.path.exists(config.output_path):
        os.makedirs(config.output_path)
    self.config = config
    if self.config.use_mask:
        print('Using mask...')
    # Fall back to a file logger when none is supplied.
    self.logger = logger if logger is not None else get_logger(config.log_path)
    self.env = env
    # Discrete vs continuous action space determines the action dimension.
    self.discrete = isinstance(env.action_space, gym.spaces.Discrete)
    self.observation_dim = get_obs_dims(self.config.env_name, self.config.use_mask)
    if self.discrete:
        self.action_dim = self.env.action_space.n
    else:
        self.action_dim = self.env.action_space.shape[0]
    self.lr = self.config.learning_rate
    # For milestone: capture the raw tuple embedding (s, a, r, s', done_mask).
    self.memory_dim = 6  # self.observation_dim * 2 + self.action_dim + 1 + 1
    self.replay_buffer = ReplayBuffer(self.config.memory_len + 1, 1, action_dim=self.action_dim)
    self.percolated_buffer = ReplayBuffer(self.config.percolate_len + 1, 1, action_dim=self.action_dim)
    # Build the computation graph / model.
    self.build()
def __init__(self, num_agents, state_size, action_size, opts):
    """Multi-agent DDPG worker: local/target actor-critic pairs, per-agent
    OU exploration noise, and a shared replay memory."""
    self.num_agents = num_agents
    self.state_size = state_size
    self.action_size = action_size
    self.opts = opts
    # Loss bookkeeping and the epsilon exploration schedule.
    self.closs = np.inf
    self.aloss = np.inf
    self.eps = 1
    self.eps_decay = 0.998
    self.min_eps = 0.01
    self.step_idx = 0
    device = opts.device
    # Actor: the local network is trained directly; the target is updated softly.
    self.actor_local = ActorNet(state_size, action_size,
                                fc1_units=opts.a_fc1, fc2_units=opts.a_fc2).to(device)
    self.actor_target = ActorNet(state_size, action_size,
                                 fc1_units=opts.a_fc1, fc2_units=opts.a_fc2).to(device)
    self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(), lr=opts.actor_lr)
    # Critic: same local/target arrangement, with weight decay on the optimizer.
    self.critic_local = CriticNet(state_size, action_size,
                                  fc1_units=opts.c_fc1, fc2_units=opts.c_fc2).to(device)
    self.critic_target = CriticNet(state_size, action_size,
                                   fc1_units=opts.c_fc1, fc2_units=opts.c_fc2).to(device)
    self.critic_optimizer = torch.optim.Adam(
        self.critic_local.parameters(), lr=opts.critic_lr,
        weight_decay=opts.critic_weight_decay)
    # One Ornstein-Uhlenbeck noise stream per agent.
    self.noise = OUNoise((num_agents, action_size), opts.random_seed)
    # Shared replay memory.
    self.memory = ReplayBuffer(action_size, opts.buffer_size, opts.batch_size,
                               opts.random_seed, opts.device)
def run(config): """ :param config: """ # model_dir = Path('./models') / config.env_id / config.model_name env = make_env(config.env_id) np.random.seed(config.seed) torch.manual_seed(config.seed) if all([hasattr(a, 'adversary') for a in env.agents]): agent_types = [ 'adversary' if a.adversary else 'agent' for a in env.agents ] else: agent_types = ['agent' for _ in env.agents] maddpg = MADDPG.init_from_env(env, agent_types, agent_alg=config.agent_alg, adversary_alg=config.adversary_alg, tau=config.tau, lr=config.lr, hidden_dim=config.hidden_dim) replay_buffer = ReplayBuffer(config.buffer_length, maddpg.num_agent) for ep_i in range(config.n_episodes): print("Episodes %i of %i" % (ep_i + 1, config.n_episodes)) observations = env.reset() for et_i in range(config.episode_length): torch_observations = [ torch.from_numpy(observations[i]).float() for i in range(maddpg.num_agent) ] torch_agent_actions = maddpg.step(torch_observations) agent_actions = [ action.data.numpy() for action in torch_agent_actions ] next_observations, rewards, dones, infos = env.step(agent_actions) replay_buffer.push_data(observations, agent_actions, rewards, next_observations, dones) observations = next_observations if replay_buffer.get_size() >= config.batch_size: for a_i in range(maddpg.num_agent): sample = replay_buffer.sample(config.batch_size) maddpg.update(sample, agent_i=a_i) maddpg.update_all_agent() print("Episode rewards ") print(replay_buffer.get_episode_rewards(config.episode_length)) env.close()
def __init__(self, state_size, action_size, action_space, args,
             policy_noise=0.2, noise_clip=0.5, policy_freq=2):
    """TD3 agent: delayed policy updates, target-policy smoothing noise
    scaled to the action range, and target copies of actor and critic."""
    self.device = torch.device("cuda" if args.cuda else "cpu")
    # Core hyperparameters.
    self.action_size = action_size
    self.gamma = args.gamma
    self.tau = args.tau
    self.start_steps = args.start_steps
    self.expl_noise = args.expl_noise
    # Update counter driving the delayed (every policy_freq-th) policy update.
    self.total_it = 0
    # Noise magnitudes are scaled w.r.t. the environment's action scale.
    self.max_action = float(action_space.high[0])
    self.policy_noise = policy_noise * self.max_action  # target policy smoothing
    self.noise_clip = noise_clip * self.max_action
    self.policy_freq = policy_freq
    # Loss functions and replay storage.
    self.ce = nn.CrossEntropyLoss()
    self.mse = nn.MSELoss()
    self.buffer = ReplayBuffer(args.buffer_size, args.batch_size, self.device)
    # Actor (policy), its optimizer, and a frozen target copy.
    self.policy = Actor(state_size, action_space.shape[0],
                        args.hidden_size, action_space).to(self.device)
    self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=args.lr)
    self.policy_target = copy.deepcopy(self.policy)
    # Critic, its optimizer, and a frozen target copy.
    self.critic_local = QNetwork(state_size, action_space.shape[0],
                                 args.hidden_size).to(self.device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=args.lr)
    self.critic_target = copy.deepcopy(self.critic_local)
class DDPG:
    """Deep Deterministic Policy Gradient agent with target networks,
    soft updates, and an experience replay buffer."""

    def __init__(self, state_dim, action_dim, max_action, args):
        # Problem dimensions and action bound.
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.max_action = max_action
        # Hyperparameters first (they define buffer_size used below), then nets.
        self._init_parameters(args)
        self._init_nets(args)
        self.replay_buffer = ReplayBuffer(self.buffer_size, self.state_dim, self.action_dim)

    def _init_parameters(self, args):
        """Cache training hyperparameters from the args namespace."""
        self.actor_lr = args.actor_lr
        self.critic_lr = args.critic_lr
        self.discount = args.discount
        self.tau = args.tau
        self.buffer_size = args.buffer_size
        self.batch_size = args.batch_size

    def _init_nets(self, args):
        """Build actor/critic networks, their target copies, and optimizers;
        targets are hard-synced to the online networks at start."""
        self.actor = Actor(self.state_dim, self.action_dim, self.max_action, args)
        self.actor_t = Actor(self.state_dim, self.action_dim, self.max_action, args)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic = Critic(self.state_dim, self.action_dim, args)
        self.critic_t = Critic(self.state_dim, self.action_dim, args)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=self.critic_lr)
        self.loss = nn.MSELoss()
        hard_update(self.actor_t, self.actor)
        hard_update(self.critic_t, self.critic)

    def train(self):
        """One DDPG update from a sampled mini-batch.

        Returns the actor gradients captured just before the optimizer step
        (via self.actor.get_grads()).
        """
        states, n_states, actions, rewards, dones = self.replay_buffer.sample(
            self.batch_size)
        # Compute q target from the target networks (detached: no gradient
        # flows into the targets).
        next_q = self.critic_t(n_states, self.actor_t(n_states))
        q_target = (rewards + self.discount * (1 - dones.float()) * next_q).detach()
        # Compute q predict
        q_predict = self.critic(states, actions)
        # Critic update
        critic_loss = self.loss(q_predict, q_target)
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()
        # Actor update: maximize Q by minimizing its negation.
        actor_loss = -self.critic(states, self.actor(states)).mean()
        self.actor_optim.zero_grad()
        actor_loss.backward()
        # Snapshot gradients before the optimizer consumes them.
        actor_grad = self.actor.get_grads()
        self.actor_optim.step()
        # Polyak-average the target networks toward the online networks.
        soft_update(self.actor_t, self.actor, self.tau)
        soft_update(self.critic_t, self.critic, self.tau)
        return actor_grad
def run(config):
    """Train an AttentionSAC model on parallel environments, updating from a
    shared replay buffer and checkpointing periodically."""
    # Pick the next free 'runN' directory under ./models/<env_id>/<model_name>.
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    # The run number doubles as the random seed.
    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    envActionSpace = env.action_space
    envObservationSpace = env.observation_space
    model = AttentionSAC.init_from_env(
        envActionSpace,
        envObservationSpace,
        tau=config.tau,
        pi_lr=config.pi_lr,
        q_lr=config.q_lr,
        gamma=config.gamma,
        pol_hidden_dim=config.pol_hidden_dim,  # 128
        critic_hidden_dim=config.critic_hidden_dim,  # 128
        attend_heads=config.attend_heads,  # 4
        reward_scale=config.reward_scale)
    # Buffer dimensions per agent: flat obs size and action size (Box -> shape,
    # Discrete -> n).
    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):  # 12
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')
        for et_i in range(config.episode_length):  # 25
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads
                    ):  # 100 steps across rollouts -> 4 updates
                model.prep_training(device='cpu')
                for u_i in range(config.num_updates):  # 4
                    sample = replay_buffer.sample(config.batch_size)
                    model.update_critic(sample)
                    model.update_policies(sample)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
        # Average rewards over the steps of this batch of episodes.
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        # Periodic checkpoint: incremental snapshot plus the rolling 'model.pt'.
        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')
    # Final save after training completes.
    model.save(run_dir / 'model.pt')
    env.close()
def run(config):
    """Evaluate a saved AttentionSAC model, measuring per-agent steps and
    per-step inference time over a batch of episodes.

    Fix: the original timed inference with ``time.clock()``, which was
    deprecated since Python 3.3 and REMOVED in Python 3.8 — it raises
    AttributeError on any modern interpreter. ``time.perf_counter()`` is
    the documented replacement (monotonic, high resolution).
    """
    # Pick the next free 'runN' directory under ./models/<env_id>/<model_name>.
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        run_num = 1 if len(exst_run_nums) == 0 else max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))
    # Fixed seed for a reproducible evaluation.
    torch.manual_seed(1804)
    np.random.seed(1804)
    # initialize E parallel environments with N agents
    env = make_parallel_env(config.env_id, config.n_rollout_threads, 1804)
    model = AttentionSAC.init_from_save('model.pt')
    # model = AttentionSAC.init_from_env(env,
    #                                    tau=config.tau,
    #                                    pi_lr=config.pi_lr,
    #                                    q_lr=config.q_lr,
    #                                    gamma=config.gamma,
    #                                    pol_hidden_dim=config.pol_hidden_dim,
    #                                    critic_hidden_dim=config.critic_hidden_dim,
    #                                    attend_heads=config.attend_heads,
    #                                    reward_scale=config.reward_scale)
    # initialize replay buffer D (kept for parity with training; pushes and
    # updates are disabled below during evaluation)
    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    # T_update
    t = 0
    # Accumulators across episodes for mean/max step counts and wall time.
    max_step = 0
    max_time = 0
    total_step = np.zeros(model.nagents)
    total_time = np.zeros(model.nagents)
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')
        # Per-episode trackers: which (env, agent) pairs finished, how many
        # steps each took, and cumulative decision time while still active.
        success = np.zeros((config.n_rollout_threads, model.nagents), dtype=bool)
        steps = np.zeros((config.n_rollout_threads, model.nagents))
        time_cost = np.zeros((config.n_rollout_threads, model.nagents))
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            # Time only the policy forward pass (deterministic actions).
            start = time.perf_counter()  # was time.clock(); removed in Py3.8
            torch_agent_actions = model.step(torch_obs, explore=False)
            end = time.perf_counter()
            per_time_cost = end - start
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            # calculate steps: once done, an agent stops accruing steps/time
            success = np.logical_or(success, dones)
            # steps += dones
            steps += np.logical_not(dones)
            time_cost += np.logical_not(dones) * per_time_cost
            # store transitions for all env in replay buffer
            # replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            # T_update = T_update + E
            t += config.n_rollout_threads
            # Training updates are disabled during evaluation:
            # if (len(replay_buffer) >= max(config.pi_batch_size, config.q_batch_size) and
            #         (t % config.steps_per_update) < config.n_rollout_threads):
            #     model.prep_training(device='gpu' if config.use_gpu else 'cpu')
            #     for u_i in range(config.num_critic_updates):
            #         sample = replay_buffer.sample(config.q_batch_size, to_gpu=config.use_gpu)
            #         model.update_critic(sample, logger=logger)
            #     for u_i in range(config.num_pol_updates):
            #         sample = replay_buffer.sample(config.pi_batch_size, to_gpu=config.use_gpu)
            #         model.update_policies(sample, logger=logger)
            #     model.update_all_targets()
            model.prep_rollouts(device='cpu')
        # ep_dones = np.mean(success, axis=0)
        # ep_steps = 1 - np.mean(steps / config.episode_length, axis=0)
        # ep_rews = replay_buffer.get_average_rewards(
        #     config.episode_length * config.n_rollout_threads)
        # for a_i, a_ep_rew in enumerate(ep_rews):
        #     logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
        # Accumulate per-agent means across rollout threads, plus worst case.
        total_step += np.mean(steps, axis=0)
        total_time += np.mean(time_cost, axis=0)
        max_step += np.max(steps)
        max_time += np.max(time_cost)
        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            # os.makedirs(run_dir / 'incremental', exist_ok=True)
            # model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            # model.save(run_dir / 'model.pt')
    # Normalize the accumulators by the number of episode batches (100 episodes
    # assumed, split across rollout threads).
    mean_step = total_step / (100 / config.n_rollout_threads)
    mean_time = total_time / (100 / config.n_rollout_threads)
    max_time /= 100 / config.n_rollout_threads
    max_step /= 100 / config.n_rollout_threads
    print('; '.join([
        f'{chr(65 + i)} Mean Step:{mean_step[i]}, Mean Time:{mean_time[i]}'
        for i in range(model.nagents)
    ]))
    print('Mean Max Step:{}, Mean Max Time Cost:{}'.format(max_step, max_time))
    # model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
def run(config):
    """Train a multi-head SAC model with optional count-based intrinsic
    rewards on parallel (possibly ViZDoom) environments.

    Handles crashing/stuck environments by rebuilding them, tracks per-env
    episode statistics, and periodically resamples policy heads based on
    meta-policy returns.
    """
    torch.set_num_threads(1)
    env_descr = 'map%i_%iagents_task%i' % (config.map_ind, config.num_agents,
                                           config.task_config)
    # Pick the next free 'runN' directory for this env/task/model combination.
    model_dir = Path('./models') / config.env_type / env_descr / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1])
                         for folder in model_dir.iterdir()
                         if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))
    # The run number doubles as the random seed.
    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config, run_num)
    if config.nonlinearity == 'relu':
        nonlin = torch.nn.functional.relu
    elif config.nonlinearity == 'leaky_relu':
        nonlin = torch.nn.functional.leaky_relu
    # intrinsic_reward == 0: single extrinsic head; otherwise one head per
    # exploration type with no separate extrinsic head.
    if config.intrinsic_reward == 0:
        n_intr_rew_types = 0
        sep_extr_head = True
    else:
        n_intr_rew_types = len(config.explr_types)
        sep_extr_head = False
    n_rew_heads = n_intr_rew_types + int(sep_extr_head)
    model = SAC.init_from_env(env,
                              nagents=config.num_agents,
                              tau=config.tau,
                              hard_update_interval=config.hard_update,
                              pi_lr=config.pi_lr,
                              q_lr=config.q_lr,
                              phi_lr=config.phi_lr,
                              adam_eps=config.adam_eps,
                              q_decay=config.q_decay,
                              phi_decay=config.phi_decay,
                              gamma_e=config.gamma_e,
                              gamma_i=config.gamma_i,
                              pol_hidden_dim=config.pol_hidden_dim,
                              critic_hidden_dim=config.critic_hidden_dim,
                              nonlin=nonlin,
                              reward_scale=config.reward_scale,
                              head_reward_scale=config.head_reward_scale,
                              beta=config.beta,
                              n_intr_rew_types=n_intr_rew_types,
                              sep_extr_head=sep_extr_head)
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 env.state_space, env.observation_space,
                                 env.action_space)
    # Running mean/std normalizers for intrinsic rewards: [type][agent].
    intr_rew_rms = [[RunningMeanStd() for i in range(config.num_agents)]
                    for j in range(n_intr_rew_types)]
    eps_this_turn = 0  # episodes so far this turn
    active_envs = np.ones(config.n_rollout_threads)  # binary indicator of whether env is active
    # Per-env step counters and reward/return accumulators.
    env_times = np.zeros(config.n_rollout_threads, dtype=int)
    env_ep_extr_rews = np.zeros(config.n_rollout_threads)
    env_extr_rets = np.zeros(config.n_rollout_threads)
    env_ep_intr_rews = [[np.zeros(config.n_rollout_threads)
                         for i in range(config.num_agents)]
                        for j in range(n_intr_rew_types)]
    # Rolling windows over the last 100 episodes for logging.
    recent_ep_extr_rews = deque(maxlen=100)
    recent_ep_intr_rews = [[deque(maxlen=100) for i in range(config.num_agents)]
                           for j in range(n_intr_rew_types)]
    recent_ep_lens = deque(maxlen=100)
    recent_found_treasures = [deque(maxlen=100) for i in range(config.num_agents)]
    meta_turn_rets = []
    extr_ret_rms = [RunningMeanStd() for i in range(n_rew_heads)]
    t = 0
    steps_since_update = 0
    state, obs = env.reset()
    while t < config.train_time:
        model.prep_rollouts(device='cuda' if config.gpu_rollout else 'cpu')
        # convert to torch tensor
        torch_obs = apply_to_all_elements(
            obs,
            lambda x: torch.tensor(x, dtype=torch.float32,
                                   device='cuda' if config.gpu_rollout else 'cpu'))
        # get actions as torch tensors
        torch_agent_actions = model.step(torch_obs, explore=True)
        # convert actions to numpy arrays
        agent_actions = apply_to_all_elements(torch_agent_actions,
                                              lambda x: x.cpu().data.numpy())
        # rearrange actions to be per environment (active envs only)
        actions = [[ac[i] for ac in agent_actions]
                   for i in range(int(active_envs.sum()))]
        try:
            with timeout(seconds=1):
                next_state, next_obs, rewards, dones, infos = env.step(
                    actions, env_mask=active_envs)
        # either environment got stuck or vizdoom crashed (vizdoom is unstable w/ multi-agent scenarios)
        except (TimeoutError, ViZDoomErrorException,
                ViZDoomIsNotRunningException,
                ViZDoomUnexpectedExitException) as e:
            print("Environments are broken...")
            env.close(force=True)
            print("Closed environments, starting new...")
            env = make_parallel_env(config, run_num)
            state, obs = env.reset()
            # Discard the in-flight episode statistics of the active envs.
            env_ep_extr_rews[active_envs.astype(bool)] = 0.0
            env_extr_rets[active_envs.astype(bool)] = 0.0
            for i in range(n_intr_rew_types):
                for j in range(config.num_agents):
                    env_ep_intr_rews[i][j][active_envs.astype(bool)] = 0.0
            env_times = np.zeros(config.n_rollout_threads, dtype=int)
            state = apply_to_all_elements(state,
                                          lambda x: x[active_envs.astype(bool)])
            obs = apply_to_all_elements(obs,
                                        lambda x: x[active_envs.astype(bool)])
            continue
        steps_since_update += int(active_envs.sum())
        if config.intrinsic_reward == 1:
            # if using state-visit counts, store state indices
            # shape = (n_envs, n_agents, n_inds)
            state_inds = np.array([i['visit_count_lookup'] for i in infos],
                                  dtype=int)
            state_inds_t = state_inds.transpose(1, 0, 2)
            novelties = get_count_based_novelties(env, state_inds_t, device='cpu')
            intr_rews = get_intrinsic_rewards(novelties, config, intr_rew_rms,
                                              update_irrms=True,
                                              active_envs=active_envs,
                                              device='cpu')
            intr_rews = apply_to_all_elements(intr_rews,
                                              lambda x: x.numpy().flatten())
        else:
            intr_rews = None
            state_inds = None
            state_inds_t = None
        replay_buffer.push(state, obs, agent_actions, rewards, next_state,
                           next_obs, dones, state_inds=state_inds)
        # Accumulate undiscounted episode reward and discounted return.
        env_ep_extr_rews[active_envs.astype(bool)] += np.array(rewards)
        env_extr_rets[active_envs.astype(bool)] += np.array(rewards) * config.gamma_e**(env_times[active_envs.astype(bool)])
        env_times += active_envs.astype(int)
        if intr_rews is not None:
            for i in range(n_intr_rew_types):
                for j in range(config.num_agents):
                    env_ep_intr_rews[i][j][active_envs.astype(bool)] += intr_rews[i][j]
        # An env needs a reset when done or past the episode length cap.
        over_time = env_times >= config.max_episode_length
        full_dones = np.zeros(config.n_rollout_threads)
        for i, env_i in enumerate(np.where(active_envs)[0]):
            full_dones[env_i] = dones[i]
        need_reset = np.logical_or(full_dones, over_time)
        # create masks ONLY for active envs
        active_over_time = env_times[active_envs.astype(bool)] >= config.max_episode_length
        active_need_reset = np.logical_or(dones, active_over_time)
        if any(need_reset):
            try:
                with timeout(seconds=1):
                    # reset any environments that are past the max number of time steps or done
                    state, obs = env.reset(need_reset=need_reset)
            # either environment got stuck or vizdoom crashed (vizdoom is unstable w/ multi-agent scenarios)
            except (TimeoutError, ViZDoomErrorException,
                    ViZDoomIsNotRunningException,
                    ViZDoomUnexpectedExitException) as e:
                print("Environments are broken...")
                env.close(force=True)
                print("Closed environments, starting new...")
                env = make_parallel_env(config, run_num)
                state, obs = env.reset()
                # other envs that were force reset (rest taken care of in subsequent code)
                other_reset = np.logical_not(need_reset)
                env_ep_extr_rews[other_reset.astype(bool)] = 0.0
                env_extr_rets[other_reset.astype(bool)] = 0.0
                for i in range(n_intr_rew_types):
                    for j in range(config.num_agents):
                        env_ep_intr_rews[i][j][other_reset.astype(bool)] = 0.0
                env_times = np.zeros(config.n_rollout_threads, dtype=int)
        else:
            state, obs = next_state, next_obs
        # Book-keeping for every env that finished an episode this step.
        for env_i in np.where(need_reset)[0]:
            recent_ep_extr_rews.append(env_ep_extr_rews[env_i])
            meta_turn_rets.append(env_extr_rets[env_i])
            if intr_rews is not None:
                for j in range(n_intr_rew_types):
                    for k in range(config.num_agents):
                        # record intrinsic rewards per step (so we don't confuse shorter episodes with less intrinsic rewards)
                        recent_ep_intr_rews[j][k].append(env_ep_intr_rews[j][k][env_i] / env_times[env_i])
                        env_ep_intr_rews[j][k][env_i] = 0
            recent_ep_lens.append(env_times[env_i])
            env_times[env_i] = 0
            env_ep_extr_rews[env_i] = 0
            env_extr_rets[env_i] = 0
            eps_this_turn += 1
            # Deactivate envs once this meta-turn has enough episodes queued.
            if eps_this_turn + active_envs.sum() - 1 >= config.metapol_episodes:
                active_envs[env_i] = 0
        for i in np.where(active_need_reset)[0]:
            for j in range(config.num_agents):
                # len(infos) = number of active envs
                recent_found_treasures[j].append(infos[i]['n_found_treasures'][j])
        # End of a meta-turn: update head-selection distribution and resample.
        if eps_this_turn >= config.metapol_episodes:
            if not config.uniform_heads and n_rew_heads > 1:
                meta_turn_rets = np.array(meta_turn_rets)
                # Initialize all head-return normalizers from the first batch.
                if all(errms.count < 1 for errms in extr_ret_rms):
                    for errms in extr_ret_rms:
                        errms.mean = meta_turn_rets.mean()
                extr_ret_rms[model.curr_pol_heads[0]].update(meta_turn_rets)
                for i in range(config.metapol_updates):
                    model.update_heads_onpol(meta_turn_rets, extr_ret_rms,
                                             logger=logger)
            pol_heads = model.sample_pol_heads(uniform=config.uniform_heads)
            model.set_pol_heads(pol_heads)
            eps_this_turn = 0
            meta_turn_rets = []
            active_envs = np.ones(config.n_rollout_threads)
        if any(need_reset):
            # reset returns state and obs for all envs, so make sure we're only looking at active
            state = apply_to_all_elements(state,
                                          lambda x: x[active_envs.astype(bool)])
            obs = apply_to_all_elements(obs,
                                        lambda x: x[active_envs.astype(bool)])
        # Gradient updates once enough fresh experience has accumulated.
        if (len(replay_buffer) >= max(config.batch_size,
                                      config.steps_before_update) and
                (steps_since_update >= config.steps_per_update)):
            steps_since_update = 0
            print('Updating at time step %i' % t)
            model.prep_training(device='cuda' if config.use_gpu else 'cpu')
            for u_i in range(config.num_updates):
                sample = replay_buffer.sample(config.batch_size,
                                              to_gpu=config.use_gpu,
                                              state_inds=(config.intrinsic_reward == 1))
                if config.intrinsic_reward == 0:
                    # no intrinsic reward
                    intr_rews = None
                    state_inds = None
                else:
                    sample, state_inds = sample
                    novelties = get_count_based_novelties(
                        env, state_inds,
                        device='cuda' if config.use_gpu else 'cpu')
                    intr_rews = get_intrinsic_rewards(
                        novelties, config, intr_rew_rms, update_irrms=False,
                        device='cuda' if config.use_gpu else 'cpu')
                model.update_critic(sample, logger=logger, intr_rews=intr_rews)
                model.update_policies(sample, logger=logger)
                model.update_all_targets()
        # Scalar logging once enough episodes have completed.
        if len(recent_ep_extr_rews) > 10:
            logger.add_scalar('episode_rewards/extrinsic/mean',
                              np.mean(recent_ep_extr_rews), t)
            logger.add_scalar('episode_lengths/mean',
                              np.mean(recent_ep_lens), t)
            if config.intrinsic_reward == 1:
                for i in range(n_intr_rew_types):
                    for j in range(config.num_agents):
                        logger.add_scalar('episode_rewards/intrinsic%i_agent%i/mean' % (i, j),
                                          np.mean(recent_ep_intr_rews[i][j]), t)
            for i in range(config.num_agents):
                logger.add_scalar('agent%i/n_found_treasures' % i,
                                  np.mean(recent_found_treasures[i]), t)
            logger.add_scalar('total_n_found_treasures',
                              sum(np.array(recent_found_treasures[i])
                                  for i in range(config.num_agents)).mean(), t)
        # Periodic checkpoint.
        if t % config.save_interval < config.n_rollout_threads:
            model.prep_training(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_%isteps.pt' % (t + 1)))
            model.save(run_dir / 'model.pt')
        t += active_envs.sum()
    # Final save and teardown.
    model.prep_training(device='cpu')
    model.save(run_dir / 'model.pt')
    logger.close()
    env.close(force=(config.env_type == 'vizdoom'))
def train(config, dir_manager=None, logger=None, pbar="default_pbar"):
    """Train a multi-agent algorithm on a particle or football environment.

    Sets up the run directory tree, the environment, the algorithm and the
    replay buffer, then runs the episode loop with periodic learning updates,
    checkpointing and (optional) evaluation.

    Args:
        config: parsed experiment configuration (argparse-style namespace).
        dir_manager: optional pre-built DirectoryManager; created if None.
        logger: optional logger; a 'MASTER' logger is created if None.
        pbar: a tqdm progress bar, the string "default_pbar" (creates one),
            or None to disable progress reporting.
    """
    # A few safety checks
    check_training_args(config)

    # Creates a directory manager that encapsulates our directory-tree structure
    if dir_manager is None:
        dir_manager = DirectoryManager(agent_alg=config.agent_alg,
                                       env_name=config.env_name,
                                       desc=config.desc,
                                       seed=config.seed)
        dir_manager.create_directories()

    # Creates logger and prints config
    if logger is None:
        logger = create_logger('MASTER', config.log_level,
                               dir_manager.seed_dir / 'logger.out')
    logger.debug(config_to_str(config))

    # Creates a progress-bar
    if type(pbar) is str:
        if pbar == "default_pbar":
            pbar = tqdm()
    if pbar is not None:
        pbar.n = 0
        pbar.desc += f'{dir_manager.storage_dir.name}/{dir_manager.experiment_dir.name}/{dir_manager.seed_dir.name}'
        pbar.total = config.n_episodes

    # Encapsulates in a dict all user-defined params that concern the world
    # (scenario.make_world()); only non-None overrides are forwarded.
    world_params = {}
    world_params['use_dense_rewards'] = config.use_dense_rewards
    if config.env_name == 'chase':
        if config.n_preys is not None:
            world_params['n_preys'] = config.n_preys
        if config.n_preds is not None:
            world_params['n_preds'] = config.n_preds
        if config.prey_variance is not None:
            world_params['prey_variance'] = config.prey_variance
        if config.individual_reward is not None:
            world_params['individual_reward'] = config.individual_reward
    elif config.env_name == 'gather':
        if config.n_agents is not None:
            world_params['n_agents'] = config.n_agents
    elif config.env_name == 'intersection':
        # FIX: the original file had a second, unreachable
        # `elif config.env_name == 'intersection'` branch carrying
        # `by_stander`; both settings are now handled in this single branch.
        if config.n_agents is not None:
            world_params['n_agents'] = config.n_agents
        if config.by_stander is not None:
            world_params['by_stander'] = config.by_stander
    elif config.env_name == 'bounce':
        world_params['episode_length'] = config.episode_length
        if config.line_length is not None:
            world_params['line_length'] = config.line_length
    elif config.env_name == 'compromise':
        if config.line_length is not None:
            world_params['line_length'] = config.line_length
        if config.show_all_landmarks is not None:
            world_params['show_all_landmarks'] = config.show_all_landmarks
    elif config.env_name == 'imitation':
        if config.staged is not None:
            world_params['staged'] = config.staged
        if config.set_trap is not None:
            world_params['set_trap'] = config.set_trap
    elif config.env_name == 'spread':
        if config.n_agents is not None:
            world_params['n_agents'] = config.n_agents
        if config.shuffle_landmarks is not None:
            world_params['shuffle_landmarks'] = config.shuffle_landmarks
        if config.color_objects is not None:
            world_params['color_objects'] = config.color_objects
        if config.small_agents is not None:
            world_params['small_agents'] = config.small_agents
    save_dict_to_json(world_params, str(dir_manager.seed_dir / 'world_params.json'))

    # Encapsulates in a dict all user-defined params that concern the
    # environment (multiagent.environment.MultiAgentEnv)
    env_params = {}
    env_params['env_name'] = config.env_name
    if 'football' not in config.env_name:
        env_params['use_max_speed'] = config.use_max_speed
    save_dict_to_json(env_params, str(dir_manager.seed_dir / 'env_params.json'))

    # Sets the random seeds (for reproducibility)
    set_seeds(config.seed)

    # Initializes environments
    # TODO: Check reproducibility and that different envs are seeded differently
    if '3v2football' == config.env_name:
        obs_rep = config.representation
        if config.feature_extractor == 'identity':
            assert obs_rep in ['simple115', 'simple37']
        elif config.feature_extractor == 'convNet':
            assert obs_rep == 'extracted'
        else:
            # FIX: was `raise NotImplemented(...)` — NotImplemented is a
            # sentinel value, not an exception; raising it is a TypeError.
            raise NotImplementedError(
                f"config.feature_extractor={config.feature_extractor} not recognized."
            )
        env = make_parallel_football_env(
            seed_dir=dir_manager.seed_dir,
            seed=config.seed,
            dump_freq=config.dump_freq,
            representation=obs_rep,
            render=False,
            n_rollout_threads=config.n_rollout_threads
        )  # no rendering during training
    else:
        env = make_parallel_particle_env(
            scenario_name=config.env_name,
            n_rollout_threads=config.n_rollout_threads,
            seed=config.seed,
            use_discrete_action=config.use_discrete_action,
            use_max_speed=config.use_max_speed,
            world_params=world_params)

    if not config.use_cuda:
        torch.set_num_threads(config.n_training_threads)

    # Initialize the algo
    algorithm = init_from_config(env, config, logger)

    # Creates recorders and stores basic info regarding agent types
    os.makedirs(dir_manager.recorders_dir, exist_ok=True)
    train_recorder = algorithm.create_train_recorder()
    train_recorder.tape['agent_colors'] = env.agent_colors

    if 'football' in config.env_name:
        # Football observations are frame-stacked when using a conv extractor.
        if config.feature_extractor == "convNet":
            n_stack = 4
        elif config.feature_extractor == "identity":
            n_stack = 1
        else:
            # FIX: was `raise NotImplemented` (see above).
            raise NotImplementedError
        obs_buffers = ObsBufferCollection(n_env=config.n_rollout_threads,
                                          n_stack=n_stack)
        replay_buffer = StackingReplayBuffer(
            max_steps=config.buffer_length,
            num_agents=algorithm.nagents,
            obs_dims=[obsp.shape for obsp in env.observation_space],
            ac_dims=[
                acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                for acsp in env.action_space
            ],
            n_stack=n_stack)
    else:
        # defines observation buffer for multi-step
        obs_buffers = ObsBufferCollection(n_env=config.n_rollout_threads,
                                          n_stack=1)
        replay_buffer = ReplayBuffer(
            max_steps=config.buffer_length,
            num_agents=algorithm.nagents,
            obs_dims=[obsp.shape for obsp in env.observation_space],
            ac_dims=[
                acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                for acsp in env.action_space
            ])

    # Saves initial models (current + best-so-far placeholders)
    current_model = "model_ep0.pt"
    best_eval_reward_exploit = -100000.
    best_model_exploit = "model_ep0_exploit_best.pt"
    algorithm.save(dir_manager.seed_dir / current_model)
    algorithm.save(dir_manager.seed_dir / best_model_exploit)
    best_eval_reward_explore = -100000.
    best_model_explore = "model_ep0_explore_best.pt"
    # FIX: a second, redundant save of `current_model` was removed here.
    algorithm.save(dir_manager.seed_dir / best_model_explore)

    # Initializes step and episode counters
    step_i = 0
    # FIX: `dtype=np.int` was removed in NumPy 1.24; the builtin `int`
    # is the documented replacement.
    ep_steps = np.zeros(shape=(config.n_rollout_threads, ), dtype=int)
    ep_dones = 0
    ep_recorders = [
        EpisodeRecorder(stuff_to_record=['reward'])
        for _ in range(config.n_rollout_threads)
    ]
    obs = env.reset()
    obs_buffers.fill(obs)
    algorithm.set_exploration(
        begin_decay_proportion=config.begin_exploration_decay,
        n_episodes=config.n_episodes,
        end_decay_proportion=config.end_exploration_decay,
        initial_scale=config.init_noise_scale,
        final_scale=config.final_noise_scale,
        current_episode=ep_dones)

    # EPISODES LOOP
    while ep_dones < config.n_episodes:
        start_time = time.time()

        # ENVIRONMENT STEP
        # convert observations to torch Variable (one tensor per agent)
        torch_obs = [
            Variable(torch.Tensor(obs_buffers.read()[:, i]),
                     requires_grad=False) for i in range(algorithm.nagents)
        ]
        # get actions as torch Variables
        torch_agent_actions = algorithm.select_action(torch_obs,
                                                      is_exploring=True)
        # convert actions to numpy arrays
        agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
        # rearrange actions to be per environment
        actions = [[ac[i] for ac in agent_actions]
                   for i in range(config.n_rollout_threads)]
        # makes one step in the environment
        next_obs, rewards, dones, infos = env.step(actions)
        # put transitions in the memory buffer
        replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
        # saves relevant info in episode recorders
        for i in range(config.n_rollout_threads):
            ep_recorders[i].add_step(obs[i], actions[i], rewards[i],
                                     next_obs[i])
        # ending step
        obs = next_obs
        obs_buffers.append(obs)
        step_i += config.n_rollout_threads
        step_time = time.time() - start_time
        ep_steps += 1

        # LEARNING STEP (after warmup, once per `steps_per_update` env steps)
        if (len(replay_buffer) >= config.batch_size * config.warmup) \
                and (step_i % config.steps_per_update) < config.n_rollout_threads:
            # Prepares models to training
            if config.use_cuda:
                algorithm.prep_training(device='gpu')
            else:
                algorithm.prep_training(device='cpu')
            # Performs one algorithm update
            sample = replay_buffer.sample(config.batch_size,
                                          to_gpu=config.use_cuda,
                                          normalize_rewards=False)
            algorithm.update(sample, train_recorder)
            # Update target networks
            algorithm.update_all_targets()
            # Prepares models to go back in rollout phase
            algorithm.prep_rollouts(device='cpu')

        # EPISODE ENDINGS (either env signalled done or step budget spent)
        episodes_over = dones | (ep_steps >= config.episode_length)
        if any(episodes_over):
            if pbar is not None:
                pbar.update(sum(episodes_over))
            for env_i, is_over in enumerate(episodes_over):
                if is_over:
                    ep_dones += 1
                    ep_steps[env_i] = 0
                    # Reset only the finished environment
                    obs[env_i] = env.reset(env_i=env_i)
                    obs_buffers[env_i].flush()
                    obs_buffers[env_i].fill(obs[env_i])
                    # Summarizes episode metrics
                    train_recorder.append(
                        'total_reward',
                        ep_recorders[env_i].get_total_reward())
                    # Reinitialise episode recorder
                    ep_recorders[env_i] = EpisodeRecorder(
                        stuff_to_record=['reward'])

            # Printing if one third of training is completed
            if (ep_dones - 1) % (config.n_episodes // 3) == 0 and ep_dones != config.n_episodes:
                step_time = time.time() - start_time
                logger.info(
                    f"Episode {ep_dones}/{config.n_episodes}, "
                    f"speed={round_to_two(float(config.n_rollout_threads) / step_time)}steps/s"
                )

            # Sets exploration noise for the new episode count
            current_noise_scale = algorithm.set_exploration(
                begin_decay_proportion=config.begin_exploration_decay,
                n_episodes=config.n_episodes,
                end_decay_proportion=config.end_exploration_decay,
                initial_scale=config.init_noise_scale,
                final_scale=config.final_noise_scale,
                current_episode=ep_dones)

            # BOOK-KEEPING
            if ep_dones % config.episodes_per_save < config.n_rollout_threads:
                # Model checkpoints
                if config.save_incrementals:
                    os.makedirs(dir_manager.incrementals_dir, exist_ok=True)
                    algorithm.save(dir_manager.incrementals_dir /
                                   ('model_ep%i.pt' % (ep_dones + 1)))
                os.remove(dir_manager.seed_dir / current_model)
                current_model = f"model_ep{ep_dones}.pt"
                algorithm.save(dir_manager.seed_dir / current_model)
                logger.debug('Saving model checkpoint')

                # Current model evaluation (run episodes without exploration)
                if config.n_evaluation_episodes > 0:
                    logger.debug(
                        f'Evaluating model for {config.n_evaluation_episodes} episodes'
                    )
                    set_seeds(
                        config.evaluation_seed)  # fixed seed for evaluation
                    env.seed(config.evaluation_seed)
                    eval_config = get_evaluation_args(overwritten_args="")
                    eval_config.storage_name = dir_manager.storage_dir.name
                    eval_config.experiment_num = int(
                        dir_manager.experiment_dir.stem.strip('experiment'))
                    eval_config.seed_num = int(
                        dir_manager.seed_dir.stem.strip('seed'))
                    eval_config.render = False
                    eval_config.n_episodes = config.n_evaluation_episodes
                    eval_config.last_model = True
                    eval_config.noise_scale = None
                    eval_config.episode_length = config.episode_length
                    eval_config.representation = config.representation

                    # Evaluate with exploit (without exploration)
                    eval_reward_exploit = np.vstack(evaluate(eval_config))
                    train_recorder.append('eval_episodes', ep_dones)
                    train_recorder.append('eval_total_reward_exploit',
                                          eval_reward_exploit)
                    if eval_reward_exploit.mean() > best_eval_reward_exploit:
                        logger.debug("New best exploit model")
                        os.remove(dir_manager.seed_dir / best_model_exploit)
                        best_model_exploit = f"model_ep{ep_dones}_exploit_best.pt"
                        algorithm.save(dir_manager.seed_dir /
                                       best_model_exploit)
                        best_eval_reward_exploit = eval_reward_exploit.mean()

                    # Evaluate with exploration
                    eval_config.noise_scale = current_noise_scale
                    eval_reward_explore = np.vstack(evaluate(eval_config))
                    train_recorder.append('eval_total_reward_explore',
                                          eval_reward_explore)
                    if eval_reward_explore.mean() > best_eval_reward_explore:
                        logger.debug("New best explore model")
                        os.remove(dir_manager.seed_dir / best_model_explore)
                        best_model_explore = f"model_ep{ep_dones}_explore_best.pt"
                        algorithm.save(dir_manager.seed_dir /
                                       best_model_explore)
                        best_eval_reward_explore = eval_reward_explore.mean()

                    # Restore training seeds after the fixed-seed evaluation
                    set_seeds(config.seed + ep_dones)
                    env.seed(config.seed + ep_dones)

                # Graphs checkpoints
                logger.debug('Saving recorder checkpoints and graphs')
                train_recorder.save(dir_manager.recorders_dir /
                                    'train_recorder.pkl')
                # Saving graphs
                if len(train_recorder.tape['actor_loss']) > 0:
                    algorithm.save_training_graphs(
                        train_recorder=train_recorder,
                        save_dir=dir_manager.seed_dir)

    # Saves model one last time and close the environment
    os.remove(dir_manager.seed_dir / current_model)
    current_model = f"model_ep{ep_dones}.pt"
    algorithm.save(dir_manager.seed_dir / current_model)
    env.close()
def run(config):
    """Train MADDPG with noisy parameter sharing and periodic validation.

    Creates a fresh runN directory under ./models/<env_id>/<model_name>,
    trains for config.n_episodes episodes, logs rewards to TensorBoard,
    and every `validate_every_n_eps` episodes runs exploration-free
    validation episodes whose statistics are printed/appended to a log file.
    """
    model_dir = Path('./models') / config.env_id / config.model_name
    # Pick the next free runN directory name.
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)
    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim,
                                  noisy_sharing=True,
                                  noisy_SNR=config.noisy_SNR,
                                  game_id=config.env_id,
                                  est_ac=config.est_action)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    print(
        '#########################################################################'
    )
    print('Adversary using: ', config.adversary_alg, 'Good agent using: ',
          config.agent_alg, '\n')
    print('Noisy SNR is: ', config.noisy_SNR)
    print(
        '#########################################################################'
    )

    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')
        # FIX: the original halved the learning rate when ep_i % 5000 == 0,
        # which also fires at ep_i == 0 and halves the initial lr before any
        # training; the decay now starts from episode 5000.
        if ep_i % 5000 == 0 and ep_i > 0:
            maddpg.lr *= 0.5
        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size
                    and (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')

        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            print("Episodes %i-%i of %i, rewards are: \n" %
                  (ep_i + 1, ep_i + 1 + config.n_rollout_threads,
                   config.n_episodes))
            for a_i, a_ep_rew in enumerate(ep_rews):
                print('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' /
                        ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

        # *** perform validation every 1000 episodes. i.e. run N=10 times without exploration ***
        if ep_i % config.validate_every_n_eps == config.validate_every_n_eps - 1:
            # assumes only one environment is running
            episodes_stats = []
            info_for_one_env_among_timesteps = []
            print('*' * 10, 'Validation BEGINS', '*' * 10)
            for valid_et_i in range(config.run_n_eps_in_validation):
                obs = env.reset()
                maddpg.prep_rollouts(device='cpu')
                explr_pct_remaining = max(
                    0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
                maddpg.scale_noise(
                    config.final_noise_scale +
                    (config.init_noise_scale - config.final_noise_scale) *
                    explr_pct_remaining)
                maddpg.reset_noise()
                curr_episode_stats = []
                for et_i in range(config.episode_length):
                    # rearrange observations to be per agent, and convert to torch Variable
                    torch_obs = [
                        Variable(torch.Tensor(np.vstack(obs[:, i])),
                                 requires_grad=False)
                        for i in range(maddpg.nagents)
                    ]
                    # get actions as torch Variables (exploration disabled)
                    torch_agent_actions = maddpg.step(torch_obs,
                                                      explore=False)
                    # convert actions to numpy arrays
                    agent_actions = [
                        ac.data.numpy() for ac in torch_agent_actions
                    ]
                    # rearrange actions to be per environment
                    actions = [[ac[i] for ac in agent_actions]
                               for i in range(config.n_rollout_threads)]
                    next_obs, rewards, dones, infos = env.step(actions)
                    info_for_one_env_among_timesteps.append(infos[0]['n'])
                    curr_episode_stats.append(infos[0]['n'])
                    obs = next_obs
                episodes_stats.append(curr_episode_stats)

            print('Summary statistics:')
            if config.env_id == 'simple_tag':
                # avg_collisions = sum(map(sum,info_for_one_env_among_timesteps))/config.run_n_eps_in_validation
                episodes_stats = np.array(episodes_stats)
                # validation logging (the `with` block closes the file)
                with open(f'{config.model_name}.log', 'a') as valid_logfile:
                    valid_logwriter = csv.writer(valid_logfile, delimiter=' ')
                    valid_logwriter.writerow(
                        np.sum(episodes_stats, axis=(1, 2)).tolist())
                avg_collisions = np.sum(
                    episodes_stats) / episodes_stats.shape[0]
                print(f'Avg of collisions: {avg_collisions}')
            elif config.env_id == 'simple_speaker_listener':
                for i, stat in enumerate(info_for_one_env_among_timesteps):
                    print(f'ep {i}: {stat}')
            else:
                raise NotImplementedError
            print('*' * 10, 'Validation ENDS', '*' * 10)
        # *** END of VALIDATION ***

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
    # FIX: removed trailing `valid_logfile.close()` — the log file is opened
    # in a `with` block (already closed), and the name is undefined here
    # (NameError) whenever the simple_tag validation branch never ran.
def run(config):
    """Roll out a pre-trained AttentionSAC model while continuing updates.

    Loads a saved model from a hard-coded checkpoint path, runs it in the
    environment with exploration enabled, keeps performing critic/policy
    updates, and records episode lengths to 'Timesteps_vs_Episodes.csv'.
    NOTE(review): despite being used as a test script, actions are sampled
    with explore=True — confirm this is intended.
    """
    model_dir = Path('./models') / config.env_id / config.model_name
    # Pick the next free runN number under the model directory.
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    # Seeds derive from the run number, so each run is reproducible.
    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    # model = AttentionSAC.init_from_env(env,
    #                                    tau=config.tau,
    #                                    pi_lr=config.pi_lr,
    #                                    q_lr=config.q_lr,
    #                                    gamma=config.gamma,
    #                                    pol_hidden_dim=config.pol_hidden_dim,
    #                                    critic_hidden_dim=config.critic_hidden_dim,
    #                                    attend_heads=config.attend_heads,
    #                                    reward_scale=config.reward_scale)
    # Model used to test with adversarial agent
    # model= AttentionSAC.init_from_save ("C:\\Users\\HP\\Desktop\\NTU\\FYP\\FYP Code\\MAAC\\Output\\run140\\model.pt")
    # print("Model instantiated")
    # Model used to test without adversarial agent
    # NOTE(review): absolute Windows path — this only works on the original
    # author's machine; consider moving it into config.
    model = AttentionSAC.init_from_save("C:\\Users\\HP\\Desktop\\NTU\\FYP\\FYP Code\\MAAC\\Output\\run148\\model.pt")
    print("Model instantiated")
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    row_list = []  # (episode number, steps taken) pairs for the CSV report
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            # print (rewards)
            # print (dones[0])
            # env.render('human')
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            # Gated learning step: enough samples and on the update cadence.
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    #print(sample)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
            # Cut the episode short when the first agent of the first env is done.
            if (dones[0][0]):
                print("Breakin the epsiodeeeee at timestep", et_i)
                break
        # et_i is the index of the last executed step; +1 converts it into
        # the number of steps taken (both on break and on full episodes).
        et_i += 1
        row_list.append((ep_i+1,et_i))
        # Average recent rewards over exactly the steps this episode produced.
        ep_rews = replay_buffer.get_average_rewards(
            et_i * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            # Scale by episode length to recover a per-episode total.
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew * et_i, ep_i)
        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')
        # Rewrite the episode-length CSV from scratch each episode so the
        # report is up to date even if the run is interrupted.
        with open('Timesteps_vs_Episodes.csv', 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["Ep No", "Number of Timesteps"])
            for row in row_list:
                writer.writerow(row)

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
def run(config):
    """Train MADDPG on a speaker/listener-style scenario and plot rewards.

    Trains for config.n_episodes episodes, tracks the mean reward over each
    block of 100 episodes, periodically saves checkpoints, and at the end
    saves/plots the 100-episode average reward curve.
    """
    model_dir = Path('./models') / config.env_id / config.model_name
    # Pick the next free runN directory name.
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    #logger = SummaryWriter(str(log_dir))
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)
    # FIX: removed a dead `if(env=='simple_reference')` block — `env` is a
    # parallel-environment object, so comparing it to a string was always
    # False, and the block's body referenced undefined names
    # (agent_init_params, num_in_pol, ...) that would have raised NameError.
    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    episode_average_rewards = []          # per-episode total mean rewards, reset every 100 eps
    hundred_episode_average_rewards = []  # mean of each 100-episode block

    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        # Every 100 episodes, summarize the block and reset the accumulator.
        if (ep_i % 100 == 0 and ep_i > 0):
            hundred_episode_average_rewards.append(np.mean(episode_average_rewards))
            print('Rewards till', ep_i, '=', hundred_episode_average_rewards[-1])
            # torch_agent_actions is left over from the previous episode's
            # last step — printed here for a quick qualitative check.
            print('Agent Actions=', torch_agent_actions)
            episode_average_rewards = []
        obs = env.reset()
        rewards_for_this_episode = []
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')
        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            rewards_for_this_episode.append(np.mean(rewards))
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i)  #, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
            # Debug visualization after 10k episodes.
            if ep_i > 10000:
                print('Goal Color=', torch_obs[0])
                print('Communication=', agent_actions[0])
                env.render()
                time.sleep(0.01)
            # NOTE(review): leftover debug breakpoint — this halts an
            # unattended run once ep_i exceeds 100000; remove when done.
            if ep_i > 100000:
                import ipdb
                ipdb.set_trace()

        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        episode_average_rewards.append(np.sum(rewards_for_this_episode))
        #for a_i, a_ep_rew in enumerate(ep_rews):
        #logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    # FIX: the x-axis is derived from the number of collected points instead
    # of `range(1, config.n_episodes//100)`, which could mismatch the data
    # length and make plt.plot raise ValueError.
    plt.plot(100 * np.arange(1, len(hundred_episode_average_rewards) + 1),
             hundred_episode_average_rewards)
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward for 100 episodes')
    plt.title('Speaker Discrete and Mover Continuous')
    # FIX: plt.show() takes no filename — the original `plt.show('plot.png')`
    # never wrote the figure to disk; save it explicitly, then display.
    plt.savefig('plot.png')
    plt.show()
    maddpg.save(run_dir / 'model.pt')
    env.close()
def run(config):
    """Train AttentionSAC (MAAC) on a parallel multi-agent environment.

    Builds a fixed 'run1' directory, seeds torch/numpy from the run number,
    then runs the standard collect -> (gated) update -> log -> checkpoint
    loop for config.n_episodes episodes.
    """
    model_dir = Path('./models') / config.env_id / config.model_name
    # if not model_dir.exists():
    #     run_num = 1
    # else:
    #     exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
    #                      model_dir.iterdir() if
    #                      str(folder.name).startswith('run')]
    #     if len(exst_run_nums) == 0:
    #         run_num = 1
    #     else:
    #         run_num = max(exst_run_nums) + 1
    # NOTE(review): run number is hard-coded to 1 (auto-increment above is
    # commented out), so re-running overwrites run1's contents.
    run_num = 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir, exist_ok=True)
    logger = SummaryWriter(str(log_dir))

    # Seeds derive from the run number for reproducibility.
    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    model = AttentionSAC.init_from_env(env,
                                       tau=config.tau,
                                       pi_lr=config.pi_lr,
                                       q_lr=config.q_lr,
                                       gamma=config.gamma,
                                       pol_hidden_dim=config.pol_hidden_dim,
                                       critic_hidden_dim=config.critic_hidden_dim,
                                       attend_heads=config.attend_heads,
                                       reward_scale=config.reward_scale)
    # Per-agent observation/action dims; Box -> continuous dim, else discrete n.
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0  # total environment steps across all rollout threads
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            # Gated learning step: enough samples in the buffer and on the
            # steps_per_update cadence (modulo window covers all threads).
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')

        # Mean per-step rewards over the steps of this episode batch.
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            # Scale by episode length to log a per-episode total.
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew * config.episode_length, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
def run(args, **args_dict):
    """MADDPG training loop with periodic greedy evaluation and visdom plots.

    Trains on the parallel env built from **args_dict, evaluates without
    exploration every args.test_interval episodes, and saves checkpoints
    when the evaluated reward / travelled distance ('pos') improves.
    Relies on a module-level visdom handle `vis`.
    """
    reward_flag, pos_flag = None, None  # visdom window handles
    # Best evaluation metrics seen so far.
    save_data = {'reward': -1000., 'pos': 0.}
    # model_dir = Path('./models') / config.env_id / config.model_name
    # if not model_dir.exists():
    #     curr_run = 'run1'
    # else:
    #     exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
    #                      model_dir.iterdir() if
    #                      str(folder.name).startswith('run')]
    #     if len(exst_run_nums) == 0:
    #         curr_run = 'run1'
    #     else:
    #         curr_run = 'run%i' % (max(exst_run_nums) + 1)
    # run_dir = model_dir / curr_run
    # log_dir = run_dir / 'logs'
    # os.makedirs(log_dir)
    th.manual_seed(args.seed)
    np.random.seed(args.seed)
    # Pick the tensor constructor once so rollouts build on the right device.
    if not args.use_cuda or not th.cuda.is_available():
        # th.set_num_threads(args.n_training_threads)
        FloatTensor = th.FloatTensor
    else:
        FloatTensor = th.cuda.FloatTensor
    env = make_parallel_env(**args_dict)
    maddpg = MADDPG.init_from_env(env, args)
    replay_buffer = ReplayBuffer(
        args.capacity, args.n_agents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0  # total env steps across threads
    for ep_i in range(0, args.n_episodes, args.n_rollout_threads):
        ttt = time.time()
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        if args.use_cuda and th.cuda.is_available():
            maddpg.prep_rollouts(device='gpu')
        else:
            maddpg.prep_rollouts(device='cpu')
        # maddpg.prep_rollouts(device='cpu')
        # Linearly anneal exploration noise over n_exploration_eps episodes.
        explr_pct_remaining = max(
            0, args.n_exploration_eps - ep_i) / args.n_exploration_eps
        scale_noise_i = args.final_noise_scale + (
            args.init_noise_scale -
            args.final_noise_scale) * explr_pct_remaining
        maddpg.scale_noise(scale_noise_i)
        maddpg.reset_noise()
        print("Episodes %i-%i of %i, replay: %.2f, explore: %.2f" %
              (ep_i + 1, ep_i + 1 + args.n_rollout_threads, args.n_episodes,
               float(len(replay_buffer)) / replay_buffer.max_steps,
               scale_noise_i))
        for et_i in range(args.max_steps):
            ttt = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                th.from_numpy(np.vstack(obs[:, i])).type(FloatTensor)
                for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [
                ac.detach().cpu().numpy() for ac in torch_agent_actions
            ]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(args.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += args.n_rollout_threads
            # # ttt2 = time.time()
            # print('1', ttt2 - ttt)
            # #
            if (len(replay_buffer) >= args.batch_size and
                    (t % args.steps_per_update) < args.n_rollout_threads):
                ttt = time.time()
                if args.use_cuda and th.cuda.is_available():
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                # for u_i in range(args.n_rollout_threads):
                # One sampled update per agent, then refresh all targets.
                for a_i in range(maddpg.nagents):
                    sample = replay_buffer.sample(
                        args.batch_size,
                        to_gpu=args.use_cuda and th.cuda.is_available(),
                        norm_rews=args.norm_rews)
                    _, _, _ = maddpg.update(sample, a_i)
                maddpg.update_all_targets()
                if args.use_cuda and th.cuda.is_available():
                    maddpg.prep_rollouts(device='gpu')
                else:
                    maddpg.prep_rollouts(device='cpu')
                # maddpg.prep_rollouts(device='cpu')
            # # ttt2 = time.time()
            # print('2', ttt2 - ttt)
            # #
        # ep_rews = replay_buffer.get_average_rewards(
        #     config.episode_length * config.n_rollout_threads)
        # for a_i, a_ep_rew in enumerate(ep_rews):
        #     logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
        if ep_i % args.test_interval < args.n_rollout_threads:
            # ---- greedy evaluation: no exploration noise, no gradients ----
            ttt = time.time()
            obs = env.reset()
            if args.use_cuda and th.cuda.is_available():
                maddpg.prep_rollouts(device='gpu')
            else:
                maddpg.prep_rollouts(device='cpu')
            # maddpg.prep_rollouts(device='cpu')
            with th.no_grad():
                pos_total = 0.  # summed infos['pos'] over finished episodes
                finish_ep = np.zeros(args.n_rollout_threads)  # finished count per thread
                r_total = np.zeros((args.n_rollout_threads, args.n_agents))
                record_r = np.zeros(args.n_agents)
                for eval_i in range(args.max_steps):
                    torch_obs = [
                        FloatTensor(np.vstack(obs[:, i]))
                        for i in range(maddpg.nagents)
                    ]
                    torch_agent_actions = maddpg.step(torch_obs, explore=False)
                    agent_actions = [
                        ac.detach().cpu().numpy()
                        for ac in torch_agent_actions
                    ]
                    actions = [[ac[i] for ac in agent_actions]
                               for i in range(args.n_rollout_threads)]
                    next_obs, rewards, dones, infos = env.step(actions)
                    r_total += rewards
                    obs = next_obs
                    # Credit a thread's episode when it terminates, or force
                    # credit at the last step if it never finished.
                    for d_i in range(dones.shape[0]):
                        if dones[d_i] or (eval_i == args.max_steps - 1
                                          and finish_ep[d_i] == 0.):
                            # if eval_i == args.max_steps - 1 and finish_ep[d_i] == 0.:
                            # print(d_i)
                            pos_total += infos[d_i]['pos']
                            record_r += r_total[d_i]
                            r_total[d_i] = [0., 0.]
                            finish_ep[d_i] += 1
                # Average over the number of credited episodes.
                record_r /= finish_ep.sum()
                pos_total /= finish_ep.sum()
            # ttt2 = time.time()
            # print('3', ttt2 - ttt)
            # new_path = model_path + '/' + str(ep_i) + '.pt'
            # NOTE(review): new_path's assignment above is commented out, so
            # every maddpg.save(new_path) below raises NameError the first
            # time a save condition triggers; restore it (model_path must
            # also be in scope).
            # NOTE(review): has_saved is never set to True, so the
            # 'and not has_saved' guard below is ineffective and the model
            # may be saved more than once per evaluation.
            has_saved = False
            if record_r.sum() > save_data['reward']:
                save_data['reward'] = record_r.sum()
                if save_data['reward'] > 0 and pos_total > 10.:
                    # pathlib.Path(new_path).mkdir(parents=True, exist_ok=True)
                    maddpg.save(new_path)
            if pos_total > save_data['pos']:
                save_data['pos'] = pos_total
                if record_r.sum() > 0 and pos_total > 10. and not has_saved:
                    # pathlib.Path(new_path).mkdir(parents=True, exist_ok=True)
                    maddpg.save(new_path)
            if pos_total > 17.0:
                maddpg.save(new_path)
            # Plot per-agent + total evaluation reward in visdom.
            if reward_flag is None:
                reward_flag = vis.line(
                    X=np.arange(ep_i, ep_i + 1),
                    Y=np.array([np.append(record_r, record_r.sum())]),
                    opts=dict(ylabel='Test Reward',
                              xlabel='Episode',
                              title='Reward',
                              legend=[
                                  'Agent-%d' % i
                                  for i in range(args.n_agents)
                              ] + ['Total']))
            else:
                vis.line(X=np.array(
                    [np.array(ep_i).repeat(args.n_agents + 1)]),
                         Y=np.array([np.append(record_r, record_r.sum())]),
                         win=reward_flag,
                         update='append')
            # Plot travelled distance.
            if pos_flag is None:
                pos_flag = vis.line(X=np.arange(ep_i, ep_i + 1),
                                    Y=np.array([pos_total]),
                                    opts=dict(ylabel='Length',
                                              xlabel='Episode',
                                              title='How far ?',
                                              legend=['position']))
            else:
                vis.line(X=np.array([ep_i]),
                         Y=np.array([pos_total]),
                         win=pos_flag,
                         update='append')
        # if ep_i % config.save_interval < config.n_rollout_threads:
        #     os.makedirs(run_dir / 'incremental', exist_ok=True)
        #     maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
        #     maddpg.save(run_dir / 'model.pt')
        # maddpg.save(run_dir / 'model.pt')
    env.close()
def run(config):
    """MADDPG training loop for the 'Materials Transport' grid environment.

    The joint action has 4 slots: two scripted players that always emit a
    fixed one-hot action (index 3), followed by two learned car agents
    driven by MADDPG.  Logs per-agent episode rewards plus a 100-episode
    sliding window of the raw episode score to tensorboard, and saves
    incremental + final checkpoints.

    Fix vs. original: the sliding-window stats were logged as
    ``add_scalar('results/completion_window' % value, ep_i)`` — a '%' on a
    string with no placeholder raises TypeError, and the value argument was
    dropped entirely.  Both calls now pass (tag, value, step), with a
    distinct tag for the variance.
    """
    scores_window = deque(maxlen=100)  # sliding window of episode scores
    # Pick the next free 'runN' directory under ./models/<env>/<model>.
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        curr_run = ('run1' if len(exst_run_nums) == 0 else
                    'run%i' % (max(exst_run_nums) + 1))
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    # transport configuration
    name = 'Materials Transport'
    conf = {
        'n_player': 2,  # number of players
        'board_width': 11,  # map width
        'board_height': 11,  # map height
        'n_cell_type': 5,  # number of cell types
        'materials': 4,  # number of material depots
        'cars': 2,  # number of cars
        'planes': 0,  # number of planes
        'barriers': 12,  # number of fixed obstacles
        'max_step': 500,  # maximum steps per game
        'game_name': name,  # game name
        'K': 5,  # refresh depot materials every K games
        'map_path': 'env/map.txt',  # initial map file
        'cell_range': 6,  # value range per cell dim (tuple; bare int auto-wrapped)
        'ob_board_width': None,  # per-agent observed grid width (tuple); None = full grid
        'ob_board_height': None,  # per-agent observed grid height (tuple); None = full grid
        'ob_cell_range': None,  # per-agent observed cell range (2-D tuple); None = full range
    }
    env = make_parallel_env_transport(config.env_id, conf,
                                      config.n_rollout_threads, config.seed,
                                      config.discrete_action)
    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0  # total env steps across threads
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        score = 0
        # print("Episodes %i-%i of %i" % (ep_i + 1,
        #                                 ep_i + 1 + config.n_rollout_threads,
        #                                 config.n_episodes))
        obs = env.reset()  # TODO: TO CHECK
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')
        # Linearly anneal exploration noise over n_exploration_eps episodes.
        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables (note: no exploration noise here)
            torch_agent_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            # First two joint-action slots are scripted players that always
            # take the fixed one-hot action with index 3.
            joint_action = []
            for i in range(2):
                player = []
                for j in range(1):
                    each = [0] * 11
                    # idx = np.random.randint(11)
                    each[3] = 1
                    player.append(each)
                joint_action.append(player)
            # Remaining two slots come from the learned MADDPG agents.
            for m in range(2):
                joint_action.append([actions[0][m].astype(int).tolist()])
            next_obs, rewards, dones, infos = env.step(joint_action)
            # Only the learned agents' actions are stored for training.
            agents_action = actions[0]
            replay_buffer.push(obs, agents_action, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
            score += rewards[0][0]  # raw score of thread 0 / agent 0
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' /
                        ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')
        scores_window.append(score)
        reward_epi = np.mean(scores_window)
        reward_epi_var = np.var(scores_window)
        # FIX: pass (tag, value, step); the original formatted the value into
        # the tag with '%', raising TypeError and losing the value.
        logger.add_scalar('results/completion_window', reward_epi, ep_i)
        logger.add_scalar('results/completion_window_var', reward_epi_var,
                          ep_i)
        print(
            '\r Episode {}\t Average Reward: {:.3f}\t Var Reward: {:.3f} \t '.
            format(ep_i, reward_epi, reward_epi_var))
    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
class DDPG(Chain): def __init__(self): super(DDPG, self).__init__( actor=Actor(), critic=Critic(), ) self.target_actor = deepcopy(self.actor) self.target_critic = deepcopy(self.critic) disable_train(self.target_actor) disable_train(self.target_critic) self.noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(A_DIM)) self.buffer = ReplayBuffer(BUFFER_SIZE) self.time = 0 def reset(self, s): self.prev_s = s self.noise.reset() def step(self, s, r, done, trainable): self.time += 1 self.buffer.add(self.prev_s, self.prev_a, r, done, s, self.prev_noise) self.prev_s = s if trainable and self.time % TRAIN_INTERVAL == 0: if len(self.buffer) > NUM_WARMUP_STEP: return self._update() def get_action(self): S, = make_batch(self.prev_s) a = self.actor(S)[0] # (A_DIM, ) noise = self.noise().astype(np.float32) self.prev_a = a self.prev_noise = noise return (a + noise).data.reshape(-1) def _update(self): S, A, R, D, S2, N = self.buffer.sample_batch( BATCH_SIZE) # (6, BATCH_SIZE) S = np.array(S, dtype=np.float32) # (BATCH_SIZE, O_DIM) S2 = np.array(S2, dtype=np.float32) A = F.stack(A) # (BATCH_SIZE, A_DIM) R = np.array(R, dtype=np.float32).reshape(-1, 1) N = np.array(N) # update critic A_ = self.target_actor(S2) Y = R + GAMMA * self.target_critic(S2, A_.data) Q_batch = self.critic(S, (A + N).data) critic_loss = F.mean_squared_error(Y.data, Q_batch) self.critic.update(critic_loss) # update actor A = self.actor(S) # why?? but essential!! Q = self.critic(S, A) actor_loss = -F.sum(Q) / BATCH_SIZE #from chainer import computational_graph as c #g = c.build_computational_graph([actor_loss]) #with open('graph_actorloss.dot', 'w') as o: # o.write(g.dump()) #exit() self.actor.update(actor_loss) # update target soft_copy_param(self.target_critic, self.critic, TAU) soft_copy_param(self.target_actor, self.actor, TAU) return actor_loss.data, critic_loss.data
def run(config):
    """MADDPG training loop with reward shaping (difference vs. global rewards).

    env.step is expected to return, per agent, a pair
    (global_reward, difference_reward); the shaped (difference) reward
    trains the policies while the global reward is stored in a second
    buffer purely for logging.
    """
    model_dir = Path('./models') / config.env_id / config.model_name
    # Pick the next free 'runN' directory.
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)
    ##################### INITIALIZE FROM SAVED? ###########################
    # NOTE(review): init_from_saved and model_path are not defined in this
    # function — they must exist as module-level globals or this raises
    # NameError; if init_from_saved is true but model_path is None, maddpg
    # is never bound.
    if init_from_saved:
        if model_path is not None:
            maddpg = MADDPG.init_from_save(model_path)
            print("Initialized from saved model")
    # -------------------------------------------------------------------- #
    else:
        maddpg = MADDPG.init_from_env(env,
                                      agent_alg=config.agent_alg,
                                      adversary_alg=config.adversary_alg,
                                      tau=config.tau,
                                      lr=config.lr,
                                      hidden_dim=config.hidden_dim)
    # used for learning (updates)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    # This is just to store the global rewards and not for updating the policies
    g_storage_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0  # total env steps across threads
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')
        # Linearly anneal exploration noise over n_exploration_eps episodes.
        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions, maddpg)
            ''' Reward Shaping using D++, D. The rewards now contain global as well as shaped rewards Keep the global for logging, and use the shaped rewards for updates '''
            # Choose which reward to use
            use_dpp = True
            # DIFFERENCE REWARDS (index 1 of each per-agent reward pair)
            d_rewards = []
            for n in range(maddpg.nagents):
                d_rewards.append([rewards[0][n][1]])
            d_rewards = [d_rewards]
            d_rewards = np.array(d_rewards)
            # GLOBAL REWARDS (index 0 of each per-agent reward pair)
            g_rewards = []
            for n in range(maddpg.nagents):
                g_rewards.append([rewards[0][n][0]])
            g_rewards = [g_rewards]
            g_rewards = np.array(g_rewards)
            if use_dpp:
                rewards = d_rewards
            else:
                rewards = g_rewards
            # ----------------------------------------------------------- #
            # Buffer used for updates
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            # push global rewards into g_replay_buffer for plotting
            g_storage_buffer.push(obs, agent_actions, g_rewards, next_obs,
                                  dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        # Take out global reward from g_storage_buffer
        ep_rews = g_storage_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' /
                        ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')
    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
def run(config):
    """AttentionSAC training loop for Google Research Football.

    *config* is a plain dict.  Observation/action sizes are hard-coded for
    this environment (115-dim observations, 19 discrete actions per agent).
    """
    model_dir = Path('./models') / config["env_id"] / config["model_name"]
    # Pick the next free 'runN' directory.
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))
    # The run number doubles as the RNG seed.
    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config["n_rollout_threads"], run_num)
    model = AttentionSAC.init_from_env(
        env,
        tau=config["tau"],
        pi_lr=config["pi_lr"],
        q_lr=config["q_lr"],
        gamma=config["gamma"],
        pol_hidden_dim=config["pol_hidden_dim"],
        critic_hidden_dim=config["critic_hidden_dim"],
        attend_heads=config["attend_heads"],
        reward_scale=config["reward_scale"])
    # (** EDITED **) Set Replay Buffer
    # Buffer dimensions are hard-coded rather than iterated from
    # env.observation_space / env.action_space: 115 observation features
    # and 19 actions per agent for the football environment.
    replay_buffer = ReplayBuffer(config["buffer_length"], model.nagents,
                                 [115 for _ in range(model.nagents)],
                                 [19 for _ in range(model.nagents)])
    t = 0  # total env steps across threads
    for ep_i in range(0, config["n_episodes"], config["n_rollout_threads"]):
        print("Episodes %i-%i of %i" %
              (ep_i + 1, ep_i + 1 + config["n_rollout_threads"],
               config["n_episodes"]))
        obs = env.reset()
        model.prep_rollouts(device='cpu')
        for et_i in range(config["episode_length"]):
            print("episode : {} | step : {}".format(ep_i, et_i), end='\r')
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config["n_rollout_threads"])]
            # Reform Actions list to fit on Football Env:
            # the env expects integer action ids, not one-hot vectors.
            actions_list = [[np.argmax(b) for b in a] for a in actions]
            # Step
            next_obs, rewards, dones, infos = env.step(actions_list)
            # Prevention of divergence: small negative reward offset;
            # without it training diverges to NaN.
            rewards = rewards - 0.000001
            # Reform Done Flag list: replicate the per-thread flag per agent
            # so it matches the replay buffer layout.
            dones = (np.array([dones for _ in range(model.nagents)])).T
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config["n_rollout_threads"]
            # Update once the buffer can fill a batch, every
            # steps_per_update env steps.
            if (len(replay_buffer) >= config["batch_size"] and
                    (t % config["steps_per_update"]) <
                    config["n_rollout_threads"]):
                if config["use_gpu"]:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config["num_updates"]):
                    sample = replay_buffer.sample(config["batch_size"],
                                                  to_gpu=config["use_gpu"])
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
        # Mean per-step reward, rescaled to a per-episode total for logging.
        ep_rews = replay_buffer.get_average_rewards(
            config["episode_length"] * config["n_rollout_threads"])
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew * config["episode_length"], ep_i)
        if ep_i % config["save_interval"] < config["n_rollout_threads"]:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' /
                       ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')
    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
torch.manual_seed(1024) np.random.seed(1024) env = make_parallel_env(env_id, n_rollout_threads, 1024, True) maddpg = MADDPG.init_from_env(env, agent_alg='MADDPG', adversary_alg='MADDPG', tau=0.01, lr=0.01, hidden_dim=64, est_ac=True, game_id='simple_speaker_listener') replay_buffer = ReplayBuffer( buffer_length, maddpg.nagents, [obsp.shape[0] for obsp in env.observation_space], [ acsp.shape[0] if isinstance(acsp, Box) else acsp.n for acsp in env.action_space ]) t = 0 #for ep_i in range(0, n_episodes, n_rollout_threads): for ep_i in range(0, 10, 1): #print("Episodes %i-%i of %i" % (ep_i + 1, ep_i + 1 + n_rollout_threads, n_episodes)) obs = env.reset() maddpg.prep_rollouts(device='cpu') explr_pct_remaining = max(0, n_exploration_eps - ep_i) / n_exploration_eps maddpg.scale_noise(final_noise_scale + (init_noise_scale - final_noise_scale) * explr_pct_remaining) maddpg.reset_noise()
def run(config):
    """Recurrent-MADDPG training loop (LSTM hidden state per agent pair).

    Each env thread carries an LSTM (h, c) state with
    nagents * (nagents - 1) channels of size hidden_dim; the replay buffer
    stores the hidden state before and after every step so updates can
    re-run the recurrence.
    """
    model_dir = Path('./models') / config.env_id / config.model_name
    # Pick the next free 'runN' directory.
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)
    # Derive policy input/output sizes from the (assumed homogeneous) spaces.
    if isinstance(env.action_space[0], Box):
        discr_act = False
        get_shape = lambda x: x.shape[0]
    else:  # Discrete
        discr_act = True
        get_shape = lambda x: x.n
    num_out_pol = get_shape(env.action_space[0])
    agent_init_params = {
        'num_in_pol': env.observation_space[0].shape[0],
        'num_out_pol': num_out_pol,
        'num_vars': len(env.agent_types)
    }
    maddpg = MADDPG(agent_init_params,
                    nagents=len(env.agent_types),
                    tau=config.tau,
                    lr=config.lr,
                    hidden_dim=config.hidden_dim,
                    discrete_action=discr_act)
    # Extra trailing argument: per-agent hidden-state width stored per step.
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ], config.hidden_dim * (maddpg.nagents - 1))
    t = 0  # total env steps across threads
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        # Linearly anneal exploration noise over n_exploration_eps episodes.
        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()
        # Fresh zero (h, c) LSTM state at the start of every episode batch.
        rnn_hidden = (torch.zeros(
            1, config.n_rollout_threads * (maddpg.nagents) *
            (maddpg.nagents - 1), config.hidden_dim),
                      torch.zeros(
                          1, config.n_rollout_threads * (maddpg.nagents) *
                          (maddpg.nagents - 1), config.hidden_dim))
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions, new_rnn_hidden = maddpg.step(torch_obs,
                                                              rnn_hidden,
                                                              explore=True)
            # Detach and reshape the pre/post-step hidden states to
            # (threads, agents, -1) for storage in the replay buffer.
            hid_to_store = (rnn_hidden[0].detach().contiguous().view(
                config.n_rollout_threads, maddpg.nagents, -1),
                            rnn_hidden[1].detach().contiguous().view(
                                config.n_rollout_threads, maddpg.nagents, -1))
            next_hid_to_store = (new_rnn_hidden[0].detach().contiguous().view(
                config.n_rollout_threads, maddpg.nagents, -1),
                                 new_rnn_hidden[1].detach().contiguous().view(
                                     config.n_rollout_threads, maddpg.nagents,
                                     -1))
            # convert actions to numpy arrays
            # NOTE(review): .cpu() on torch_agent_actions assumes
            # maddpg.step returns a tensor (not a Python list) of per-agent
            # actions — confirm against the MADDPG implementation.
            agent_actions = [
                ac.data.numpy() for ac in torch_agent_actions.cpu()
            ]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, hid_to_store, agent_actions, rewards,
                               next_obs, next_hid_to_store, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                sample = replay_buffer.sample(config.batch_size,
                                              to_gpu=USE_CUDA)
                maddpg.update(sample, ep_i)
                maddpg.update_all_targets()
            # Carry the recurrent state forward to the next env step.
            rnn_hidden = new_rnn_hidden
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)
            print("Episode %i, reward for %i is " % (ep_i + 1, a_i), a_ep_rew)
    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
def train(env):
    """Train AttentionSAC agents on a Flatland rail environment.

    *env* arrives as a dict of generator parameters (n_agents, x_dim,
    y_dim, n_cities, max_rails_between_cities, max_rails_in_city); the name
    is rebound to the constructed RailEnv below.  Logs a running mean
    reward to wandb and checkpoints the model every few episodes.

    Fixes vs. original:
      * `obs` is now advanced to `next_obs` each step (it was never
        updated, so agents always acted on the episode's first observation).
      * next-step observations are normalized from `next_obs`, not `obs`.
      * the reward augmentation indexes `agent_obs[i]` instead of the
        stale loop variable `agent` left over from the previous loop.
      * replay sampling is guarded by the buffer size so the first
        update cannot draw from a near-empty buffer.
    """
    n_agents = env["n_agents"]
    x_dim = env["x_dim"]
    y_dim = env["y_dim"]
    n_cities = env["n_cities"]
    max_rails_between_cities = env["max_rails_between_cities"]
    max_rails_in_city = env["max_rails_in_city"]
    seed = 0
    use_fast_tree_obs = False

    # Observation parameters
    observation_tree_depth = 4
    observation_radius = 10
    observation_max_path_depth = 30

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    if use_fast_tree_obs:
        tree_observation = FastTreeObs(max_depth=observation_tree_depth)
        print("Using FastTreeObs")
    else:
        tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                             predictor=predictor)
        print("Using StandardTreeObs")

    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)

    obs, info = env.reset()

    if use_fast_tree_obs:
        state_size = tree_observation.observation_dim
    else:
        # Calculate the state size given the depth of the tree observation
        # and the number of features: a depth-d quad tree has sum_i 4^i nodes.
        n_features_per_node = env.obs_builder.observation_dim
        n_nodes = 0
        for i in range(observation_tree_depth + 1):
            n_nodes += np.power(4, i)
        state_size = n_features_per_node * n_nodes

    action_size = 5
    DEVICE = 'cpu'
    # if torch.cuda.is_available():
    #     DEVICE = 'gpu'

    # Training hyper-parameters.
    buffer_length = 10000
    steps_to_save_model = 10
    step_size = 100  # progress-marker print interval
    num_steps = 100  # update every 100 steps
    avg_steps = 20  # num episodes to average before logging reward
    reward_q = []
    batch_size = 100

    agent_obs = np.array([None] * env.get_num_agents())
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
    num_episodes = 100000

    agent_init_params = []
    sa_size = []
    for i in range(n_agents):
        agent_init_params.append({
            'num_in_pol': state_size,
            'num_out_pol': action_size,
            'init_weights': 'model.pt'
        })
        sa_size.append((state_size, action_size))

    hyperparams = {
        "tau": 0.01,
        "pi_lr": 0.00001,
        "q_lr": 0.00005,
        "pol_hidden_dim": 256,
        "critic_hidden_dim": 256,
        "attend_heads": 8
    }

    model = AttentionSAC(agent_init_params=agent_init_params,
                         sa_size=sa_size,
                         tau=hyperparams["tau"],
                         pi_lr=hyperparams["pi_lr"],
                         q_lr=hyperparams["q_lr"],
                         pol_hidden_dim=hyperparams["pol_hidden_dim"],
                         critic_hidden_dim=hyperparams["critic_hidden_dim"],
                         attend_heads=hyperparams["attend_heads"])
    model.init_dict = {}

    replay_buffer = ReplayBuffer(buffer_length, n_agents,
                                 [state_size for i in range(n_agents)],
                                 [action_size for i in range(n_agents)])

    print("MAX STEPS: " + str(max_steps))
    print("NUM EPISODES: ", num_episodes)
    print("HYPERPARAMS: ")
    print(hyperparams)

    start_time = time.time()

    for ep in range(num_episodes):
        print("Episode " + str(ep) + ":", flush=True)
        obs, info = env.reset(True, True)
        model.prep_rollouts(device=DEVICE)
        reward_sum_for_this_episode = 0

        for steps in range(max_steps):
            if steps % step_size == 0:
                print("=", end="", flush=True)

            # Normalize the current observation per agent (zero vector when
            # the agent has no observation, e.g. before it spawns).
            for agent in env.get_agent_handles():
                if obs[agent] is not None:
                    if use_fast_tree_obs:
                        agent_obs[agent] = obs[agent]
                    else:
                        agent_obs[agent] = normalize_observation(
                            obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    agent_obs[agent] = np.array([0.] * state_size)

            action_dict = {}
            torch_obs = [
                Variable(torch.Tensor([agent_obs[i]]), requires_grad=False)
                for i in range(n_agents)
            ]
            torch_agent_actions = model.step(torch_obs, explore=True)
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # Sampled actions are one-hot; the index of the first non-zero
            # entry is the discrete RailEnv action (-1 if all-zero).
            for i in range(n_agents):
                dist = torch_agent_actions[i][0]
                idx = -1
                for j in range(action_size):
                    if dist[j] != 0:
                        idx = j
                        break
                action_dict[i] = idx

            next_obs, all_rewards, done, info = env.step(action_dict)

            rewards = []
            dones = []
            next_agent_obs = np.array([None] * env.get_num_agents())
            for agent in env.get_agent_handles():
                if next_obs[agent] is not None:
                    if use_fast_tree_obs:
                        next_agent_obs[agent] = next_obs[agent]
                    else:
                        # FIX: normalize the *next* observation (the original
                        # re-normalized obs[agent]).
                        next_agent_obs[agent] = normalize_observation(
                            next_obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    next_agent_obs[agent] = np.array([0.] * state_size)

            for i in range(n_agents):
                reward_sum_for_this_episode += all_rewards[i]
                rewards.append(all_rewards[i])
                # FIX: index with i (original used the stale loop variable
                # `agent`).  NOTE(review): the augmented value is computed
                # after rewards.append, so the stored reward stays raw —
                # confirm whether the shaped reward should be pushed instead.
                all_rewards[i] += augment_reward(agent_obs[i])
                dones.append(done[i])

            replay_buffer.push(np.array([agent_obs]), np.array(agent_actions),
                               np.array([rewards]),
                               np.array([next_agent_obs]), np.array([dones]))

            # FIX: advance the rollout state; the original never updated obs,
            # so every step acted on the initial observation.
            obs = next_obs

            # FIX: only sample once the buffer can fill a whole batch.
            if steps % num_steps == 0 and len(replay_buffer) >= batch_size:
                model.prep_training(device=DEVICE)
                sample = replay_buffer.sample(batch_size, norm_rews=False)
                # print(sample)
                model.update_critic(sample)
                model.update_policies(sample)
                model.update_all_targets()
                model.prep_rollouts(device=DEVICE)

        reward_sum_for_this_episode /= n_agents
        reward_q.append(reward_sum_for_this_episode)
        if len(reward_q) == avg_steps:
            wandb.log({'reward': np.mean(reward_q)})
            reward_q = []
        print()

        if ep % steps_to_save_model == 0:
            print("\nSaving model")
            model.save(os.getcwd() + "/model.pt")
            cur_time = time.time()
            time_elapsed = (cur_time - start_time) // 60
            print("Time Elapsed: " + str(time_elapsed) + "\n")
def run(config):
    """Train MADDPG agents on a parallel env built from a shape file.

    Creates a fresh ./models/<env_name>/<model_name>/runN directory,
    seeds the RNGs with the run number, rolls out episodes with annealed
    exploration noise, stores transitions in a replay buffer, and
    periodically updates the agents and checkpoints the model.

    Args:
        config: namespace of hyperparameters and paths (argparse-style).
    """
    model_dir = Path('./models') / config.env_name / config.model_name
    # Pick the next unused run number under model_dir.
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    # NOTE(review): shell `cp` is non-portable and assumes shape.txt is in
    # the CWD — shutil.copy would be safer; confirm before changing.
    os.system("cp shape.txt {}".format(run_dir))
    logger = SummaryWriter(str(log_dir))
    # Seed with the run number so each run is reproducible but distinct.
    torch.manual_seed(run_num)
    np.random.seed(run_num)
    # number of CPU threads used during training
    if not config.use_cuda:
        torch.set_num_threads(config.n_training_threads)
    # processes used for parallel environment sampling
    env = make_parallel_env(config.num_agents, config.n_rollout_threads,
                            run_num, config.shape_file)
    #'''
    maddpg = MADDPG.init_from_env(env=env,
                                  agent_alg=config.agent_alg,
                                  cripple_alg=config.cripple_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim,
                                  discrete_action=config.discrete_action)
    #'''
    #maddpg = MADDPG.init_from_save(model_dir/'run1'/'model.pt')
    # Per-agent observation/action sizes are taken from the env spaces;
    # continuous (Box) actions use the vector length, discrete use .n.
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    a_loss = []   # actor losses collected by maddpg.update
    c_loss = []   # critic losses collected by maddpg.update
    rewss = []    # per-episode mean rewards (per agent)
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads,
             config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')
        # show for the first time
        # Linearly anneal exploration noise over n_exploration_eps episodes.
        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()
        #if config.display:
        #    for env_show in env.envs:
        #        env_show.render('human', close=False)
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False)
                for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            #actions = [np.array([i.tolist().index(1.0) for i in action]) for action in actions_one_hot]
            # Scale the second action component by pi in place.
            # NOTE(review): presumably action[1] is an angle command — confirm.
            for i in actions:
                # print(i)
                for j in i:
                    j[1] *= np.pi
            #print(actions[0])
            next_obs, rewards, dones, infos = env.step(actions)
            #print(len(agent_actions),len(next_obs))
            #if config.display:
            #    for env_show in env.envs:
            #        env_show.render('human', close=False)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            # Update once the buffer holds a batch, every steps_per_update steps.
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                #print(t)
                if config.use_cuda:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=config.use_cuda,
                                                      norm_rews=True)
                        maddpg.update(sample, a_i, logger=logger,
                                      actor_loss_list=a_loss,
                                      critic_loss_list=c_loss)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        # Log per-agent mean episode rewards to tensorboard.
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        rewss.append(ep_rews)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)
            # print('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
        # Periodic checkpoint: incremental snapshot plus latest model.
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(str(run_dir / 'incremental'), exist_ok=True)
            maddpg.save(
                str(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))))
            maddpg.save(str(run_dir / 'model.pt'))
    maddpg.save(str(run_dir / 'model.pt'))
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
'''
def run(config):
    """Train MADDPG agents on a parallelized multi-agent environment.

    Sets up a fresh run directory under ./models/<env_id>/<model_name>,
    seeds torch/numpy from config.seed, rolls out episodes with annealed
    exploration noise (rendering each step), stores transitions in a
    replay buffer, and periodically updates each agent and checkpoints
    the model.

    Args:
        config: namespace of hyperparameters and paths (argparse-style).
    """
    model_dir = Path('./models') / config.env_id / config.model_name
    # Choose the next unused 'runN' directory name.
    if model_dir.exists():
        prev_ids = [int(str(child.name).split('run')[1])
                    for child in model_dir.iterdir()
                    if str(child.name).startswith('run')]
        curr_run = 'run%i' % (max(prev_ids) + 1) if prev_ids else 'run1'
    else:
        curr_run = 'run1'
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)  # intentionally fails if the run dir already exists
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)

    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)
    maddpg = MADDPG.init_from_env(env, agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau, lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    # Per-agent observation/action sizes from the env spaces; continuous
    # (Box) actions use the vector length, discrete use .n.
    obs_dims = [obsp.shape[0] for obsp in env.observation_space]
    act_dims = [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                for acsp in env.action_space]
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 obs_dims, act_dims)

    t = 0  # total environment steps taken across all rollout threads
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        # obs has shape (n_rollout_threads, nagent)(nobs); per-agent nobs
        # differ, so it is not a tensor.
        obs = env.reset()
        maddpg.prep_rollouts(device='cpu')

        # Linearly anneal exploration noise over n_exploration_eps episodes.
        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        noise_span = config.init_noise_scale - config.final_noise_scale
        maddpg.scale_noise(config.final_noise_scale +
                           noise_span * explr_pct_remaining)
        maddpg.reset_noise()

        for step_i in range(config.episode_length):
            env.render()
            # Per-agent observations stacked across rollout threads.
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, a])),
                                  requires_grad=False)
                         for a in range(maddpg.nagents)]
            # Policies act with exploration noise; convert to numpy arrays.
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            agent_actions = [act.data.numpy() for act in torch_agent_actions]
            # Regroup per-agent actions into one action list per env thread.
            actions = [[act[thread] for act in agent_actions]
                       for thread in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads

            # Update once the buffer holds a batch, every steps_per_update
            # environment steps.
            ready = (len(replay_buffer) >= config.batch_size
                     and (t % config.steps_per_update) < config.n_rollout_threads)
            if ready:
                maddpg.prep_training(device='gpu' if USE_CUDA else 'cpu')
                for _ in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')

        # Log per-agent mean rewards over the episode just collected.
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew, ep_i)

        # Periodic checkpoint: incremental snapshot plus latest model.
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' /
                        ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
def run(config):
    """Evaluate a pretrained AttentionSAC model with rendering.

    Loads weights from a fixed checkpoint and rolls out deterministic
    (explore=False) episodes in a single rendered environment. All
    training, logging, and checkpointing code is disabled (commented out
    or wrapped in a dead triple-quoted string below).

    Args:
        config: namespace of hyperparameters and paths (argparse-style).
    """
    cover_ratio = []  # would collect coverage stats; collection is disabled below
    model_dir = Path('./models') / config.env_id / config.model_name
    # Pick the next unused run number under model_dir (unused while the
    # directory-creation / logging lines below stay commented out).
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    # os.makedirs(log_dir)
    # logger = SummaryWriter(str(log_dir))
    # torch.manual_seed(run_num)
    # np.random.seed(run_num)
    #env = make_parallel_env(, config.n_rollout_threads, run_num)
    env = make_env(config.env_id,
                   benchmark=BENCHMARK,
                   discrete_action=True,
                   use_handcraft_policy=config.use_handcraft_policy)
    model = AttentionSAC.init_from_env(
        env,
        tau=config.tau,
        pi_lr=config.pi_lr,
        q_lr=config.q_lr,
        gamma=config.gamma,
        pol_hidden_dim=config.pol_hidden_dim,
        critic_hidden_dim=config.critic_hidden_dim,
        attend_heads=config.attend_heads,
        reward_scale=config.reward_scale)
    # Load pretrained weights from a hard-coded checkpoint path.
    model.init_from_save_self('./models/swift_scenario/model/run8/model.pt')
    # Buffer is constructed but never pushed to (push is commented out below).
    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    update_count = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads,
             config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1),
                         requires_grad=False)
                for i in range(model.nagents)
            ]
            # get actions as torch Variables (deterministic: explore=False)
            torch_agent_actions = model.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            agent_actions = [
                ac.data.numpy().squeeze() for ac in torch_agent_actions
            ]
            # rearrange actions to be per environment
            # actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            # agent_actions[0][5]=1
            # agent_actions[1][5]=1
            # agent_actions[2][5]=1
            next_obs, rewards, dones, infos = env.step(
                agent_actions,
                use_handcraft_policy=config.use_handcraft_policy)
            env.render()
            time.sleep(0.1)  # slow the rollout down so rendering is watchable
            # # get actions as torch Variables
            # torch_agent_actions = model.step(torch_obs, explore=True)
            # # convert actions to numpy arrays
            # agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # # rearrange actions to be per environment
            # actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            # next_obs, rewards, dones, infos = env.step(actions)
            # env.render()
            #if et_i == config.episode_length - 1:
            #print(infos)
            #print(type(infos['cover_ratio']))
            #cover_ratio.append(float(infos[0]['n'][0]['cover_ratio']))
            #print(infos)
            # replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            # The triple-quoted string below is dead code: the original
            # training/logging/saving loop, disabled for evaluation.
            '''
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    update_count += 1
                    print("episode:", ep_i, ", total steps:", t,
                          " update_count:", update_count)
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew, ep_i)
        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' /
                       ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')
            logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
    print(cover_ratio)
    '''
    env.close()