def __init__(self, discount_factor=0.95, tau=0.02):
    super(MADDPG, self).__init__()
    # critic input = obs + own action = 24 + 2 = 26
    self.maddpg_agent = [
        DDPGAgent(24, 2, (8, 16, 32), (8, 4, 2), (2, 1, 1), (32, 16, 8),  # actor settings
                  26, (8, 16, 32), (8, 4, 2), (2, 1, 1), (32, 16, 8)),    # critic settings
        DDPGAgent(24, 2, (8, 16, 32), (8, 4, 2), (2, 1, 1), (32, 16, 8),  # actor settings
                  26, (8, 16, 32), (8, 4, 2), (2, 1, 1), (32, 16, 8))     # critic settings
    ]
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
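# The constructor above stores tau but does not show how it is consumed. A
# minimal soft-update sketch (an assumption, not the author's code): each
# target network is nudged toward its local network by a factor of tau,
# which is the standard Polyak-averaging step in DDPG/MADDPG.
def soft_update(target_network, source_network, tau):
    """Polyak-average source parameters into target parameters."""
    for target_param, source_param in zip(target_network.parameters(),
                                          source_network.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)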
def learn(self):
    agent = DDPGAgent(env=self.env,
                      replay_memory_size=REPLAY_MEMORY_SIZE,
                      learning_rate=LEARNING_RATE,
                      batch_size=MINIBATCH_SIZE,
                      gamma=GAMMA,
                      tau=TAU)
    stats = {'scores': [], 'avg': [], 'min': [], 'max': []}  # only 'scores' is populated below
    for ep in tqdm(range(1, self.episodes + 1), ascii=True, unit='episodes'):
        print(self.epsilon)
        action_stats = [0, 0]  # [exploitation, exploration] counts
        current_state = self.env.reset()
        current_state = self.convert_gray(current_state)
        done = False
        score = 0
        steps = 0
        while not done:
            steps += 1
            # epsilon-greedy action selection
            if np.random.random() > self.epsilon:
                action_stats[0] += 1
                action = agent.get_action(current_state)
            else:
                action_stats[1] += 1
                action = self.env.action_space.sample()
                action[2] = min(action[2], 0.2)  # cap the brake
                action[1] = action[1] * 2        # boost the throttle
            new_state, reward, done, _ = self.env.step(action)
            if ep % self.results_every_n_episodes == 0:
                self.env.render()
            score += reward
            new_state = self.convert_gray(new_state)
            agent.memory.push(current_state, action, reward, new_state)
            if steps % 64 == 0:
                agent.update()
            current_state = new_state
            if self.epsilon > 0.1:
                self.epsilon -= self.epsilon_decay_value
            if score < 0:
                break
        print(action_stats)
        print(score)
        stats['scores'].append(score)
    self.env.close()
    return agent.actor
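# convert_gray is referenced above but not defined in this snippet. A minimal
# sketch of what such a preprocessor typically does for RGB frames, assuming
# OpenCV is available (a hypothetical helper, not the author's actual code):
import cv2
import numpy as np

def convert_gray(frame):
    """Convert an RGB frame to a normalized single-channel grayscale image."""
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    return gray.astype(np.float32) / 255.0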
def __init__(self, seed, discount_factor=0.95, tau=0.02):
    super(MADDPG, self).__init__()
    # critic input = obs_full + actions = 24*2 + 2 + 2 = 52
    self.maddpg_agent = [DDPGAgent(24, 2, seed),
                         DDPGAgent(24, 2, seed)]
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
def __init__(self, discount_factor=0.99, tau=0.001):
    super(MADDPG, self).__init__()
    # critic input = obs + own action = 24 + 2 = 26
    self.maddpg_agent = [DDPGAgent(24, 256, 128, 2, 26, 256, 128),
                         DDPGAgent(24, 256, 128, 2, 26, 256, 128)]
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
def __init__(self, discount_factor=0.95, tau=0.02):
    super(MADDPG, self).__init__()
    # critic input = obs_full + actions = 24*2 + 2 + 2 = 52
    self.maddpg_agent = [
        DDPGAgent(24, 256, 256, 2, 52, 256, 256),
        DDPGAgent(24, 256, 256, 2, 52, 256, 256)
    ]
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
def test(device, args):
    env = create_env(args.env, args)
    ram = MemoryBuffer(1)
    player = DDPGAgent(env.observation_space, env.action_space, ram, None, device, args)
    if args.model_dir is not None:
        player.load_models(args.model_dir, test=True)
    steps_done = 0
    count_eps = 0
    count_success = 0
    while True:
        episode_rewards = []
        episode_lengths = []
        for _ep in range(args.eval_eps):  # run exactly eval_eps evaluation episodes
            if args.ar:
                env.seed(True)
            observation = env.reset()
            total_reward = 0
            episode_action = []
            for steps in range(1000):
                if 'img' in args.obs:
                    state = np.expand_dims(observation, axis=0)
                else:
                    state = np.float32(observation)
                action, action_rescale = player.get_exploitation_action(state)
                episode_action.append(action)
                new_observation, reward, done, info = env.step(action_rescale)
                observation = new_observation
                total_reward += reward
                steps_done += 1
                if args.render:
                    env.render()
                if done:
                    episode_rewards.append(total_reward)
                    count_eps += 1
                    episode_lengths.append(steps)
                    if reward > 1:
                        count_success += 1.0
                    break
        # clear any lingering references before the next evaluation round
        gc.collect()
        reward_ave = np.array(episode_rewards).mean()
        length_ave = np.array(episode_lengths).mean()
        print('Test, episode %d, steps: %d, Success_rate: %.3f, ave_reward: %.3f, ave_length: %.3f'
              % (count_eps, steps_done, count_success / count_eps, reward_ave, length_ave))
    env.close()
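# A hedged usage sketch for test(): the fields below mirror the attributes the
# function actually reads (args.env, args.model_dir, args.eval_eps, args.ar,
# args.obs, args.render); the concrete values are illustrative assumptions.
from types import SimpleNamespace
import torch

args = SimpleNamespace(env='Reacher-v2', model_dir='./checkpoints',
                       eval_eps=10, ar=False, obs='state', render=False)
test(torch.device('cuda' if torch.cuda.is_available() else 'cpu'), args)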
def __init__(self, discount_factor=0.95, tau=0.02):
    super(MADDPG, self).__init__()
    # critic input = obs_full + actions = 8 + 2 = 10
    self.maddpg_agent = [
        DDPGAgent(8, 16, 8, 2, 10, 32, 16),
        DDPGAgent(8, 16, 8, 2, 10, 32, 16)
    ]
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
def __init__(self, discount_factor=0.95, tau=0.02):
    super(MADDPG, self).__init__()
    # Each agent has its own actor and critic network.
    # Each actor gets only its own agent's state,
    # but every critic receives the same full input.
    # critic input = obs_full + actions = 14+2+2+2=20
    self.maddpg_agent = [DDPGAgent(14, 16, 8, 2, 20, 32, 16),
                         DDPGAgent(14, 16, 8, 2, 20, 32, 16),
                         DDPGAgent(14, 16, 8, 2, 20, 32, 16)]
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
def __init__(self, discount_factor=0.95, tau=0.02):
    super(MADDPG, self).__init__()
    # critic input = obs_full + actions = 14+2+2+2=20
    # in_actor=14, hidden_in_actor=16, hidden_out_actor=8, out_actor=2,
    # in_critic=20, hidden_in_critic=32, hidden_out_critic=16
    self.maddpg_agent = [DDPGAgent(14, 16, 8, 2, 20, 32, 16),
                         DDPGAgent(14, 16, 8, 2, 20, 32, 16),
                         DDPGAgent(14, 16, 8, 2, 20, 32, 16)]
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
def __init__(self, discount_factor=0.95, tau=0.02):
    super().__init__()
    # critic input = obs_full + actions = 14+2+2+2=20
    self.maddpg_agent = [
        DDPGAgent(14, 16, 8, 2, 20, 32, 16),
        DDPGAgent(14, 16, 8, 2, 20, 32, 16),
        DDPGAgent(14, 16, 8, 2, 20, 32, 16)
    ]
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
def __init__(self, discount_factor=0.95, tau=0.01):
    super(MADDPG, self).__init__()
    # args = in_actor, hidden_in_actor, hidden_out_actor, out_actor,
    #        in_critic, hidden_in_critic, hidden_out_critic
    # critic input = obs_full + actions = 2*24+2+2=52
    self.maddpg_agent = [
        DDPGAgent(24, 400, 300, 2, 52, 400, 300),
        DDPGAgent(24, 400, 300, 2, 52, 400, 300)
    ]
    # DDPGAgent(24, 16, 8, 2, 52, 32, 16)
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
def __init__(self, discount_factor=0.95, tau=0.02):
    super(MADDPG, self).__init__()
    # critic input = obs_full + actions = 24 + 24 + 2 + 2 = 52
    # in_actor=24, hidden_in_actor=256, hidden_out_actor=128, out_actor=2,
    # in_critic=52, hidden_in_critic=256, hidden_out_critic=128
    self.maddpg_agent = [
        DDPGAgent(24, 256, 128, 2, 52, 256, 128),
        DDPGAgent(24, 256, 128, 2, 52, 256, 128)
    ]
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor,
             in_critic, hidden_in_critic, hidden_out_critic,
             lr_actor=1.0e-4, lr_critic=1.0e-5,
             discount_factor=0.99, tau=1.0e-2):
    super(MADDPG, self).__init__()
    # critic input = obs_full + actions (e.g. 14+2+2+2=20 for three agents)
    agent1 = DDPGAgent(in_actor, hidden_in_actor, hidden_out_actor, out_actor,
                       in_critic, hidden_in_critic, hidden_out_critic,
                       lr_actor, lr_critic)
    agent2 = DDPGAgent(in_actor, hidden_in_actor, hidden_out_actor, out_actor,
                       in_critic, hidden_in_critic, hidden_out_critic,
                       lr_actor, lr_critic)
    self.maddpg_agent = [agent1, agent2]
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
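# Example instantiation (an assumption, not from the source): for three agents
# with 14-dim observations and 2-dim actions, the critic input is the full
# observation plus every agent's action, i.e. 14 + 3 * 2 = 20.
maddpg = MADDPG(in_actor=14, hidden_in_actor=16, hidden_out_actor=8,
                out_actor=2, in_critic=14 + 3 * 2, hidden_in_critic=32,
                hidden_out_critic=16)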
def __init__(self, discount_factor=0.99, tau=0.01, random_seed=0):
    super(MADDPG, self).__init__()
    self.maddpg_agent = [
        DDPGAgent(24, 52, 2, random_seed),
        DDPGAgent(24, 52, 2, random_seed)
    ]
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
    # Replay memory
    self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
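# The replay buffer and t_step above suggest the usual "update every N steps"
# pattern; a minimal sketch of such a step() method (an assumption; self.learn
# and the buffer API are hypothetical, not shown in the original):
def step(self, states, actions, rewards, next_states, dones):
    self.memory.add(states, actions, rewards, next_states, dones)
    self.t_step = (self.t_step + 1) % UPDATE_EVERY
    if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
        self.learn(self.memory.sample())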
def __init__(self, actor_layer_sizes=[24, 128, 128, 2],
             critic_layer_sizes=[52, 128, 128, 1],
             discount_factor=0.95, tau=0.02, logger=None,
             lr_actor=1e-3, lr_critic=1e-3, gradient_clipping=None,
             clamp_actions=True, log_layers=False, log_weights=False,
             log_losses=True):
    super(MADDPG, self).__init__()
    # INITIALIZE EACH AGENT AS A DDPG MODEL
    self.agents = [
        DDPGAgent(actor_layer_sizes=actor_layer_sizes,
                  critic_layer_sizes=critic_layer_sizes,
                  lr_actor=lr_actor, lr_critic=lr_critic,
                  clamp_actions=clamp_actions,
                  logger=logger, log_layers=log_layers),
        DDPGAgent(actor_layer_sizes=actor_layer_sizes,
                  critic_layer_sizes=critic_layer_sizes,
                  lr_actor=lr_actor, lr_critic=lr_critic,
                  clamp_actions=clamp_actions,
                  logger=logger, log_layers=log_layers),
    ]
    self.discount_factor = discount_factor      # for discounted returns
    self.tau = tau                              # soft-update factor
    self.iter = 0                               # iterations so far
    self.gradient_clipping = gradient_clipping  # upper limit on gradients
    self.logger = logger                        # TensorBoard logger object
    self.log_weights = log_weights              # monitor weights in TensorBoard?
def __init__(self, num_agents, num_states, num_actions,
             discount_factor=0.99, tau=1e-3):
    super(MADDPG, self).__init__()
    self.maddpg_agent = [
        DDPGAgent(num_states, num_actions, num_states * 2),
        DDPGAgent(num_states, num_actions, num_states * 2)
    ]
    self.num_agents = num_agents
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
def __init__(self, num_agents, x_dim, o_dim, a_dim, lr_actor=1e-3,
             lr_critic=1e-3, batch_size=16, gamma=0.99, tau=0.001,
             buffer_size=int(1e5), seed=1234):
    self.num_agents = num_agents
    self.x_dim = x_dim
    self.o_dim = o_dim
    self.a_dim = a_dim
    self.lr_actor = lr_actor
    self.lr_critic = lr_critic
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau
    self.buffer_size = buffer_size
    self.seed = seed
    self.buffer = ReplayBuffer(buffer_size, batch_size, seed)
    self.agents = [DDPGAgent(num_agents, agent_id, x_dim, o_dim, a_dim,
                             lr_actor, lr_critic, gamma, seed)
                   for agent_id in range(num_agents)]
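# A hedged construction example: for the two-agent Tennis-style setup seen in
# the other snippets, o_dim=24 per-agent observations, a_dim=2 actions, and
# x_dim=48 for the concatenated full state (the values are assumptions).
maddpg = MADDPG(num_agents=2, x_dim=48, o_dim=24, a_dim=2)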
def __init__(self, state_size, action_size, num_agents, random_seed):
    self.agents = [
        DDPGAgent(state_size, action_size, random_seed)
        for _ in range(num_agents)
    ]
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                               device, random_seed)
    self.t_step = 0
def run(self):
    ### create TORCS environment
    env = TorcsEnv(vision=False, throttle=True)
    ### start run according to supplied arguments
    if self.algorithm == "dqn" and self.modus == "train":
        agent = DQNAgent(env, self.track, self.numOfEpisodes)
        agent.trainAgent()
    elif self.algorithm == "dqn" and self.modus == "test":
        agent = DQNAgent(env, self.track, self.numOfEpisodes)
        agent.testAgent()
    elif self.algorithm == "ddpg" and self.modus == "train":
        agent = DDPGAgent(env, self.track, self.numOfEpisodes)
        agent.trainAgent()
    elif self.algorithm == "ddpg" and self.modus == "test":
        agent = DDPGAgent(env, self.track, self.numOfEpisodes)
        agent.testAgent()
def __init__(self, cfg: Config, discount_factor=0.95, tau=0.02,
             checkpoint_path: Optional[str] = None):
    self.logger = logging.getLogger(__name__)
    self.maddpg_agent = [
        DDPGAgent(in_actor=24, hidden_in_actor=cfg.actor_hidden[0],
                  hidden_out_actor=cfg.actor_hidden[1], out_actor=2,
                  in_critic=52, hidden_in_critic=cfg.critic_hidden[0],
                  hidden_out_critic=cfg.critic_hidden[1],
                  lr_actor=cfg.actor_lr, lr_critic=cfg.critic_lr,
                  noise_dist=cfg.noise_distribution),
        DDPGAgent(in_actor=24, hidden_in_actor=cfg.actor_hidden[0],
                  hidden_out_actor=cfg.actor_hidden[1], out_actor=2,
                  in_critic=52, hidden_in_critic=cfg.critic_hidden[0],
                  hidden_out_critic=cfg.critic_hidden[1],
                  lr_actor=cfg.actor_lr, lr_critic=cfg.critic_lr,
                  noise_dist=cfg.noise_distribution)
    ]
    if checkpoint_path:
        checkpoint = torch.load(checkpoint_path)
        for i, agent in enumerate(self.maddpg_agent):
            agent.actor.load_state_dict(checkpoint[i]['actor_params'])
            agent.target_actor.load_state_dict(checkpoint[i]['actor_params'])
            agent.critic.load_state_dict(checkpoint[i]['critic_params'])
            agent.target_critic.load_state_dict(checkpoint[i]['critic_params'])
            # agent.actor_optimizer.load_state_dict(checkpoint[i]['actor_optim_params'])
            # agent.critic_optimizer.load_state_dict(checkpoint[i]['critic_optim_params'])
    self.tau = tau
    self.discount_factor = discount_factor
    self.iter = 0
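# The cfg fields read above (actor_hidden, critic_hidden, actor_lr, critic_lr,
# noise_distribution) suggest a shape like this hedged sketch; the default
# values are illustrative assumptions, not the author's settings.
from dataclasses import dataclass
from typing import Tuple

@dataclass
class Config:
    actor_hidden: Tuple[int, int] = (256, 128)
    critic_hidden: Tuple[int, int] = (256, 128)
    actor_lr: float = 1e-3
    critic_lr: float = 1e-3
    noise_distribution: str = 'normal'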
def __init__(self, discount_factor=0.95, tau=0.02):
    super(MADDPG, self).__init__()
    # critic input = obs_full + actions = 14+2+2+2=20
    # self.maddpg_agent = [DDPGAgent(14, 16, 8, 2, 20, 32, 16),
    #                      DDPGAgent(14, 16, 8, 2, 20, 32, 16),
    #                      DDPGAgent(14, 16, 8, 2, 20, 32, 16)]
    # DDPGAgent parameters are:
    #   in_actor, hidden_in_actor, hidden_out_actor, out_actor,
    #   in_critic, hidden_in_critic, hidden_out_critic,
    #   lr_actor=1.0e-2, lr_critic=1.0e-2
    self.maddpg_agent = [
        DDPGAgent(14, 128, 128, 2, 20, 128, 128, lr_actor=5.0e-3, lr_critic=5.0e-3),
        DDPGAgent(14, 128, 128, 2, 20, 128, 128, lr_actor=5.0e-3, lr_critic=5.0e-3),
        DDPGAgent(14, 128, 128, 2, 20, 128, 128, lr_actor=5.0e-3, lr_critic=5.0e-3)
    ]
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
def __init__(self, env, state_dim: int, action_dim: int, config: Dict,
             device=None, writer=None):
    self.logger = logging.getLogger("MADDPG")
    self.device = device if device is not None else DEVICE
    self.writer = writer
    self.env = env
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.agents_number = config['agents_number']

    hidden_layers = config.get('hidden_layers', (400, 300))
    noise_scale = config.get('noise_scale', 0.2)
    noise_sigma = config.get('noise_sigma', 0.1)
    actor_lr = config.get('actor_lr', 1e-3)
    actor_lr_decay = config.get('actor_lr_decay', 0)
    critic_lr = config.get('critic_lr', 1e-3)
    critic_lr_decay = config.get('critic_lr_decay', 0)
    self.actor_tau = config.get('actor_tau', 0.002)
    self.critic_tau = config.get('critic_tau', 0.002)

    def create_agent():
        return DDPGAgent(state_dim, action_dim, agents=self.agents_number,
                         hidden_layers=hidden_layers,
                         actor_lr=actor_lr, actor_lr_decay=actor_lr_decay,
                         critic_lr=critic_lr, critic_lr_decay=critic_lr_decay,
                         noise_scale=noise_scale, noise_sigma=noise_sigma,
                         device=self.device)

    self.agents = [create_agent() for _ in range(self.agents_number)]

    self.discount = config.get('discount', 0.99)
    self.gradient_clip = config.get('gradient_clip', 1.0)
    self.warm_up = config.get('warm_up', 1e3)
    self.buffer_size = config.get('buffer_size', int(1e6))
    self.batch_size = config.get('batch_size', 128)
    self.p_batch_size = config.get('p_batch_size', int(self.batch_size // 2))
    self.n_batch_size = config.get('n_batch_size', int(self.batch_size // 4))
    self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)
    self.update_every_iterations = config.get('update_every_iterations', 2)
    self.number_updates = config.get('number_updates', 2)
    self.reset()
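# A hedged example config for the constructor above: every key appears in the
# config reads in that __init__, and the values simply restate its defaults.
config = {
    'agents_number': 2,
    'hidden_layers': (400, 300),
    'actor_lr': 1e-3,
    'critic_lr': 1e-3,
    'discount': 0.99,
    'gradient_clip': 1.0,
    'warm_up': 1e3,
    'buffer_size': int(1e6),
    'batch_size': 128,
    'update_every_iterations': 2,
    'number_updates': 2,
}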
def __init__(self, state_size, obs_size, action_size, num_agents):
    super(MADDPG, self).__init__()
    self.maddpg_agent = [
        DDPGAgent(state_size, obs_size, action_size, num_agents)
        for _ in range(num_agents)
    ]
    self.discount_factor = DISCOUNT_FACTOR
    self.tau = TAU
    self.iter = 0
def __init__(self, state_size, action_size, num_agents, batchsize,
             discount_factor=0.95, tau=0.02):
    super(MADDPG, self).__init__()
    self.maddpg_agent = [
        DDPGAgent(state_size, action_size, num_agents),
        DDPGAgent(state_size, action_size, num_agents)
    ]
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
    self.num_agents = num_agents
    self.batchsize = batchsize
def __init__(self, config):
    self.config = config
    self.gamma = config.gamma
    self.memory = config.memory()
    self.batch_size = config.batch_size
    self.update_every = config.update_every
    self.num_updates = config.num_updates
    self.t_step = 0
    self.maddpg_agents = [DDPGAgent(config) for _ in range(config.num_agents)]
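# Note that config.memory is called as a factory (config.memory()), so it is
# expected to be a zero-argument callable that builds the replay buffer. A
# hedged sketch of a matching config object (attribute names are taken from
# the reads above; the values and ReplayBuffer signature are assumptions):
from types import SimpleNamespace

config = SimpleNamespace(gamma=0.99,
                         memory=lambda: ReplayBuffer(int(1e6), 128),
                         batch_size=128, update_every=2, num_updates=1,
                         num_agents=2)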
def __init__(self, discount_factor, tau, batch_size):
    super(MADDPG, self).__init__()
    self.maddpg_agent = [
        DDPGAgent(24, 128, 128, 2, 52, 64, 64) for _ in range(2)
    ]
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
    self.batch_size = batch_size
def __init__(self):
    """Initialize a MADDPG Agent object."""
    super(MADDPGAgent, self).__init__()
    self.config = Config.getInstance()
    self.action_num = self.config.action_size * self.config.num_agents
    self.t_step = 0
    self.maddpg_agent = [DDPGAgent() for _ in range(self.config.num_agents)]
    self.memory = ReplayBuffer()
def __init__(self, episodes_before_train, batch_size, replay_buffer,
             discount_factor=0.95, tau=0.02):
    super(MADDPG, self).__init__()
    # critic input = obs_full + actions = (24+24) + (2+2) = 48 + 4 = 52
    self.maddpg_agent = [
        DDPGAgent(24, 2, 400, 300, 48, 4, 400, 300),
        DDPGAgent(24, 2, 400, 300, 48, 4, 400, 300)
    ]
    self.num_agents = 2
    self.action_size = 2
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
    self.episodes_before_train = episodes_before_train
    self.batch_size = batch_size
    self.buffer = replay_buffer
def __init__(self, state_size, action_size, discount_factor=0.95, tau=0.05,
             lr_actor=2e-4, lr_critic=2e-3, num_agents=2):
    super(MADDPG, self).__init__()
    hidden_in_dim = 512
    hidden_out_dim = 256
    # critic input = obs_full + actions = 48+2+2=52
    # hidden layer sizes may still need tuning
    # num_agents defaults to 2 because there are only two players
    self.maddpg_agent = [
        DDPGAgent(state_size, action_size, hidden_in_dim, hidden_out_dim,
                  num_agents=num_agents, lr_actor=lr_actor, lr_critic=lr_critic),
        DDPGAgent(state_size, action_size, hidden_in_dim, hidden_out_dim,
                  num_agents=num_agents, lr_actor=lr_actor, lr_critic=lr_critic)
    ]
    self.num_agents = num_agents
    self.action_vector = 2
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
def train_ddpg():
    args = DDPGArgs()
    env = gym.make(args.env_name)
    agent = DDPGAgent(env, DDPGQNet, DDPGActor, SimpleNormalizer, args)
    for ep in range(args.max_ep):
        agent.train_one_episode()
        if ep % args.test_interval == 0:
            agent.test_model()
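# train_ddpg() reads args.env_name, args.max_ep and args.test_interval; a
# hedged sketch of what DDPGArgs might hold (the field values are
# illustrative assumptions, not the author's defaults):
class DDPGArgs:
    env_name = 'Pendulum-v1'
    max_ep = 1000
    test_interval = 10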