Example #1
    def __init__(self,
                 in_actor,
                 out_actor,
                 in_critic,  # e.g. = n_agent * (state_size + action_size)
                 lr_actor=1e-4,
                 lr_critic=1e-3,  # the critic should learn faster than the actor
                 random_seed=2):
        self.state_size = in_actor
        self.action_size = out_actor
        self.seed = random.seed(random_seed)

        self.params = {"lr_actor": lr_actor,
                       "lr_critic": lr_critic,
                       "optimizer": "adam"}

        self.local_actor = Actor(in_shape=in_actor, out_shape=out_actor).to(device)
        self.target_actor = Actor(in_shape=in_actor, out_shape=out_actor).to(device)
        self.actor_optimizer = optim.Adam(self.local_actor.parameters(), lr=lr_actor)

        # for a single agent, the critic takes global observations as input and outputs the action-value Q
        # e.g. global_states = all_states + all_actions
        self.local_critic = Critic(in_shape=in_critic).to(device)
        self.target_critic = Critic(in_shape=in_critic).to(device)
        self.critic_optimizer = optim.Adam(self.local_critic.parameters(), lr=lr_critic)

        # Q: should local/target networks start with the same weights? they get synchronized after the first copy anyway
        # A: better to hard-copy at the beginning
        hard_update_A_from_B(self.target_actor, self.local_actor)
        hard_update_A_from_B(self.target_critic, self.local_critic)

        # Noise process
        self.noise = OUNoise(out_actor, scale=1.0)
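The helper hard_update_A_from_B is not shown in this example; below is a minimal sketch of what such a hard copy typically looks like (the name and argument order follow the calls above, everything else is an assumption):

import torch.nn as nn

def hard_update_A_from_B(A: nn.Module, B: nn.Module) -> None:
    # Copy every parameter of network B into network A (equivalent to a soft update with tau = 1).
    for a_param, b_param in zip(A.parameters(), B.parameters()):
        a_param.data.copy_(b_param.data)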
Example #2
    def __init__(self, actor_size, action_size, critic_size):
        super().__init__()
        gpu = torch.cuda.is_available()
        if (gpu):
            print('GPU/CUDA works! Happy fast training :)')
            torch.cuda.current_device()
            torch.cuda.empty_cache()
            self.device = torch.device("cuda")
        else:
            print('training on cpu...')
            self.device = torch.device("cpu")

        self.actor = Actor(actor_size, action_size).to(self.device)
        self.actor_target = Actor(actor_size, action_size).to(self.device)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=0.0001)
        self.critic = Critic(critic_size).to(self.device)
        self.critic_target = Critic(critic_size).to(self.device)
        self.critic_optim = optim.Adam(self.critic.parameters(),
                                       lr=0.001,
                                       weight_decay=0)
        self.gamma = 0.95  #0.99
        self.tau = 0.001
        self.noise = OUNoise((action_size), 2)
        self.target_network_update(self.actor_target, self.actor, 1.0)
        self.target_network_update(self.critic_target, self.critic, 1.0)
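target_network_update is called here with tau = 1.0, which amounts to a hard copy; a sketch of the usual Polyak soft update such a method might implement (signature inferred from the calls above, written as a free function):

import torch.nn as nn

def target_network_update(target: nn.Module, source: nn.Module, tau: float) -> None:
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)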
Example #3
    def __init__(self, sess, dimo, dimu, u_bound, critic_lr, actor_lr,
                 critic_l2, clip_norm, tau, layer_norm, noisy_layer, gamma,
                 memory_size, exploration, batch_size, env_dt):
        self._sess = sess

        self._dimo = dimo
        self._dimu = dimu
        self._critic_l2 = critic_l2
        self._actor_lr = actor_lr
        self._critic_lr = critic_lr
        self._clip_norm = clip_norm

        self._noisy = noisy_layer
        self._gamma = gamma
        self._tau = tau
        self._batch_size = batch_size
        self._u_bound = u_bound

        self._global_step = tf.train.get_or_create_global_step()

        self.ou_noise = OUNoise(dim=dimu,
                                n_step_annealing=exploration,
                                dt=env_dt)
        self._memory = ReplayMemory(memory_size)

        with tf.variable_scope('inputs'):
            self._obs = tf.placeholder(tf.float32, [None, self._dimo],
                                       name='state')
            self._u = tf.placeholder(tf.float32, [None, self._dimu],
                                     name='action')
            self._t_obs = tf.placeholder(tf.float32, [None, self._dimo],
                                         name='target_state')

        with tf.variable_scope('actor'):
            self._actor = Actor('main', self._obs, dimu, layer_norm,
                                noisy_layer)
            self._target_actor = Actor('target', self._t_obs, dimu, layer_norm,
                                       noisy_layer)

        with tf.variable_scope('critic'):
            self._critic = Critic('main', self._obs, self._u, layer_norm,
                                  noisy_layer)
            self._critic_pi = Critic('main',
                                     self._obs,
                                     U.scaling(self._actor.pi, -1.0, 1.0,
                                               self._u_bound['low'],
                                               self._u_bound['high']),
                                     layer_norm,
                                     noisy_layer,
                                     reuse=True)
            self._target_critic = Critic(
                'target', self._t_obs,
                U.scaling(self._target_actor.pi, -1.0, 1.0,
                          self._u_bound['low'], self._u_bound['high']),
                layer_norm, noisy_layer)

        self._build_train_method()
        self._update_target_op = self._update_target_networks()
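U.scaling is used above to map the actor's output (assumed to lie in [-1, 1]) onto the environment's action bounds; a hedged sketch of such a linear rescaling (the real utility in this project may differ):

def scaling(x, old_min, old_max, new_min, new_max):
    # Linearly map x from [old_min, old_max] to [new_min, new_max]; works on scalars, numpy arrays and TF tensors.
    return new_min + (x - old_min) * (new_max - new_min) / (old_max - old_min)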
Example #4
    def __init__(self, state_size: int, action_size: int, num_agents: int,
                 epsilon, random_seed: int):
        """ Initialize a DDPG Agent Object

        :param state_size: dimension of state (input)
        :param action_size: dimension of action (output)
        :param num_agents: number of concurrent agents in the environment
        :param epsilon: initial value of epsilon for exploration
        :param random_seed: random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.t_step = 0

        # Hyperparameters
        self.buffer_size = 1000000
        self.batch_size = 128
        self.update_every = 10
        self.num_updates = 10
        self.gamma = 0.99
        self.tau = 0.001
        self.lr_actor = 0.0001
        self.lr_critic = 0.001
        self.weight_decay = 0
        self.epsilon = epsilon
        self.epsilon_decay = 0.97
        self.epsilon_min = 0.005

        # Networks (Actor: State -> Action, Critic: (State,Action) -> Value)
        self.actor_local = Actor(self.state_size, self.action_size,
                                 random_seed).to(self.device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  random_seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)
        self.critic_local = Critic(self.state_size, self.action_size,
                                   random_seed).to(self.device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    random_seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay)
        # Initialize actor and critic networks to start with same parameters
        self.soft_update(self.actor_local, self.actor_target, tau=1)
        self.soft_update(self.critic_local, self.critic_target, tau=1)

        # Noise Setup
        self.noise = OUNoise(self.action_size, random_seed)

        # Replay Buffer Setup
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
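Nearly every example here draws exploration noise from an OUNoise class whose definition is not shown. Below is a common Ornstein-Uhlenbeck implementation matching the OUNoise(action_size, seed) pattern used above (constructor signatures vary between examples; the mu, theta and sigma defaults are assumptions):

import copy
import random
import numpy as np

class OUNoise:
    # Ornstein-Uhlenbeck process: temporally correlated noise often used for DDPG exploration.
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        # Reset the internal state to the long-running mean.
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); the state performs a mean-reverting random walk.
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state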
Example #5
    def __init__(self, state_dim, action_dim, action_lim, update_type='soft',
                lr_actor=1e-4, lr_critic=1e-3, tau=1e-3,
                mem_size=1e6, batch_size=256, gamma=0.99,
                other_cars=False, ego_dim=None):
        self.device = torch.device("cuda:0" if torch.cuda.is_available()
                                        else "cpu")

        self.joint_model = False
        if len(state_dim) == 3:
            self.model = ActorCriticCNN(state_dim, action_dim, action_lim)
            self.model_optim = optim.Adam(self.model.parameters(), lr=lr_actor)

            self.target_model = ActorCriticCNN(state_dim, action_dim, action_lim)
            self.target_model.load_state_dict(self.model.state_dict())

            self.model.to(self.device)
            self.target_model.to(self.device)

            self.joint_model = True
        else:
            self.actor = Actor(state_dim, action_dim, action_lim, other_cars=other_cars, ego_dim=ego_dim)
            self.actor_optim = optim.Adam(self.actor.parameters(), lr=lr_actor)
            self.target_actor = Actor(state_dim, action_dim, action_lim, other_cars=other_cars, ego_dim=ego_dim)
            self.target_actor.load_state_dict(self.actor.state_dict())
            self.target_actor.eval()

            self.critic = Critic(state_dim, action_dim, other_cars=other_cars, ego_dim=ego_dim)
            self.critic_optim = optim.Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1e-2)
            self.target_critic = Critic(state_dim, action_dim, other_cars=other_cars, ego_dim=ego_dim)
            self.target_critic.load_state_dict(self.critic.state_dict())
            self.target_critic.eval()

            self.actor.to(self.device)
            self.target_actor.to(self.device)
            self.critic.to(self.device)
            self.target_critic.to(self.device)

        self.action_lim = action_lim
        self.tau = tau # hard update if tau is None
        self.update_type = update_type
        self.batch_size = batch_size
        self.gamma = gamma

        if self.joint_model:
            mem_size = mem_size//100
        self.memory = Memory(int(mem_size), action_dim, state_dim)

        mu = np.zeros(action_dim)
        sigma = np.array([0.5, 0.05])
        self.noise = OrnsteinUhlenbeckActionNoise(mu, sigma)
        self.target_noise = OrnsteinUhlenbeckActionNoise(mu, sigma)

        self.initialised = True
        self.training = False
Example #6
    def __init__(
            self,
            state_size,
            action_size,
            sample_batch_size,
            memory_size=int(1e5),  # replay buffer size
            batch_size=128,  # minibatch size
            gamma=0.99,  # discount factor
            tau=1e-3,  # for soft update of target parameters
            update_every=10,
            lr_actor=1e-4,
            lr_critic=1e-3,
            random_seed=2):
        self.sample_batch_size = sample_batch_size
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        self.params = {
            "lr_actor": lr_actor,
            "lr_critic": lr_critic,
            "gamma": gamma,
            "tau": tau,
            "memory_size": memory_size,
            "batch_size": batch_size,
            "optimizer": "adam"
        }

        self.actor_local = Actor(state_size, action_size,
                                 seed=random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  seed=random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        self.critic_local = Critic(state_size, action_size,
                                   seed=random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    seed=random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic)

        self.memory = ReplayBuffer(action_size, memory_size, batch_size,
                                   random_seed)

        # Noise process
        self.noise = OUNoise([sample_batch_size, action_size], random_seed)

        self.learn_steps = 0
        self.update_every = update_every
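The ReplayBuffer used above (and in several other examples) is also not shown; here is a sketch of a uniform-sampling buffer following the ReplayBuffer(action_size, memory_size, batch_size, seed) call in this example (field names and the tensor conversion details are assumptions):

import random
from collections import deque, namedtuple
import numpy as np
import torch

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    # Fixed-size buffer of transitions, sampled uniformly at random.
    def __init__(self, action_size, buffer_size, batch_size, seed, device="cpu"):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)
        as_tensor = lambda xs: torch.as_tensor(np.asarray(xs, dtype=np.float32), device=self.device)
        states, actions, rewards, next_states, dones = map(as_tensor, zip(*batch))
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)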
Example #7
File: agents.py Project: zshchou/keras-a3c
    def __init__(self, env_name: str, threads: int, episodes: int,
                 entropy_weight: float, learning_rate: Union[
                     float,
                     tf.keras.optimizers.schedules.LearningRateSchedule],
                 discount_factor: float):

        self.env_name = env_name
        env = gym.make(env_name)

        self.save_dir = os.path.expanduser('~/keras-a3c/models/')

        self.threads = threads
        self.EPISODES = episodes
        self.entropy_weight = entropy_weight
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor

        actor = Actor(action_space_size=env.action_space.n)
        critic = Critic()
        self.global_model = ActorCriticModel(actor, critic)

        self.actor_loss = ActorLoss(entropy_weight)

        self.optimizer = tf.keras.optimizers.RMSprop(lr=learning_rate)

        self.global_model(
            tf.convert_to_tensor(
                np.random.random((1, env.observation_space.shape[0]))))
Example #8
 def __init__(self, observation_space, action_space, lr_actor, lr_critic, gamma,
              device='cpu', discrete=False, project_dim=8):
     """
     Parameters
     ----------
     observation_space: int
         Number of flattened entries of the state
     action_space: int
         Number of (discrete) possible actions to take
     """
     
     self.gamma = gamma
     
     self.n_actions = action_space
     self.discrete = discrete
     if self.discrete:
         self.actor = DiscreteActor(observation_space, action_space, project_dim)
         self.critic = DiscreteCritic(observation_space, project_dim)
     else:
         self.actor = Actor(observation_space, action_space)
         self.critic = Critic(observation_space)
     self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=lr_actor)
     self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=lr_critic)
     
     self.device = device 
Example #9
    def __init__(self, image_size, input_channels, hidden_channels, output_channels, latent_dimension, lr, device, clamp=0.01, gp_weight=10):
        self.image_size = image_size
        self.input_channels = input_channels
        self.hidden_chanels = hidden_channels
        self.output_channels = output_channels
        self.latent_dimension = latent_dimension
        self.device = device
        self.clamp = clamp
        self.gp_weight = gp_weight

        self.critic = Critic(image_size, hidden_channels,
                             input_channels).to(device)
        self.generator = Generator(
            image_size, latent_dimension, hidden_channels, output_channels).to(device)

        self.critic.apply(self.weights_init)
        self.generator.apply(self.weights_init)

        # Note: the Adam optimizers below immediately override these RMSprop ones
        # (RMSprop suits the weight-clipping WGAN; Adam with betas=(0, 0.9) is the usual WGAN-GP choice),
        # so Adam is what is actually used.
        # self.optimizer_critic = torch.optim.RMSprop(self.critic.parameters(), lr)
        # self.optimizer_gen = torch.optim.RMSprop(self.generator.parameters(), lr)

        self.optimizer_critic = torch.optim.Adam(
            self.critic.parameters(), lr, betas=(0, 0.9))
        self.optimizer_gen = torch.optim.Adam(
            self.generator.parameters(), lr, betas=(0, 0.9))

        self.critic_losses = []
        self.gen_losses = []

        self.losses = []
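The gp_weight argument suggests this critic is trained with a WGAN-GP gradient penalty (while clamp points at the older weight-clipping variant). A standard gradient-penalty term, written as a hedged sketch since the actual training step is not shown (the critic is assumed to take 4D image batches and return one score per image):

import torch

def gradient_penalty(critic, real, fake, device, gp_weight=10.0):
    # Penalize deviations of the critic's gradient norm from 1 on random interpolates (WGAN-GP).
    batch_size = real.size(0)
    eps = torch.rand(batch_size, 1, 1, 1, device=device)
    interpolates = (eps * real + (1.0 - eps) * fake).requires_grad_(True)
    scores = critic(interpolates)
    grads = torch.autograd.grad(outputs=scores, inputs=interpolates,
                                grad_outputs=torch.ones_like(scores),
                                create_graph=True, retain_graph=True)[0]
    grads = grads.view(batch_size, -1)
    return gp_weight * ((grads.norm(2, dim=1) - 1.0) ** 2).mean()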
Example #10
def get_models(latent_dim, model_dim, device, output_dim, channels, init=True):
    generator = Generator(latent_dim, model_dim, channels).to(device)
    critic = Critic(model_dim, output_dim, channels).to(device)
    if init:
        generator.apply(__weights_init_normal)
        critic.apply(__weights_init_normal)
    return generator, critic
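__weights_init_normal is referenced but not defined in this snippet; a common DCGAN-style initializer it might resemble (purely an assumption about the project's helper):

import torch.nn as nn

def weights_init_normal(m):
    # DCGAN-style init: N(0, 0.02) for conv weights, N(1, 0.02) for norm-layer scales, zero biases.
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif classname.find("BatchNorm") != -1:
        nn.init.normal_(m.weight.data, 1.0, 0.02)
        nn.init.constant_(m.bias.data, 0.0)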
Example #11
    def __init__(self, state_dim, action_dim, num_shared, device):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.device = device

        self.actor = Actor(state_dim, action_dim, num_shared).to(device)
        self.critic = Critic(state_dim, num_shared).to(device)
Example #12
    def __init__(self, policy: str, action_dim: int, max_action: float,
                 lr: float, discount: float, noise_clip: float,
                 policy_noise: float, policy_freq: int, actor_rng: jnp.ndarray,
                 critic_rng: jnp.ndarray, sample_state: np.ndarray):
        self.discount = discount
        self.noise_clip = noise_clip
        self.policy_noise = policy_noise
        self.policy_freq = policy_freq
        self.max_action = max_action
        self.td3_update = policy == 'TD3'

        self.actor = hk.transform(lambda x: Actor(action_dim, max_action)(x))
        actor_opt_init, self.actor_opt_update = optix.adam(lr)

        self.critic = hk.transform(lambda x: Critic()(x))
        critic_opt_init, self.critic_opt_update = optix.adam(lr)

        self.actor_params = self.target_actor_params = self.actor.init(
            actor_rng, sample_state)
        self.actor_opt_state = actor_opt_init(self.actor_params)

        action = self.actor.apply(self.actor_params, sample_state)

        self.critic_params = self.target_critic_params = self.critic.init(
            critic_rng, jnp.concatenate((sample_state, action), 0))
        self.critic_opt_state = critic_opt_init(self.critic_params)

        self.updates = 0
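This JAX/Haiku agent keeps separate target parameter trees (target_actor_params, target_critic_params); a sketch of how such targets are usually soft-updated with a Polyak average over the parameter pytree (the function name and tau default are assumptions, not this project's code):

import jax

def soft_update(target_params, online_params, tau=0.005):
    # theta_target <- (1 - tau) * theta_target + tau * theta_online, applied leaf by leaf over the pytree.
    return jax.tree_util.tree_map(
        lambda t, o: (1.0 - tau) * t + tau * o, target_params, online_params)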
Example #13
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = [
            OUNoise(action_size, random_seed, sigma=0.1)
            for i in range(self.num_agents)
        ]

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # Make sure target is with the same weight as the source
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

        self.t_step = 0
Example #14
    def __init__(self, state_dim, action_dim, device):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.device = device

        self.actor = Actor(state_dim, action_dim).to(device)
        self.critic = Critic(state_dim).to(device)
        self.optimizer = torch.optim.Adam(
            itertools.chain(self.actor.parameters(), self.critic.parameters()),
            LR)

        self.philosophers = list()
        for i in range(P_COUNT):
            self.philosophers.append(Critic(state_dim).to(device))

        self.p_optimizers = [
            torch.optim.Adam(p.parameters(), lr=P_LR)
            for p in self.philosophers
        ]
        self.update_cnt = 0
Example #15
    def __init__(self, num_agents, state_size, action_size, random_seed=2018):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.device = torch.device('cuda' if cuda else 'cpu')

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed, device)
Example #16
    def __init__(self):
      
        self.max_action = 1
        self.policy_freq = 2
        self.policy_freq_it = 0
        self.batch_size = 512
        self.discount = 0.99
        self.replay_buffer = int(1e5)
        
        
        self.device = 'cuda'
        
        self.state_dim = 24
        self.action_dim = 2
        self.max_action = 1
        self.policy_noise = 0.1
        self.agents = 1
        
        self.random_period = 1e4
        
        self.tau = 5e-3
        
        self.replay_buffer = ReplayBuffer(self.replay_buffer)
        
        self.actor = Actor(self.state_dim, self.action_dim, self.max_action).to(self.device)
        self.actor_target = Actor(self.state_dim, self.action_dim, self.max_action).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-4)
#         self.actor.load_state_dict(torch.load('actor2.pth'))
#         self.actor_target.load_state_dict(torch.load('actor2.pth'))

        self.noise = OUNoise(2, 32)
        
        
        self.critic = Critic(48, self.action_dim).to(self.device)
        self.critic_target = Critic(48, self.action_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)
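The policy_noise and policy_freq fields suggest TD3-style training (delayed policy updates plus target policy smoothing). A hedged sketch of the smoothing step those fields usually feed into (noise_clip is an assumed value; this agent's actual learning step is not shown):

import torch

def smoothed_target_action(actor_target, next_state, policy_noise=0.1, noise_clip=0.5, max_action=1.0):
    # TD3 target policy smoothing: add clipped Gaussian noise to the target action, then clamp to bounds.
    action = actor_target(next_state)
    noise = (torch.randn_like(action) * policy_noise).clamp(-noise_clip, noise_clip)
    return (action + noise).clamp(-max_action, max_action)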
Example #17
    def __init__(self,
                 env,
                 hidden_size=256,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 gamma=0.99,
                 tau=1e-3,
                 max_memory=int(1e6)):
        obs = env.reset()
        self.num_states = obs['desired_goal'].shape[0] + obs[
            'observation'].shape[0]
        self.num_actions = env.action_space.shape[0]
        self.gamma = gamma
        self.tau = tau
        self.action_max = env.action_space.high[0]

        self.actor = Actor(self.num_states, hidden_size, self.num_actions)
        self.critic = Critic(self.num_states + self.num_actions, hidden_size,
                             1)

        self.target_actor = Actor(self.num_states, hidden_size,
                                  self.num_actions)
        self.target_critic = Critic(self.num_states + self.num_actions,
                                    hidden_size, 1)

        for target_param, param in zip(self.target_actor.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.target_critic.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.experience_replay = ExperienceReplay(max_memory)
        self.critic_loss_func = nn.MSELoss()
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr)
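num_states here is the concatenated size of the goal-conditioned observation; a small sketch of how such a dict observation is typically flattened before being fed to the actor and critic (the concatenation order is an assumption):

import numpy as np

def flatten_obs(obs):
    # Concatenate the 'observation' and 'desired_goal' fields of a goal-based Gym observation dict.
    return np.concatenate([obs['observation'], obs['desired_goal']])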
Example #18
    def __init__(self, device, state_size, action_size, folder, config):

        self.folder = folder
        self.config = config
        self.device = device
        self.memory = ReplayMemory(self.config["MEMORY_CAPACITY"])

        self.state_size = state_size
        self.action_size = action_size

        self.critic = Critic(self.state_size, self.action_size, self.device,
                             self.config)
        self.actor = Actor(self.state_size, self.action_size, self.device,
                           self.config)
Example #19
    def __init__(self, device, memory, state_size, action_size, low_bound, high_bound, folder, config):

        self.folder = folder
        self.config = config
        self.device = device
        self.memory = memory

        self.state_size = state_size
        self.action_size = action_size
        self.low_bound = low_bound
        self.high_bound = high_bound

        self.critic = Critic(state_size, action_size, device, self.config)
        self.actor = Actor(state_size, action_size, low_bound, high_bound, device, self.config)
Example #20
    def __init__(self, state_size, action_size, seed):
        """ Initialize a DDPG Agent Object
        :param state_size: dimension of state (input) for this decentralized actor
        :param action_size: dimension of action (output) for this decentralized actor
        :param seed: random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Hyperparameters
        self.buffer_size = 100000
        self.batch_size = 256
        self.gamma = 0.99
        self.tau = 0.01
        self.lr_actor = 0.0001
        self.lr_critic = 0.001

        # Setup Networks (Actor: State -> Action, Critic: (States for all agents, Actions for all agents) -> Value)
        self.actor_local = Actor(self.state_size, self.action_size,  self.seed).to(self.device)
        self.actor_target = Actor(self.state_size, self.action_size, self.seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr = self.lr_actor)
        self.critic_local = Critic(self.state_size, self.action_size, self.seed).to(self.device)
        self.critic_target = Critic(self.state_size, self.action_size, self.seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr = self.lr_critic)

        # Initialize local and target networks to start with same parameters
        self.soft_update(self.actor_local, self.actor_target, tau=1)
        self.soft_update(self.critic_local, self.critic_target, tau=1)

        # Noise Setup
        self.noise = OUNoise(self.action_size, self.seed)

        # Replay Buffer Setup
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
Example #21
    def __init__(self, state_size, action_size, fc1_units, fc2_units):
        """Initialize an Agent object.

        Params
        ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = torch.manual_seed(SEED)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, fc1_units,
                                 fc2_units).to(device)
        self.actor_target = Actor(state_size, action_size, fc1_units,
                                  fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, fc1_units,
                                   fc2_units).to(device)
        self.critic_target = Critic(state_size, action_size, fc1_units,
                                    fc2_units).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OrnsteinUhlenbeck(action_size, SEED)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, SEED,
                                   device)
Example #22
    def __init__(self,
                 env,
                 log_dir,
                 gamma=0.99,
                 batch_size=64,
                 sigma=0.2,
                 batch_norm=True,
                 merge_layer=2,
                 buffer_size=int(1e6),
                 buffer_min=int(1e4),
                 tau=1e-3,
                 Q_wd=1e-2,
                 num_episodes=1000):

        self.s_dim = env.reset().shape[0]
        # self.a_dim = env.action_space.shape[0]
        self.a_dim = env.action_space2.shape[0]
        # self.a_dim = 1

        self.env = env
        # self.mu = Actor(self.s_dim, self.a_dim, env.action_space, batch_norm=batch_norm)
        self.mu = Actor(self.s_dim,
                        self.a_dim,
                        env.action_space2,
                        batch_norm=batch_norm)
        self.Q = Critic(self.s_dim,
                        self.a_dim,
                        batch_norm=batch_norm,
                        merge_layer=merge_layer)
        self.targ_mu = copy.deepcopy(self.mu).eval()
        self.targ_Q = copy.deepcopy(self.Q).eval()
        self.noise = OrnsteinUhlenbeck(mu=torch.zeros(self.a_dim),
                                       sigma=sigma * torch.ones(self.a_dim))
        self.buffer = Buffer(buffer_size, self.s_dim, self.a_dim)
        self.buffer_min = buffer_min
        self.mse_fn = torch.nn.MSELoss()
        self.mu_optimizer = torch.optim.Adam(self.mu.parameters(), lr=1e-4)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=1e-3,
                                            weight_decay=Q_wd)

        self.gamma = gamma
        self.batch_size = batch_size
        self.num_episodes = num_episodes
        self.tau = tau
        self.log_dir = log_dir

        self.fill_buffer()
Example #23
 def __init__(self, args):
     self.args = args
     self.critic = Critic(args.dim_s, args.dim_a, args.dim_h, args.device)
     self.optim = torch.optim.Adam(self.critic.parameters(), lr=0.001)
     self.scheduler_lr = torch.optim.lr_scheduler.StepLR(self.optim, step_size=1000, gamma=0.9, last_epoch=-1)
     self.predicate = get_predicate()
     f = get_formula()
     self.formula = add_w_to_formula(f, [0 for i in f])
     self.database = []
     self.data, self.mln = self.model_config(self.predicate, self.formula, self.database, 'TicTacToe.mln', 'TicTacToe.db')
     self.state_list = []
     self.step = 0
     self.action_list={0:{0:'Place(0,0)', 1:'Place(0,1)', 2:'Place(0,2)'},
                       1:{0:'Place(1,0)', 1:'Place(1,1)', 2:'Place(1,2)'},
                       2:{0:'Place(2,0)', 1:'Place(2,1)', 2:'Place(2,2)'}}
     self.EPSILON = 1
Example #24
    def __init__(self, args, env):

        self.learning_rate = args.learning_rate
        self.gamma = args.gamma
        self.lamb = args.lamb
        self.batch_size = args.batch_size
        self.step = 0
        self.epochs = args.epochs
 
        self.actor = Actor()
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.learning_rate)

        self.critic = Critic()
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.learning_rate)

        self.env = env
        self.num_actions = env.num_actions
        self.num_states = env.num_states

        self.data = {'step' : [], 'reward' : [], 'losses' : []}
Example #25
    def __init__(self, env_name: str, save_dir: str, entropy_weight: float,
                 discount_factor: float, id: int):
        import tensorflow as tf
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.id = id

        self.save_dir = save_dir

        self.discount_factor = discount_factor

        actor = Actor(action_space_size=self.env.action_space.n)
        critic = Critic()
        self.local_model = ActorCriticModel(actor, critic)

        self.actor_loss = ActorLoss(entropy_weight)

        self.local_model(
            tf.convert_to_tensor(
                np.random.random((1, self.env.observation_space.shape[0]))))
Example #26
    def update(self, trajectories):
        trajectories = map(self._compute_lambda_returns_and_gae, trajectories)
        transitions = sum(
            trajectories,
            [])  # Turn a list of trajectories into list of transitions

        state, action, old_log_prob, target_value, advantage = zip(
            *transitions)
        state = torch.from_numpy(np.array(state)).float().to(self.device)
        action = torch.from_numpy(np.array(action)).float().to(self.device)
        old_log_prob = torch.from_numpy(np.array(old_log_prob)).float().to(
            self.device)
        target_value = torch.from_numpy(np.array(target_value)).float().to(
            self.device)
        advantage = torch.from_numpy(np.array(advantage)).float().to(
            self.device)

        for _ in range(BATCHES_PER_UPDATE):
            idx = np.random.randint(0, len(transitions), BATCH_SIZE)
            loss = self._calc_loss(state[idx], action[idx], old_log_prob[idx],
                                   target_value[idx], advantage[idx])

            self.optimizer.zero_grad()
            for p_optimizer in self.p_optimizers:
                p_optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            for p_optimizer in self.p_optimizers:
                p_optimizer.step()

        self.update_cnt += 1
        if self.update_cnt % P_DELAY == 0:
            self.critic = self.philosophers[0]
            self.optimizer = self.p_optimizers[0]

            self.philosophers.pop(0)
            self.philosophers.append(Critic(self.state_dim).to(self.device))
            self.p_optimizers.pop(0)
            self.p_optimizers.append(
                torch.optim.Adam(self.philosophers[-1].parameters(), lr=P_LR))
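_compute_lambda_returns_and_gae is not shown; a standard Generalized Advantage Estimation routine it presumably resembles (the gamma and lam values, and the exact trajectory layout, are assumptions):

import numpy as np

def compute_gae(rewards, values, gamma=0.99, lam=0.95):
    # values must include one extra bootstrap entry for the state after the final step.
    advantages = np.zeros(len(rewards), dtype=np.float32)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * lam * gae
        advantages[t] = gae
    returns = advantages + np.asarray(values[:-1], dtype=np.float32)  # lambda-returns
    return advantages, returns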
Example #27
    # Load the datasets
    X_set, Y_set = load_data()

    # Load infinite data
    X_data = get_infinite_X_data(X_set)
    Y_data = get_infinite_Y_data(Y_set)

########################################################
# Define device, neural nets, losses, optimizers, etc. #
########################################################

# Automatic GPU/CPU device placement
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Networks
C_X = Critic().to(device)  # Criticizes X data
C_Y = Critic().to(device)  # Criticizes Y data
G = Generator(upsample=UPSAMPLE).to(device)  # Translates X -> Y
F = Generator(upsample=UPSAMPLE).to(device)  # Translates Y -> X

# Losses
l1_loss = nn.L1Loss()

# Optimizers
C_X_optim = optim.Adam(C_X.parameters(), lr=LR, betas=(BETA1, BETA2))
C_Y_optim = optim.Adam(C_Y.parameters(), lr=LR, betas=(BETA1, BETA2))
G_optim = optim.Adam(G.parameters(), lr=LR, betas=(BETA1, BETA2))
F_optim = optim.Adam(F.parameters(), lr=LR, betas=(BETA1, BETA2))

###############
# Training #
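The training loop of this translation setup (G: X -> Y, F: Y -> X, with critics C_X and C_Y) is cut off above. As a hedged illustration only, one generator step built around the cycle-consistency term that the L1 loss is typically used for (next(...) on the infinite data iterators, the .to(device) calls, LAMBDA_CYCLE, and the omitted adversarial terms are all assumptions):

LAMBDA_CYCLE = 10.0  # assumed weighting of the cycle-consistency term

x = next(X_data).to(device)
y = next(Y_data).to(device)
fake_y = G(x)   # X -> Y
fake_x = F(y)   # Y -> X
cycle_loss = l1_loss(F(fake_y), x) + l1_loss(G(fake_x), y)
gen_loss = LAMBDA_CYCLE * cycle_loss  # + adversarial terms computed from C_X and C_Y
G_optim.zero_grad()
F_optim.zero_grad()
gen_loss.backward()
G_optim.step()
F_optim.step()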
Example #28
        critic = torch.load(model_path + '_critic.pt')
        print('...done')

        run(actor, env, min_rate=0.05, writer=writer, render=True)

    else:  # TRAIN MODE
        # Non-linearity is an argument
        non_linear = None
        if args.non_linear == 'relu':
            non_linear = torch.nn.ReLU()
        elif args.non_linear == 'elu':
            non_linear = torch.nn.ELU()

        # New actor and critic policies
        actor = Actor(use_gpu=use_gpu, non_linear=non_linear, batch_norm=args.batch_norm)
        critic = Critic(use_gpu=use_gpu, non_linear=non_linear, batch_norm=args.batch_norm)

        for i in range(args.num_train_cycles):
            print('Training cycle %s of %s' % (i, args.num_train_cycles))
            act(actor, env, task, B,
                num_trajectories=args.num_trajectories,
                task_period=30, writer=writer)
            learn(actor, critic, task, B,
                  num_learning_iterations=args.num_learning_iterations,
                  episode_batch_size=args.episode_batch_size,
                  lr=0.0002, writer=writer, loss=args.loss)
            run(actor, env, min_rate=0.05, writer=writer)
            # Remove early trajectories when buffer gets too large
            B = B[-args.buffer_size:]

        # Save the model to local directory
Example #29
def run_seed(seed_params):
    '''
    Runs a learning simulation for the seed in question.

    Parameters
    ----------
    seed_params : dict

    Returns
    -------
    Seed_rewards : np.array
        Reward history for the seed.
    Seed_entropies : np.array
        Entropy history for the seed.
    Loss_history : list
        History of the losses of the critic's value function.

    '''
    env_params = seed_params['Env']
    sim_params = seed_params['Sim']
    hyper_params = seed_params['Hyp']

    seed = seed_params['seed']
    env = Environment(env_params, n_cyclists=sim_params['n_cyclists'])
    action_space = env.action_space
    np.random.seed(seed)
    state_normaliser = State_normaliser(env_params)
    nA = len(env.action_space)
    # Init weights
    model = []
    for i in range(sim_params['n_cyclists']):
        model.append({
            'actor': Actor(env, hyper_params, seed),
            'critic': Critic(env, hyper_params, seed)
        })

    # Keep stats for final print of graph
    episode_rewards = []
    episode_entropies = []

    # Main loop
    for episode in range(sim_params['n_episodes']):
        cyclists_done = []
        # Keep track of game score to print
        states = env.reset()

        states = [state_normaliser.normalise_state(state) for state in states]
        step_rewards = [[] for i in range(sim_params['n_cyclists'])]
        step_entropies = [[] for i in range(sim_params['n_cyclists'])]

        scores = [0] * sim_params['n_cyclists']
        step = 0

        while True:
            actions = []
            for state in states:
                cyclist_number = state['number']
                # Sample from policy and take action in environment
                if cyclist_number in cyclists_done:
                    model[cyclist_number]['critic'].store_dead(
                        state['state'], model[state['number']]['actor'])
                    actions.append(0)
                else:
                    try:
                        probs, action, entropy = model[cyclist_number][
                            'actor'].choose_action(state['state'])
                    except ValueError:
                        break
                    step_entropies[cyclist_number].append(entropy)
                    actions.append(action)

                    model[cyclist_number]['critic'].store_transition_1(
                        state['state'], action, entropy, probs)

            next_states, rewards, done, cyclists_done = env.step(actions)
            next_states = [
                state_normaliser.normalise_state(state)
                for state in next_states
            ]
            for next_state, reward in zip(next_states, rewards):
                cyclist_number = next_state['number']

                model[next_state['number']]['critic'].store_transition_2(
                    reward, next_state['state'],
                    model[cyclist_number]['actor'])

            states = next_states.copy()
            for state, reward in zip(states, rewards):
                scores[state['number']] += reward
                step_rewards[state['number']].append(reward)

            step += 1
            if done:
                # print(state['state'])
                [
                    model[state['number']]['critic'].store_dead(
                        state['state'], model[state['number']]['actor'])
                    for state in states
                ]
                break

        episode_entropies.append(
            [np.mean(entrops) for entrops in step_entropies])
        # Append for logging and print
        episode_rewards.append(scores)
        if sim_params['print_rewards']:
            print(f'Seed: {seed}, EP: {episode}, Score: {scores}', flush=True)
            print()

    Seed_entropies = np.array(episode_entropies)
    Seed_rewards = np.array(episode_rewards)
    return Seed_rewards, Seed_entropies, [c['critic'].losses for c in model]
Example #30
    def __init__(self,
                 obs_shape,
                 action_shape,
                 device,
                 model_kind,
                 kind='D',
                 step_MVE=5,
                 hidden_dim=256,
                 discount=0.99,
                 init_temperature=0.01,
                 alpha_lr=1e-3,
                 alpha_beta=0.9,
                 actor_lr=1e-3,
                 actor_beta=0.9,
                 actor_log_std_min=-10,
                 actor_log_std_max=2,
                 critic_lr=1e-3,
                 critic_beta=0.9,
                 critic_tau=0.005,
                 critic_target_update_freq=2,
                 model_lr=1e-3,
                 log_interval=100):

        self.device = device
        self.discount = discount
        self.critic_tau = critic_tau
        self.critic_target_update_freq = critic_target_update_freq
        self.log_interval = log_interval
        self.step_MVE = step_MVE
        self.model_kind = model_kind

        self.actor = Actor(obs_shape, action_shape, hidden_dim,
                           actor_log_std_min, actor_log_std_max).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=actor_lr,
                                                betas=(actor_beta, 0.999))

        self.critic = Critic(obs_shape, action_shape, hidden_dim).to(device)
        self.critic_target = Critic(obs_shape, action_shape,
                                    hidden_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=critic_lr,
                                                 betas=(critic_beta, 0.999))

        self.log_alpha = torch.tensor(np.log(init_temperature)).to(device)
        self.log_alpha.requires_grad = True
        self.target_entropy = -np.prod(
            action_shape)  # set target entropy to -|A|
        self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha],
                                                    lr=alpha_lr,
                                                    betas=(alpha_beta, 0.999))

        if self.model_kind == 'dynode_model':
            self.model = DyNODE(obs_shape,
                                action_shape,
                                hidden_dim_p=200,
                                hidden_dim_r=200).to(device)
        elif self.model_kind == 'nn_model':
            self.model = NN_Model(obs_shape,
                                  action_shape,
                                  hidden_dim_p=200,
                                  hidden_dim_r=200,
                                  kind=kind).to(device)
        else:
            raise ValueError('model is not supported')  # asserting a non-empty string would always pass

        self.model_optimizer = torch.optim.Adam(self.model.parameters(),
                                                lr=model_lr)

        self.train()
        self.critic_target.train()
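The log_alpha / target_entropy pair follows the SAC entropy-temperature objective. A standard sketch of how the temperature is usually updated from the actor's log-probabilities (the function name and exact usage are assumptions; this agent's update methods are not shown):

import torch

def update_temperature(log_alpha, log_alpha_optimizer, log_pi, target_entropy):
    # alpha_loss = E[ -alpha * (log_pi + target_entropy) ]; gradients flow only into log_alpha.
    alpha = log_alpha.exp()
    alpha_loss = (alpha * (-log_pi - target_entropy).detach()).mean()
    log_alpha_optimizer.zero_grad()
    alpha_loss.backward()
    log_alpha_optimizer.step()
    return alpha_loss.item()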