Example #1
    def __init__(self, memory, nb_status, nb_actions, action_noise=None,
                 gamma=0.99, tau=0.001, normalize_observations=True,
                 batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.),
                 actor_lr=1e-4, critic_lr=1e-3):
        self.nb_status = nb_status
        self.nb_actions = nb_actions
        self.action_range = action_range
        self.observation_range = observation_range
        self.normalize_observations = normalize_observations

        self.actor = Actor(self.nb_status, self.nb_actions)
        self.actor_target = Actor(self.nb_status, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)

        self.critic = Critic(self.nb_status, self.nb_actions)
        self.critic_target = Critic(self.nb_status, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

        # Create replay buffer
        self.memory = memory  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.action_noise = action_noise

        # Hyper-parameters
        self.batch_size = batch_size
        self.tau = tau
        self.discount = gamma

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd()
        else:
            self.obs_rms = None
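
Example #1 normalizes observations with a RunningMeanStd helper that is not shown. Below is a minimal sketch of such a running-statistics tracker, assuming the usual batched mean/variance merge; the class actually used by the original code may differ in detail.

import numpy as np

class RunningMeanStd:
    """Tracks a running mean and variance over batches of observations."""
    def __init__(self, shape=(), epsilon=1e-4):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon  # avoids division by zero before the first update

    def update(self, x):
        """Merge the moments of batch `x` (shape [batch, *shape]) into the running stats."""
        batch_mean, batch_var, batch_count = np.mean(x, axis=0), np.var(x, axis=0), x.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total
        m2 = (self.var * self.count + batch_var * batch_count
              + delta ** 2 * self.count * batch_count / total)
        self.mean, self.var, self.count = new_mean, m2 / total, total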
Example #2
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0        
        if self.pic:
            self.nb_status = args.pic_status
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'use_bn':args.bn,
            'init_method':args.init_method
        }
        if args.pic:
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()
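
Example #2 relies on hard_update (and, during training, a matching soft_update) without defining them. A minimal sketch with the usual DDPG parameter-copy semantics, assuming both arguments are torch.nn.Module instances:

def hard_update(target, source):
    """Copy source parameters into target (used once, right after construction)."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    """Polyak-average source into target: theta_target <- tau*theta_source + (1-tau)*theta_target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)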
Example #3
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
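
Example #3 (and several of the later agents) builds a ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) that is not included in the snippet. A minimal sketch compatible with the add/sample/__len__ calls used in these examples, assuming a module-level device and that sampled fields come back as float tensors:

import random
from collections import deque, namedtuple

import numpy as np
import torch

class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples random minibatches."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences and move them to `device` (assumed defined)."""
        experiences = random.sample(self.memory, k=self.batch_size)
        to_tensor = lambda arr: torch.from_numpy(np.vstack(arr)).float().to(device)
        states = to_tensor([e.state for e in experiences])
        actions = to_tensor([e.action for e in experiences])
        rewards = to_tensor([e.reward for e in experiences])
        next_states = to_tensor([e.next_state for e in experiences])
        dones = to_tensor([np.asarray(e.done, dtype=np.uint8) for e in experiences])
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current number of stored experiences."""
        return len(self.memory)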
Example #4
def buscarActores(pkActores):
    """
    Busca a más de un actor y su información desde una lista con las Pk de
    los actores que se desean buscar.
    """
    actores = Actor.actores(pkActores)

    return actores
Example #5
def crearActor(id_actor, nombre, nacimiento, genero):
    """
    Método que crea un actor.
    Valida la información recibida.
    @param nombre del actor
    @param fecha de nacimiento del actor
    @genero masculino o femenino
    """
    nuevo = Actor()

    nuevo.id_actor = id_actor

    if len(nombre.strip()) == 0:
        mensaje = u"Ingrese nombre del actor"
        return mensaje
    if not nombre.strip().replace(" ", "").isalpha():
        mensaje = u"Nombre del actor no valido"
        return mensaje
    nombre = nombre.strip()
    nuevo.nombre = nombre

    if "Mes" in nacimiento:
        mensaje = u"Ingrese mes de cumpleaños."
        return mensaje
    nuevo.nacimiento = nacimiento

    if "No definido o.O" in genero:
        mensaje = u"Especifique el genero del actor"
        return mensaje
    nuevo.genero = genero
    nuevo.save()

    return True
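
The crearActor above returns either a validation message (a string) or True on success, so a caller has to check the return value. A hypothetical usage sketch (the argument values are illustrative only):

resultado = crearActor("7", "Diego Luna", "29/12/1979", "Masculino")
if resultado is True:
    print("Actor created successfully")
else:
    print("Validation error:", resultado)  # resultado holds the message returned above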
Example #6
def crearActor(nombre, codigo, semestre, area):
    """
    Método que crea un curso. Lo correcto sería validar
    que toda la información es correcta
    Ej:
        - Semestre puede ser 1 o 2
        - Los códigos podrían tener un formato predefinido
        - Etc
    """
    nuevo = Actor()
    nuevo.nombre = nombre
    # Validations for the codigo field could go here
    nuevo.codigo = codigo
    nuevo.semetre = semestre
    nuevo.area = area
    nuevo.save()
Example #7
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor - 0.99
        self.tau = 0.01  # for soft update of target parameters - 0.01
        
        # Score tracker and learning parameters
        self.best_w = None
        self.best_score = -np.inf
        self.score = -np.inf
Example #8
def crearActor(nombre, nacimiento, genero, imagen):
    """
    Método que crea un actor.
    Valida la información recibida.
    @param nombre del actor
    @param fecha de nacimiento del actor
    @genero masculino o femenino
    @imagen dirección de la imagen que contiene al actor
    """
    nuevo = Actor()

    if len(nombre.strip()) == 0:
        mensaje = u"Ingrese nombre del actor"
        return mensaje
    if not nombre.strip().replace(" ", "").isalpha():
        mensaje = u"Nombre del actor no valido"
        return mensaje
    nombre = nombre.strip()
    nuevo.nombre = nombre

    if "Mes" in nacimiento:
        mensaje = u"Ingrese mes de cumpleaños."
        return mensaje
    nuevo.nacimiento = nacimiento

    if "No definido o.O" in genero:
        mensaje = u"Especifique el genero del actor"
        return mensaje
    nuevo.genero = genero

    nuevo.imagen = imagen

    nuevo.save()

    # Now store the image in its corresponding directory
    id_actor = nuevo.id_actor[0]
    nuevaImagen = "imgActor/{}".format(id_actor)
    almacenarImagen(imagen, nuevaImagen)

    return True
Example #9
def ppo():
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    env = UnityEnvironment(file_name="../Reacher_Linux/Reacher.x86_64",
                           no_graphics=True)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents in the environment
    print('Number of agents:', len(env_info.agents))

    # number of actions
    action_size = brain.vector_action_space_size
    print('Number of actions:', action_size)

    # examine the state space
    state = env_info.vector_observations[0]
    print('States look like:', state)
    state_size = len(state)
    print('States have length:', state_size)

    config = Config()
    config.env = env

    config.actor_critic_fn = lambda: ActorCritic(
        actor=Actor(state_size, action_size), critic=Critic(state_size))

    config.discount = 0.99
    config.use_gae = True
    config.gae_tau = 0.95
    config.gradient_clip = 5
    config.rollout_length = 2048
    config.optimization_epochs = 5
    config.num_mini_batches = 512
    config.ppo_ratio_clip = 0.2
    config.log_interval = 10 * 2048
    config.max_steps = 2e7
    config.eval_episodes = 10
    # config.logger = get_logger()

    print("GPU available: {}".format(torch.cuda.is_available()))
    print("GPU tensor test: {}".format(torch.rand(3, 3).cuda()))

    agent = PPOAgent(config)

    random_seed()
    config = agent.config
    t0 = time.time()
    scores = []
    scores_window = deque(maxlen=100)  # last 100 scores

    while True:
        if config.log_interval and not agent.total_steps % config.log_interval and len(
                agent.episode_rewards):
            rewards = agent.episode_rewards
            for reward in rewards:
                scores.append(reward)
                scores_window.append(reward)
            agent.episode_rewards = []

            print('\r===> {:d} episodes\tAverage Score: {:.2f}'.format(
                len(scores), np.mean(scores_window)))
            if np.mean(scores_window) >= 1.0:
                print(
                    '\nEnvironment solved in {:d}  episodes!\tAverage Score: {:.2f}'
                    .format(len(scores_window), np.mean(scores_window)))
                torch.save(agent.actor_critic.state_dict(),
                           '../checkpoints/ppo_checkpoint.pth')
                break

            print(
                'Total steps %d, returns %d/%.2f/%.2f/%.2f/%.2f (count/mean/median/min/max), %.2f steps/s'
                % (agent.total_steps, len(rewards), np.mean(rewards),
                   np.median(rewards), np.min(rewards), np.max(rewards),
                   config.log_interval / (time.time() - t0)))

            t0 = time.time()

        agent.step()

    return scores
Example #10
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done, ts):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        #        for states, actions, rewards, state_next, complete in zip(state, action, reward, next_state, done):
        #            self.memory.add(states, actions, rewards, state_next, complete)
        self.memory.add(state, action, reward, next_state, done)
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and ts % 20 == 0:
            for _ in range(10):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        #        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
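
The Agent above is meant to be driven by an outer episode loop. Below is a hypothetical training-loop sketch, assuming a classic Gym environment with the 4-tuple step API and that Actor, Critic, OUNoise, ReplayBuffer and the module-level constants referenced by the class are defined as in the surrounding project; the hyperparameter values shown are illustrative.

import gym
import numpy as np
import torch

# Illustrative values for the module-level names the Agent class expects
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BUFFER_SIZE = int(1e6)
BATCH_SIZE = 128
GAMMA = 0.99
TAU = 1e-3
LR_ACTOR = 1e-4
LR_CRITIC = 1e-3
WEIGHT_DECAY = 0.0

env = gym.make("Pendulum-v1")  # any continuous-action environment
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0],
              random_seed=2)

for episode in range(200):
    state = env.reset()
    agent.reset()
    score = 0.0
    for t in range(300):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done, t)
        state = next_state
        score += reward
        if done:
            break
    print('Episode {}\tScore: {:.2f}'.format(episode, score))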
Example #11
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = 1.0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=1e-3)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=1e-3,
                                           weight_decay=0)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, int(1e6), 256, random_seed)

        # Make sure the targets start with the same weights as the source networks
        self.hard_copy(self.actor_target, self.actor_local)
        self.hard_copy(self.critic_target, self.critic_local)

    def step(self, states, actions, rewards, next_states, dones, timestep):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > 256 and timestep % 20 == 0:
            for _ in range(10):
                experiences = self.memory.sample()
                self.learn(experiences, 0.99)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""

        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.epsilon * self.noise.sample()

        return action

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.critic_local, self.critic_target, 1e-3)
        self.soft_update(self.actor_local, self.actor_target, 1e-3)

        self.epsilon -= 1e-6
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_copy(self, target, source):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
Example #12
class Agent():
    def __init__(self, state_size, action_size, random_seed):
        self.state_size = state_size
        self.action_size = action_size

        # Construct Actor networks
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Construct Critic networks
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

    def step(self, memory):
        if len(memory) > BATCH_SIZE:
            experiences = memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
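
Most of the agents here draw exploration noise from an OUNoise(action_size, seed) process that is never shown. A minimal sketch of an Ornstein-Uhlenbeck process compatible with the sample()/reset() calls above; the default mu, theta, and sigma are typical values, not taken from the original code:

import copy

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated noise for exploration."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        np.random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process by one step and return the new state as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state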
Example #13
class Agent():
    def __init__(self,
                 device,
                 state_size,
                 n_agents,
                 action_size,
                 random_seed,
                 buffer_size,
                 batch_size,
                 gamma,
                 TAU,
                 lr_actor,
                 lr_critic,
                 weight_decay,
                 checkpoint_folder='./'):

        self.DEVICE = device

        self.state_size = state_size
        self.n_agents = n_agents
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Hyperparameters
        self.BUFFER_SIZE = buffer_size
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.TAU = TAU
        self.LR_ACTOR = lr_actor
        self.LR_CRITIC = lr_critic
        self.WEIGHT_DECAY = weight_decay

        self.CHECKPOINT_FOLDER = checkpoint_folder

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(self.DEVICE)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(self.DEVICE)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(self.DEVICE)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(self.DEVICE)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.LR_CRITIC,
                                           weight_decay=self.WEIGHT_DECAY)
        '''
        if os.path.isfile(self.CHECKPOINT_FOLDER + 'checkpoint_actor.pth') and os.path.isfile(self.CHECKPOINT_FOLDER + 'checkpoint_critic.pth'):
            self.actor_local.load_state_dict(torch.load(self.CHECKPOINT_FOLDER + 'checkpoint_actor.pth'))
            self.actor_target.load_state_dict(torch.load(self.CHECKPOINT_FOLDER + 'checkpoint_actor.pth'))

            self.critic_local.load_state_dict(torch.load(self.CHECKPOINT_FOLDER + 'checkpoint_critic.pth'))
            self.critic_target.load_state_dict(torch.load(self.CHECKPOINT_FOLDER + 'checkpoint_critic.pth'))
        '''
        # Noise process
        self.noise = OUNoise((n_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(device, action_size, self.BUFFER_SIZE,
                                   self.BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for i in range(self.n_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.DEVICE)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.GAMMA * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        tau = self.TAU
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def checkpoint(self):
        torch.save(self.actor_local.state_dict(),
                   self.CHECKPOINT_FOLDER + 'checkpoint_actor.pth')
        torch.save(self.critic_local.state_dict(),
                   self.CHECKPOINT_FOLDER + 'checkpoint_critic.pth')
Example #14
class CAC(object):
    def __init__(self,
                 a_dim,
                 s_dim,
                 variant,
                 action_prior='uniform',
                 max_global_steps=100000):
        """
        a_dim : dimension of action space
        s_dim: state space dimension
        variant: dictionary containing parameters for the algorithms
        """
        ###############################  Model parameters  ####################################
        set_seed(variant['seed'])
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.actor = Actor(input_dim=s_dim,
                           output_dim=a_dim,
                           n_layers=3,
                           layer_sizes=[256, 256, 256],
                           hidden_activation="leakyrelu").to(self.device)
        self.actor_target = Actor(input_dim=s_dim,
                                  output_dim=a_dim,
                                  n_layers=3,
                                  layer_sizes=[256, 256, 256],
                                  hidden_activation="leakyrelu").to(
                                      self.device).eval()
        self.critic = LyapunovCritic(state_dim=s_dim,
                                     action_dim=a_dim,
                                     output_dim=None,
                                     n_layers=2,
                                     layer_sizes=[256, 256],
                                     hidden_activation="leakyrelu").to(
                                         self.device)
        self.critic_target = LyapunovCritic(state_dim=s_dim,
                                            action_dim=a_dim,
                                            output_dim=None,
                                            n_layers=2,
                                            layer_sizes=[256, 256],
                                            hidden_activation="leakyrelu").to(
                                                self.device).eval()

        # copy parameters of the learning network to the target network
        hard_update(self.critic_target, self.critic)
        hard_update(self.actor_target, self.actor)
        # disable gradient calculations of the target network
        stop_grad(self.critic_target)
        stop_grad(self.actor_target)
        # self.memory_capacity = variant['memory_capacity']

        ################################ parameters for training ###############################
        self.batch_size = variant['batch_size']  # batch size for learning the actor
        self.gamma = variant['gamma']  # discount factor
        self.tau = variant['tau']  # smoothing parameter for the weight updates
        self.approx_value = variant.get('approx_value', True)
        self._action_prior = action_prior  # prior over action space
        s_dim = s_dim * (variant['history_horizon'] + 1)
        self.a_dim, self.s_dim = a_dim, s_dim
        self.history_horizon = variant[
            'history_horizon']  # horizon to consider for the history
        self.working_memory = deque(maxlen=variant['history_horizon'] +
                                    1)  # memory to store history
        target_entropy = variant['target_entropy']
        if target_entropy is None:
            self.target_entropy = -self.a_dim  #lower bound of the policy entropy
        else:
            self.target_entropy = target_entropy
        self.target_variance = 0.0
        self.finite_horizon = variant['finite_horizon']
        self.soft_predict_horizon = variant['soft_predict_horizon']
        self.use_lyapunov = variant['use_lyapunov']
        self.adaptive_alpha = variant['adaptive_alpha']
        self.adaptive_beta = variant.get('adaptive_beta', False)
        self.time_near = variant['Time_near']
        self.max_global_steps = max_global_steps
        self.LR_A = variant['lr_a']
        self.LR_L = variant['lr_l']
        self.LR_lag = self.LR_A / 10
        self.alpha3 = variant['alpha3']

        labda = variant['labda']  # formula (12) in the paper
        alpha = variant['alpha']  # entropy temperature (beta in the paper)
        beta = variant['beta']  # constraint error weight

        self.log_labda = torch.log(torch.tensor([labda], device=self.device))
        self.log_alpha = torch.log(torch.tensor(
            [alpha], device=self.device))  # Entropy Temperature
        self.log_beta = torch.log(torch.tensor([beta], device=self.device))
        self.log_alpha.requires_grad = True
        self.log_beta.requires_grad = True
        self.log_labda.requires_grad = True
        # The update is in log space
        self.labda = torch.clamp(torch.exp(self.log_labda),
                                 min=SCALE_lambda_MIN_MAX[0],
                                 max=SCALE_lambda_MIN_MAX[1])
        self.alpha = torch.exp(self.log_alpha)
        self.beta = torch.clamp(torch.exp(self.log_beta),
                                min=SCALE_beta_MIN_MAX[0],
                                max=SCALE_beta_MIN_MAX[1])

        self.actor_optim = torch.optim.Adam(self.actor.parameters(),
                                            lr=self.LR_A)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(),
                                             lr=self.LR_L)
        self.alpha_optim = torch.optim.Adam([self.log_alpha], lr=self.LR_A)
        self.labda_optim = torch.optim.Adam([self.log_labda], lr=self.LR_lag)
        self.beta_optim = torch.optim.Adam([self.log_beta], lr=0.01)

        # step_fn = lambda i : 1.0 - (i - 1.)/self.max_global_steps
        # self.actor_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.actor_optim, lr_lambda = step_fn)
        # self.critic_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.critic_optim, lr_lambda = step_fn)
        # self.alpha_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.alpha_optim, lr_lambda = step_fn)
        # self.labda_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.labda_optim, lr_lambda = step_fn)
        # self.beta_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.beta_optim, lr_lambda = step_fn)

        self.actor.float()
        self.critic.float()

    def act(self, s, evaluation=False):
        a, deterministic_a, _, _ = self.actor(s)
        if evaluation:
            return deterministic_a
        else:
            return a

    def learn(self, batch):

        bs = torch.tensor(batch['s'],
                          dtype=torch.float).to(self.device)  # state
        ba = torch.tensor(batch['a'],
                          dtype=torch.float).to(self.device)  # action
        br = torch.tensor(batch['r'],
                          dtype=torch.float).to(self.device)  # reward
        bterminal = torch.tensor(batch['terminal'],
                                 dtype=torch.float).to(self.device)
        bs_ = torch.tensor(batch['s_'],
                           dtype=torch.float).to(self.device)  # next state
        b_s = torch.tensor(batch['_s'],
                           dtype=torch.float).to(self.device)  # prev state
        bv = None
        b_r_ = None
        # print(bs)
        alpha_loss = None
        beta_loss = None

        # # beta learning
        # self.beta_optim.zero_grad()
        # beta_loss = self.get_beta_loss(b_s)
        # if self.adaptive_beta:
        #     beta_loss.backward(retain_graph = False)
        #     self.beta_optim.step()
        # else:
        #     self.beta_optim.zero_grad()

        # lyapunov learning
        start_grad(self.critic)
        if self.finite_horizon:
            bv = torch.tensor(batch['value'])
            b_r_ = torch.tensor(batch['r_N_'])

        self.critic_optim.zero_grad()
        critic_loss = self.get_lyapunov_loss(bs, bs_, ba, br, b_r_, bv,
                                             bterminal)
        critic_loss.backward()
        self.critic_optim.step()

        # actor learning
        stop_grad(self.critic)
        self.actor_optim.zero_grad()
        actor_loss = self.get_actor_loss(bs, bs_, ba, br)
        actor_loss.backward(retain_graph=False)
        self.actor_optim.step()

        # alpha learning
        if self.adaptive_alpha:
            self.alpha_optim.zero_grad()
            alpha_loss = self.get_alpha_loss(bs, self.target_entropy)
            alpha_loss.backward(retain_graph=False)
            self.alpha_optim.step()
            self.alpha = torch.exp(self.log_alpha)
        # labda learning
        self.labda_optim.zero_grad()
        labda_loss = self.get_labda_loss(br, bs, bs_, ba)
        # print("labda loss = ", labda_loss)
        labda_loss.backward(retain_graph=False)
        self.labda_optim.step()
        self.labda = torch.clamp(torch.exp(self.log_labda),
                                 min=SCALE_lambda_MIN_MAX[0],
                                 max=SCALE_lambda_MIN_MAX[1])

        # update target networks
        soft_update(self.critic_target, self.critic, self.tau)
        soft_update(self.actor_target, self.actor, self.tau)
        return alpha_loss, beta_loss, labda_loss, actor_loss, critic_loss

    def get_alpha_loss(self, s, target_entropy):

        # with torch.no_grad():
        #     _, self.deterministic_a,self.log_pis, _ = self.actor_target(s)
        intermediate = (self.log_pis + target_entropy).detach()
        # self.a, self.deterministic_a, self.log_pis, _ = self.actor(s)
        # print(self.a)

        return -torch.mean(self.log_alpha * intermediate)

    def get_labda_loss(self, r, s, s_, a):
        # with torch.no_grad():
        #     l = self.critic(s, a)
        #     lya_a_, _, _, _ = self.actor_target(s_)
        #     self.l_ = self.critic_target(s_, lya_a_)
        l = self.l.detach()
        lyapunov_loss = torch.mean(self.l_ - l + self.alpha3 * r)
        return -torch.mean(self.log_labda * lyapunov_loss)

    def get_beta_loss(self, _s):
        with torch.no_grad():
            _, _deterministic_a, _, _ = self.actor_target(_s)
        self.l_action = torch.mean(
            torch.norm(_deterministic_a.detach() - self.deterministic_a,
                       dim=1))
        with torch.no_grad():
            intermediate = (self.l_action - 0.02).detach()
        return -torch.mean(self.log_beta * intermediate)

    def get_actor_loss(self, s, s_, a, r):
        if self._action_prior == 'normal':
            policy_prior = torch.distributions.MultivariateNormal(
                loc=torch.zeros(self.a_dim),
                covariance_matrix=torch.diag(torch.ones(self.a_dim)))
            policy_prior_log_probs = policy_prior.log_prob(self.a)
        elif self._action_prior == 'uniform':
            policy_prior_log_probs = 0.0

        # only actor weights are updated!
        _, self.deterministic_a, self.log_pis, _ = self.actor(s)
        # self.l = self.critic(s, a)
        with torch.no_grad():
            # self.l = self.critic(s, a)
            lya_a_, _, _, _ = self.actor(s_)
            self.l_ = self.critic(s_, lya_a_)
        l = self.l.detach()
        self.lyapunov_loss = torch.mean(self.l_ - l + self.alpha3 * r)
        labda = self.labda.detach()
        alpha = self.alpha.detach()
        a_loss = labda * self.lyapunov_loss + alpha * torch.mean(
            self.log_pis) - policy_prior_log_probs
        return a_loss

    def get_lyapunov_loss(self, s, s_, a, r, r_n_=None, v=None, terminal=0.):
        with torch.no_grad():
            a_, _, _, _ = self.actor_target(s_)
            l_ = self.critic_target(s_, a_)
        self.l = self.critic(s, a)
        if self.approx_value:
            if self.finite_horizon:
                if self.soft_predict_horizon:
                    l_target = r - r_n_ + l_
                else:
                    l_target = v
            else:
                l_target = r + self.gamma * (
                    1 - terminal
                ) * l_  # Lyapunov critic - self.alpha * next_log_pis
        else:
            l_target = r
        mse_loss = nn.MSELoss()
        l_loss = mse_loss(self.l, l_target)

        return l_loss

    def save_result(self, path):
        if not os.path.exists(path + "/policy/"):
            os.mkdir(path + "/policy/")
        self.actor_target.save(path + "/policy/actor_target.pth")
        self.critic_target.save(path + "/policy/critic_target.pth")
        self.actor.save(path + "/policy/actor.pth")
        self.critic.save(path + "/policy/critic.pth")
        print("Save to path: ", path + "/policy/")

    def restore(self, path):
        result_path = path
        if not os.path.exists(result_path):
            raise IOError("Results path ", result_path,
                          " does not contain anything to load")
        self.actor_target.load(result_path + "/actor_target.pth")
        self.critic_target.load(result_path + "/critic_target.pth")
        self.actor.load(result_path + "/actor.pth")
        self.critic.load(result_path + "/critic.pth")
        success_load = True
        print("Load successful, model file from ", result_path)
        print("#########################################################")
        return success_load

    def scheduler_step(self):
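        # NOTE: the *_scheduler attributes only exist if the commented-out
        # MultiplicativeLR lines in __init__ are re-enabled; otherwise calling
        # this method raises AttributeError.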
        self.alpha_scheduler.step()
        self.beta_scheduler.step()
        self.labda_scheduler.step()
        self.actor_scheduler.step()
        self.critic_scheduler.step()
Example #15
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    
    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
Example #16
                    if done:  # done and print information
                        record(self.g_ep, self.g_ep_r, ep_r, self.res_queue,
                               self.name)
                        break

                state = next_state
                total_step += 1


if __name__ == '__main__':

    # multiple copies of both actor and critic (one pair per worker)
    # updates sent to global model

    gnet = {'actor': Actor(state_size, action_size, random_seed).to(device), \
            'critic': Critic(state_size, action_size, random_seed).to(device) }

    opt = {}  # stores both shared optimizers for critic and actor networks
    LR_ACTOR = 1e-4
    LR_CRITIC = 1e-3

    print('Networks present are: ')
    for key, value in gnet.items():  # alternatively, if gnet is a class, use gnet.__dict__
        if isinstance(value, nn.Module):
            value.share_memory()
            print('Sharing in memory {}: '.format(key))
            if key == 'actor' or key == 'critic':
                opt[key + '_optimizer'] = SharedAdam(
                    value.parameters(),
Example #17
class DDPG():
    def __init__(self,
                 env,
                 action_dim,
                 state_dim,
                 device,
                 critic_lr=3e-4,
                 actor_lr=3e-4,
                 gamma=0.99,
                 batch_size=100,
                 validate_steps=100,
                 max_episode_length=150):
        """
        param: env: An gym environment
        param: action_dim: Size of action space
        param: state_dim: Size of state space
        param: critic_lr: Learning rate of the critic
        param: actor_lr: Learning rate of the actor
        param: gamma: The discount factor
        param: batch_size: The batch size for training
        param: device: The device used for training
        param: validate_steps: Number of iterations after which we evaluate trained policy 
        """
        self.gamma = gamma
        self.batch_size = batch_size
        self.env = env
        self.device = device
        self.eval_env = deepcopy(env)
        self.validate_steps = validate_steps
        self.max_episode_length = max_episode_length

        # actor and actor_target where both networks have the same initial weights
        self.actor = Actor(state_dim=state_dim,
                           action_dim=action_dim).to(self.device)
        self.actor_target = deepcopy(self.actor)

        # critic and critic_target where both networks have the same initial weights
        self.critic = Critic(state_dim=state_dim,
                             action_dim=action_dim).to(self.device)
        self.critic_target = deepcopy(self.critic)

        # Optimizer for the actor and critic
        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.optimizer_critic = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr)

        # Replay buffer
        self.ReplayBuffer = ReplayBuffer(buffer_size=10000, init_length=1000, state_dim=state_dim, \
                                         action_dim=action_dim, env=env, device = device)

    def update_target_networks(self):
        """
        A function to update the target networks
        """
        weighSync(self.actor_target, self.actor)
        weighSync(self.critic_target, self.critic)

    def update_network(self, batch):
        """
        A function to update the function just once
        """

        # Sample and parse batch
        state, action, reward, state_next, done = self.ReplayBuffer.batch_sample(
            batch)

        # Predicting the next action and q_value
        action_next = self.actor_target(state_next)
        q_next = self.critic_target(state_next, action_next)
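        # NOTE: this assumes the buffer returns `done` as a continuation mask
        # (1 for non-terminal, 0 for terminal); with a raw done flag the usual
        # DDPG target would bootstrap with (1 - done) instead.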
        target_q = reward + (self.gamma * done * q_next)

        q = self.critic(state, action)

        # Critic update
        self.critic.zero_grad()
        value_loss = F.mse_loss(q, target_q)
        value_loss.backward()
        self.optimizer_critic.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic(state, self.actor(state)).mean()
        policy_loss.backward()
        self.optimizer_actor.step()

        # Target update
        self.update_target_networks()
        return value_loss.item(), policy_loss.item()

    def select_action(self, state, isEval):

        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        action = self.actor(state).squeeze(0).detach()
        if isEval:
            return action.cpu().numpy()
        action += torch.normal(0, 0.1, size=action.shape).to(self.device)
        action = torch.clamp(action, -1., 1.).cpu().numpy()
        return action

    def train(self, num_steps):
        """
        Train the policy for the given number of iterations
        :param num_steps:The number of steps to train the policy for
        """
        value_losses, policy_losses, validation_reward, validation_steps = [],[],[],[]

        step, episode, episode_steps, episode_reward, state = 0, 0, 0, 0., None

        while step < num_steps:
            # reset if it is the start of episode
            if state is None:
                state = deepcopy(self.env.reset())

            action = self.select_action(state, False)
            # env response with next_state, reward, terminate_info
            state_next, reward, done, _ = self.env.step(action)
            state_next = deepcopy(state_next)

            if self.max_episode_length and episode_steps >= self.max_episode_length - 1:
                done = True

            # observe and store in replay buffer
            self.ReplayBuffer.buffer_add(
                Exp(state=state,
                    action=action,
                    reward=reward,
                    state_next=state_next,
                    done=done))

            # update policy based on sampled batch
            batch = self.ReplayBuffer.buffer_sample(self.batch_size)
            value_loss, policy_loss = self.update_network(batch)
            value_losses.append(value_loss)
            policy_losses.append(policy_loss)

            # evaluate
            if step % self.validate_steps == 0:
                validate_reward, steps = self.evaluate()
                validation_reward.append(validate_reward)
                validation_steps.append(steps)
                print(
                    "[Eval {:06d}/{:06d}] Steps: {:06d}, Avg Step Reward: {:.4f}"
                    .format(step, int(num_steps), steps, validate_reward))

            # update
            step += 1
            episode_steps += 1
            episode_reward += reward
            state = deepcopy(state_next)

            if done:  # reset at the end of episode
                #print("[Train {:06d}/{:06d}] - Episode Reward:{:04f} ".format(step, num_steps, step, episode_reward))
                episode_steps, episode_reward, state = 0, 0., None
                episode += 1

        return value_losses, policy_losses, validation_reward, validation_steps

    def evaluate(self):
        """
        Evaluate the policy trained so far in an evaluation environment
        """
        state, done, total_reward, steps = self.eval_env.reset(), False, 0., 0

        while not done:
            action = self.select_action(state, True)
            state_next, reward, done, _ = self.eval_env.step(action)
            total_reward += reward
            steps += 1
            state = state_next
        return total_reward / steps, steps
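
The trainer above delegates its Polyak averaging to a weighSync helper defined elsewhere. A minimal sketch of what such a helper could look like, assuming a default smoothing factor tau (the value 0.001 is illustrative, not from the source):

def weighSync(target_model, source_model, tau=0.001):
    """Polyak-average the source parameters into the target network, in place."""
    for target_param, source_param in zip(target_model.parameters(),
                                          source_model.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)
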
Example #19
0
class DDPG():
    def __init__(self,
                 state_size,
                 action_size,
                 CER=False,
                 num_agents=1,
                 idx=0,
                 random_seed=23,
                 fc1_units=96,
                 fc2_units=96,
                 epsilon=1.0,
                 lr_actor=1e-3,
                 lr_critic=1e-3,
                 weight_decay=0):
        self.state_size = state_size
        self.action_size = action_size
        self.CER = CER
        self.EXPmemory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                      random_seed)
        self.CERmem = ReplayBuffer(action_size, CER_SIZE, BATCH_SIZE,
                                   random_seed)
        self.random_seed = random_seed
        self.fc1_units = fc1_units
        self.fc2_units = fc2_units
        self.state_size = state_size
        self.action_size = action_size
        if (torch.cuda.is_available()):
            self.idx = torch.cuda.LongTensor([idx])
        else:
            self.idx = torch.LongTensor([idx])
        self.num_agents = num_agents
        self.epsilon = epsilon
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay
        self.noise = OUNoise(action_size, random_seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        random.seed(random_seed)

        #### The actor only sees its own state
        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 self.random_seed,
                                 fc1_units=self.fc1_units,
                                 fc2_units=self.fc2_units).to(device)
        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  self.random_seed,
                                  fc1_units=self.fc1_units,
                                  fc2_units=self.fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(num_agents * state_size,
                                   num_agents * action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(num_agents * state_size,
                                    num_agents * action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay)

        # Initialize target and local being the same
        self.hard_copy(self.actor_target, self.actor_local)
        self.hard_copy(self.critic_target, self.critic_local)

    def step(self, states, actions, rewards, next_states, dones):
        # Save experience in replay memory
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.EXPmemory.add(state, action, reward, next_state, done)
            if (self.CER):
                self.CERmem.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn.
            # Note: learn() below also expects the would-be actions of all agents
            # (see its signature); a multi-agent wrapper normally supplies them.
            if len(self.EXPmemory) > BATCH_SIZE:
                for _ in range(NUM_UPDATES):
                    experiences = self.EXPmemory.sample()
                    self.learn(experiences, GAMMA)
                if (self.CER):
                    for _ in range(5):
                        experiences = self.CERmem.sample()
                        self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        if (not torch.is_tensor(state)):
            state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = (self.actor_local(state).cpu().data.numpy())
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample() * self.epsilon
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    # DDPG learns from its own experience plus the would-be actions / next-actions of the other agents (MADDPG-style)
    def learn(self,
              experiences,
              wouldbe_actions,
              wouldbe_next_actions,
              gamma,
              tau=1e-3,
              epsilon_decay=1e-6):
        states, actions, rewards, next_states, dones = experiences
        # Get predicted next-state actions and Q values from target models
        next_actions = torch.cat(wouldbe_next_actions, dim=1).to(device)
        with torch.no_grad():
            Q_targets_next = self.critic_target(next_states, next_actions)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards.index_select(
            1, self.idx) + (gamma * Q_targets_next *
                            (1 - dones.index_select(1, self.idx)))
        Q_expected = self.critic_local(states, actions)
        # Critic update
        critic_loss = F.mse_loss(Q_expected, Q_targets.detach())
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # Actor update: actions from the other agents must be detached so they are
        # excluded from the optimization of the local network.
        # wouldbe_actions were already computed by each agent's local actor one level up.
        actions_pred = [
            a if i == self.idx else a.detach()
            for i, a in enumerate(wouldbe_actions)
        ]
        actions_pred = torch.cat(actions_pred, dim=1).to(device)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Targets update
        self.soft_update(self.critic_local, self.critic_target, tau)
        self.soft_update(self.actor_local, self.actor_target, tau)

        #  Noise update
        self.epsilon -= epsilon_decay
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_copy(self, target, source):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)

    def save(self):
        torch.save(self.actor_local.state_dict(),
                   '_actor' + str(self.idx.item()) + '.pth')
        torch.save(self.critic_local.state_dict(),
                   '_critic' + str(self.idx.item()) + '.pth')

    def load(self):
        self.critic_local.load_state_dict(
            torch.load('_critic' + str(self.idx.item()) + '.pth'))
        self.actor_local.load_state_dict(
            torch.load('_actor' + str(self.idx.item()) + '.pth'))
        self.critic_target.load_state_dict(
            torch.load('_critic' + str(self.idx.item()) + '.pth'))
        self.actor_target.load_state_dict(
            torch.load('_actor' + str(self.idx.item()) + '.pth'))
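
learn() above expects the would-be actions and next-actions of every agent, which this class never builds itself; a multi-agent wrapper presumably does. A minimal sketch of such a wrapper-side call, where the function name, the tensor layout (states and next_states concatenated across agents, rewards and dones with one column per agent), and the per-agent slicing are all assumptions rather than code from the source:

def maddpg_learn(agents, experiences_per_agent, gamma=0.99):
    # experiences_per_agent[i] = (states, actions, rewards, next_states, dones),
    # where states / next_states are the concatenated observations of all agents
    # with shape (batch, num_agents * state_size), actions are the concatenated
    # joint actions, and rewards / dones have one column per agent.
    for agent in agents:
        states, actions, rewards, next_states, dones = \
            experiences_per_agent[int(agent.idx.item())]
        state_size = agent.state_size
        # Would-be actions: each agent re-evaluates its own slice of the joint state
        # with its local actor; would-be next actions come from the target actors.
        wouldbe_actions = [
            a.actor_local(states[:, j * state_size:(j + 1) * state_size])
            for j, a in enumerate(agents)
        ]
        wouldbe_next_actions = [
            a.actor_target(next_states[:, j * state_size:(j + 1) * state_size])
            for j, a in enumerate(agents)
        ]
        agent.learn((states, actions, rewards, next_states, dones),
                    wouldbe_actions, wouldbe_next_actions, gamma)
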
Example #20
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, config, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.config = config
        self.device = config['device']

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.noise_epsilon = config['NOISE_EPSILON']

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(self.device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config['LR_ACTOR'])
        self.hard_update(self.actor_local, self.actor_target)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(self.device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=config['LR_CRITIC'],
                                           weight_decay=config['WEIGHT_DECAY'])
        self.hard_update(self.critic_local, self.critic_target)

        # Noise process
        self.noise = OUNoise((1, action_size), random_seed, 0.0,
                             config['OU_THETA'], config['OU_SIGMA'])
        self.noise_epsilon = config['NOISE_EPSILON']

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.config, random_seed)

    def step(self, t, state, action, reward, next_state, done, agent_index):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.memory.add(state, action, reward, next_state, done)

        if t % self.config['DDPG_UPDATE_EVERY'] == 0 and len(
                self.memory) > self.config['BATCH_SIZE']:
            for _ in range(self.config['DDPG_LEARN_TIMES']):
                experiences = self.memory.sample()
                self.learn(experiences, agent_index)

    def act(self, states):
        states = torch.from_numpy(states).float().to(self.device)
        actions = np.zeros((1, self.action_size))

        self.actor_local.eval()

        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action

        self.actor_local.train()
        actions += self.noise_epsilon * self.noise.sample()

        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, agent_index):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        gamma = self.config['GAMMA']
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models.
        # Only this agent's slice of the joint action is replaced; the layout
        # assumes two action dimensions per agent.
        actions_next = self.actor_target(next_states)
        if agent_index == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        if agent_index == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

        # ---------------------------- update noise ---------------------------- #
        self.noise_epsilon = max(
            self.noise_epsilon - self.config['NOISE_EPSILON_DECAY'],
            self.config['NOISE_EPSILON_MIN'])
        self.noise.reset()

    def hard_update(self, local_model, target_model):
        """Hard update model parameters.
        θ_target = θ_local
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        tau = self.config['TAU']
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
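
Several agents in this collection draw exploration noise from an OUNoise class whose definition is not included. A minimal Ornstein-Uhlenbeck sketch compatible with the OUNoise(size, seed, mu, theta, sigma) constructors used above; the default mu, theta, and sigma values are illustrative assumptions:

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: dx = theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the long-run mean."""
        self.state = self.mu.copy()

    def sample(self):
        """Advance the process one step and return the new noise value."""
        dx = self.theta * (self.mu - self.state) + \
             self.sigma * self.rng.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state
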
Example #21
0
    def __init__(self,
                 a_dim,
                 s_dim,
                 variant,
                 action_prior='uniform',
                 max_global_steps=100000):
        """
        a_dim : dimension of action space
        s_dim: state space dimension
        variant: dictionary containing parameters for the algorithms
        """
        ###############################  Model parameters  ####################################
        set_seed(variant['seed'])
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.actor = Actor(input_dim=s_dim,
                           output_dim=a_dim,
                           n_layers=3,
                           layer_sizes=[256, 256, 256],
                           hidden_activation="leakyrelu").to(self.device)
        self.actor_target = Actor(input_dim=s_dim,
                                  output_dim=a_dim,
                                  n_layers=3,
                                  layer_sizes=[256, 256, 256],
                                  hidden_activation="leakyrelu").to(
                                      self.device).eval()
        self.critic = LyapunovCritic(state_dim=s_dim,
                                     action_dim=a_dim,
                                     output_dim=None,
                                     n_layers=2,
                                     layer_sizes=[256, 256],
                                     hidden_activation="leakyrelu").to(
                                         self.device)
        self.critic_target = LyapunovCritic(state_dim=s_dim,
                                            action_dim=a_dim,
                                            output_dim=None,
                                            n_layers=2,
                                            layer_sizes=[256, 256],
                                            hidden_activation="leakyrelu").to(
                                                self.device).eval()

        # copy parameters of the learning network to the target network
        hard_update(self.critic_target, self.critic)
        hard_update(self.actor_target, self.actor)
        # disable gradient calculations of the target network
        stop_grad(self.critic_target)
        stop_grad(self.actor_target)
        # self.memory_capacity = variant['memory_capacity']

        ################################ parameters for training ###############################
        self.batch_size = variant[
            'batch_size']  # batch size for learning the actor
        self.gamma = variant['gamma']  # discount factor
        self.tau = variant['tau']  # smoothing parameter for the weight updates
        self.approx_value = True if 'approx_value' not in variant.keys(
        ) else variant['approx_value']
        self._action_prior = action_prior  # prior over action space
        s_dim = s_dim * (variant['history_horizon'] + 1)
        self.a_dim, self.s_dim, = a_dim, s_dim
        self.history_horizon = variant[
            'history_horizon']  # horizon to consider for the history
        self.working_memory = deque(maxlen=variant['history_horizon'] +
                                    1)  # memory to store history
        target_entropy = variant['target_entropy']
        if target_entropy is None:
            self.target_entropy = -self.a_dim  #lower bound of the policy entropy
        else:
            self.target_entropy = target_entropy
        self.target_variance = 0.0
        self.finite_horizon = variant['finite_horizon']
        self.soft_predict_horizon = variant['soft_predict_horizon']
        self.use_lyapunov = variant['use_lyapunov']
        self.adaptive_alpha = variant['adaptive_alpha']
        self.adaptive_beta = variant[
            'adaptive_beta'] if 'adaptive_beta' in variant.keys() else False
        self.time_near = variant['Time_near']
        self.max_global_steps = max_global_steps
        self.LR_A = variant['lr_a']
        self.LR_L = variant['lr_l']
        self.LR_lag = self.LR_A / 10
        self.alpha3 = variant['alpha3']

        labda = variant['labda']  # formula (12) in the paper
        alpha = variant['alpha']  # entropy temperature (beta in the paper)
        beta = variant['beta']  # constraint error weight

        self.log_labda = torch.log(torch.tensor([labda], device=self.device))
        self.log_alpha = torch.log(torch.tensor(
            [alpha], device=self.device))  # Entropy Temperature
        self.log_beta = torch.log(torch.tensor([beta], device=self.device))
        self.log_alpha.requires_grad = True
        self.log_beta.requires_grad = True
        self.log_labda.requires_grad = True
        # The update is in log space
        self.labda = torch.clamp(torch.exp(self.log_labda),
                                 min=SCALE_lambda_MIN_MAX[0],
                                 max=SCALE_lambda_MIN_MAX[1])
        self.alpha = torch.exp(self.log_alpha)
        self.beta = torch.clamp(torch.exp(self.log_beta),
                                min=SCALE_beta_MIN_MAX[0],
                                max=SCALE_beta_MIN_MAX[1])

        self.actor_optim = torch.optim.Adam(self.actor.parameters(),
                                            lr=self.LR_A)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(),
                                             lr=self.LR_L)
        self.alpha_optim = torch.optim.Adam([self.log_alpha], lr=self.LR_A)
        self.labda_optim = torch.optim.Adam([self.log_labda], lr=self.LR_lag)
        self.beta_optim = torch.optim.Adam([self.log_beta], lr=0.01)

        # step_fn = lambda i : 1.0 - (i - 1.)/self.max_global_steps
        # self.actor_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.actor_optim, lr_lambda = step_fn)
        # self.critic_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.critic_optim, lr_lambda = step_fn)
        # self.alpha_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.alpha_optim, lr_lambda = step_fn)
        # self.labda_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.labda_optim, lr_lambda = step_fn)
        # self.beta_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.beta_optim, lr_lambda = step_fn)

        self.actor.float()
        self.critic.float()
def actor():
    return Actor.all()
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size * num_agents,
                                   action_size * num_agents,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size * num_agents,
                                    action_size * num_agents,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)
        self.noise_reduction_ratio = NOISE_START

        self.step_count = 0

    def act(self, state, i_episode, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            if i_episode > EPISODES_BEFORE_TRAINING and self.noise_reduction_ratio > NOISE_END:
                self.noise_reduction_ratio = NOISE_REDUCTION_RATE**(
                    i_episode - EPISODES_BEFORE_TRAINING)
#             noise_reduction_ratio = 1
            action += self.noise_reduction_ratio * self.add_noise2()
#             action += noise_reduction_ratio * self.noise.sample()
        return np.clip(action, -1, 1)

    def add_noise2(self):
        #         noise = 0.5*np.random.randn(1,self.action_size) #sigma of 0.5 as sigma of 1 will have alot of actions just clipped
        noise = 0.5 * np.random.standard_normal(self.action_size)
        return noise

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        full_states, actions, actor_local_actions, actor_target_actions, agent_state, agent_action, agent_reward, agent_done, next_states, next_full_states = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        #         actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_full_states,
                                            actor_target_actions)
        # Compute Q targets for current states (y_i)
        Q_targets = agent_reward + (gamma * Q_targets_next * (1 - agent_done))
        # Compute critic loss
        Q_expected = self.critic_local(full_states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        #         torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        #         actions_pred = self.actor_local(agent_state)
        actor_loss = -self.critic_local(full_states,
                                        actor_local_actions).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #


#         self.soft_update(self.critic_local, self.critic_target, TAU)
#         self.soft_update(self.actor_local, self.actor_target, TAU)

    def hard_copy_weights(self, target, source):
        """ copy weights from source to target network (part of initialization)"""
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, lr_actor, lr_critic, batch_size, buffer_size, noise_decay, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.seed = random.seed(random_seed)
        
        if torch.cuda.is_available():
            print("--- Using GPU ---")
        else:
            print("--- Using CPU ---")
         
        # Actor Network (w/ Target Network)
        
        fc1 = 128
        fc2 = 64       
        fc3 = 32       
        self.actor_local = Actor(state_size, action_size, random_seed, fc1_units=fc1, fc2_units=fc2, fc3_units=fc3).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed*2, fc1_units=fc1, fc2_units=fc2, fc3_units=fc3).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network (w/ Target Network)
        
        fc1 = 128
        fc2 = 64       
        fc3 = 32       
        self.critic_local  = Critic(state_size, action_size, random_seed*3, fcs1_units=fc1, fc2_units=fc2, fc3_units=fc3).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed*4, fcs1_units=fc1, fc2_units=fc2, fc3_units=fc3).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)
        
        # Load the networks from a previous simulation, if any exist
        self.load_graphs()
        
        # Noise process
        self.noise = OUNoise(action_size, random_seed)
        
        # Noise amplitude
        self.noise_amplitude = 1.0
        
        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed)
            
    def step(self, states, actions, rewards, next_states, dones, gamma, tau):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)
            
        # Learn, if enough samples are available in memory        
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, gamma, tau)

    def act(self, state, add_noise=True, noise_decay=1):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()*self.noise_amplitude
            self.noise_amplitude *= noise_decay
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()                               
        
    def learn(self, experiences, gamma, tau):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, tau)
        self.soft_update(self.actor_local, self.actor_target, tau)   
      
    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
            
    def save_graphs(self):
        """
            Save the graphs in the same directory as the notebook.
        """
        torch.save(self.actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(self.critic_local.state_dict(), 'checkpoint_critic.pth')

            
    def load_graphs(self):
        """
            Load the saved networks if checkpoints exist in the same directory as the notebook.
        """
        if (os.path.isfile('checkpoint_actor.pth') and os.path.isfile('checkpoint_critic.pth')):
            self.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
            self.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))
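
Most of these agents depend on a ReplayBuffer(action_size, buffer_size, batch_size, seed) with add, sample, and __len__ that is not shown. A minimal sketch of a compatible buffer that returns batched tensors; the module-level device handling is an assumption:

import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples random batches."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)
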
Example #25
0
class DDPG(object):
    def __init__(self, memory, nb_status, nb_actions, action_noise=None,
                 gamma=0.99, tau=0.001, normalize_observations=True,
                 batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.),
                 actor_lr=1e-4, critic_lr=1e-3):
        self.nb_status = nb_status
        self.nb_actions = nb_actions
        self.action_range = action_range
        self.observation_range = observation_range
        self.normalize_observations = normalize_observations

        self.actor = Actor(self.nb_status, self.nb_actions)
        self.actor_target = Actor(self.nb_status, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)

        self.critic = Critic(self.nb_status, self.nb_actions)
        self.critic_target = Critic(self.nb_status, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

        # Create replay buffer
        self.memory = memory  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.action_noise = action_noise

        # Hyper-parameters
        self.batch_size = batch_size
        self.tau = tau
        self.discount = gamma

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd()
        else:
            self.obs_rms = None

    def pi(self, obs, apply_noise=True, compute_Q=True):
        obs = np.array([obs])
        action = to_numpy(self.actor(to_tensor(obs))).squeeze(0)
        if compute_Q:
            q = self.critic([to_tensor(obs), to_tensor(action)]).cpu().data
        else:
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise

        action = np.clip(action, self.action_range[0], self.action_range[1])
        # Only index into q when it was actually computed
        return action, q[0][0] if q is not None else None

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        # Compute the bootstrap target without tracking gradients
        # (replaces the pre-0.4 `volatile=True` flag with torch.no_grad()).
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(batch['obs1']),
                self.actor_target(to_tensor(batch['obs1']))])

        target_q_batch = to_tensor(batch['rewards']) + \
                         self.discount * to_tensor(1 - batch['terminals1'].astype('float32')) * next_q_values

        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(batch['obs0']), to_tensor(batch['actions'])])
        value_loss = criterion(q_batch, target_q_batch)  # criterion: an MSE loss assumed to be defined at module level
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()
        policy_loss = -self.critic([to_tensor(batch['obs0']), self.actor(to_tensor(batch['obs0']))]).mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return value_loss.item(), policy_loss.item()

    def initialize(self):
        hard_update(self.actor_target, self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

    def update_target_net(self):
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def reset(self):
        if self.action_noise is not None:
            self.action_noise.reset()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()
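
Example #25 relies on to_tensor and to_numpy conversion helpers that are not shown. A minimal sketch of compatible helpers; the CUDA handling and the requires_grad flag are assumptions:

import numpy as np
import torch

USE_CUDA = torch.cuda.is_available()

def to_tensor(ndarray, requires_grad=False):
    """Convert a numpy array to a float tensor on the selected device."""
    tensor = torch.from_numpy(np.asarray(ndarray, dtype=np.float32))
    if USE_CUDA:
        tensor = tensor.cuda()
    return tensor.requires_grad_(requires_grad)

def to_numpy(tensor):
    """Move a tensor to the CPU and convert it to a numpy array."""
    return tensor.detach().cpu().numpy()
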
class Agent():
    """Interacts with and learns from the environment"""
    
    def __init__(self, state_size, action_size, random_seed=0):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        
        self.epsilon = EPSILON
        
        # Actor network
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
        #self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR, weight_decay=WEIGHT_DECAY)
        
        # Critic network
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        #self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)
        
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        
        
    def step(self, state, action, reward, next_state, done, timestep):
        """Save experience in replay memory, and use random sample from buffer to learn"""
        # save experience/reward
        # if updating in batches, then add the last memory of the agents(e.g. 20 agents) to a buffer
        #  and if we've met batch size, push to learn in multiples of LEARN_NUM
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestep % LEARN_EVERY == 0:
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)
                
    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy"""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        
        if add_noise:
            action += self.epsilon * self.noise.sample()
            
        return np.clip(action, -1, 1)
    
    def reset(self):
        """Reset the noise"""
        self.noise.reset()
        
    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done)
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        
        # Update critic
        # get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # compute Q targets for current states(y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # gradient clipping for critic
        if GRAD_CLIPPING > 0:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), GRAD_CLIPPING)
        self.critic_optimizer.step()
        
        # update actor
        # compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        
        # update target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)
        # update epsilon decay
        #if EPSILON_DECAY > 0:
        #    self.epsilon -= EPSILON_DECAY
        #    self.noise.reset()

        self.epsilon -= EPSILON_DECAY
        self.noise.reset()

        if EPSILON_DECAY <= 0:
            self.epsilon = 0
            self.noise.reset()
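
Like several other snippets in this collection, the class above references module-level hyperparameter constants that live outside the snippet. The values below are illustrative assumptions, not taken from the source:

BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # L2 weight decay for the critic optimizer
LEARN_EVERY = 20         # learning interval (time steps)
LEARN_NUM = 10           # gradient updates per learning step
GRAD_CLIPPING = 1.0      # critic gradient clipping norm (0 disables it)
EPSILON = 1.0            # initial scale of the exploration noise
EPSILON_DECAY = 1e-6     # per-update decay of the noise scale
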
Example #27
0
def training(file_name):
    # Create folders.
    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)
    if not os.path.isdir(CSV_DIR):
        os.makedirs(CSV_DIR)
    if not os.path.isdir(FIGURE_TRAINING_DIR):
        os.makedirs(FIGURE_TRAINING_DIR)

    # Load models.
    actor = Actor(name="actor")
    actor_target = Actor(name="actor_target")
    actor_initial_update_op = target_update_op(
        actor.trainable_variables, actor_target.trainable_variables, 1.0)
    actor_target_update_op = target_update_op(actor.trainable_variables,
                                              actor_target.trainable_variables,
                                              TARGET_UPDATE_RATE)

    critic = Critic(name="critic")
    critic.build_training()
    critic_target = Critic(name="critic_target")
    critic_initial_update_op = target_update_op(
        critic.trainable_variables, critic_target.trainable_variables, 1.0)
    critic_target_update_op = target_update_op(
        critic.trainable_variables, critic_target.trainable_variables,
        TARGET_UPDATE_RATE)

    critic_with_actor = Critic(name="critic", A=actor.pi)
    actor.build_training(critic_with_actor.actor_loss)

    env = PendulumEnv()
    replay_buffer = ReplayBuffer(BUFFER_SIZE)
    action_noise = OUActionNoise(np.zeros(A_LENGTH))

    with tf.Session() as sess:
        # Initialize actor and critic networks.
        sess.run(tf.global_variables_initializer())
        sess.run([actor_initial_update_op, critic_initial_update_op])

        list_final_reward = []

        additional_episode = int(np.ceil(MIN_BUFFER_SIZE / MAX_FRAME))
        for episode in range(-additional_episode, MAX_EPISODE):
            list_actor_loss = []
            list_critic_loss = []

            # Reset the environment and noise.
            s = env.reset()
            action_noise.reset()

            for step in range(MAX_FRAME):
                env.render()

                # Get action.
                a = sess.run(actor.pi,
                             feed_dict={actor.S: np.reshape(s, (1, -1))})
                noise = action_noise.get_noise()
                a = a[0] + ACTION_SCALING * noise
                a = np.clip(a, -ACTION_SCALING, ACTION_SCALING)

                # Interact with the game engine.
                s1, r, _, _ = env.step(a)

                # Add data to the replay buffer.
                data = [s, a, [r], s1]
                replay_buffer.append(data)

                if episode >= 0:
                    for _ in range(BATCHES_PER_STEP):
                        # Sample data from the replay buffer.
                        batch_data = replay_buffer.sample(BATCH_SIZE)
                        batch_s, batch_a, batch_r, batch_s1 = [
                            np.array(
                                [batch_data[j][i] for j in range(BATCH_SIZE)])
                            for i in range(len(batch_data[0]))
                        ]

                        # Compute the next action.
                        a1 = sess.run(actor_target.pi,
                                      feed_dict={actor_target.S: batch_s1})

                        # Compute the target Q.
                        q1 = sess.run(critic_target.q,
                                      feed_dict={
                                          critic_target.S: batch_s1,
                                          critic_target.A: a1
                                      })
                        q_target = batch_r + DISCOUNT * q1

                        # Update actor and critic.
                        _, _, actor_loss, critic_loss = sess.run(
                            [
                                actor.train_op, critic.train_op,
                                actor.actor_loss, critic.critic_loss
                            ],
                            feed_dict={
                                actor.S: batch_s,
                                critic_with_actor.S: batch_s,
                                actor.LR: LR_ACTOR,
                                critic.S: batch_s,
                                critic.A: batch_a,
                                critic.QTarget: q_target,
                                critic.LR: LR_CRITIC
                            })
                        list_actor_loss.append(actor_loss)
                        list_critic_loss.append(critic_loss)

                        # Update target networks.
                        sess.run(
                            [actor_target_update_op, critic_target_update_op])

                s = s1

            # Postprocessing after each episode.
            if episode >= 0:
                list_final_reward.append(r)
                avg_actor_loss = np.mean(list_actor_loss)
                avg_critic_loss = np.mean(list_critic_loss)
                print("Episode ", format(episode, "03d"), ":", sep="")
                print("  Final Reward = ",
                      format(r, ".6f"),
                      ", Actor Loss = ",
                      format(avg_actor_loss, ".6f"),
                      ", Critic Loss = ",
                      format(avg_critic_loss, ".6f"),
                      sep="")

        # Testing.
        avg_reward = 0
        for i in range(TEST_EPISODE):
            # Reset the environment and noise.
            s = env.reset()
            action_noise.reset()

            for step in range(MAX_FRAME):
                env.render()

                # Get action.
                a = sess.run(actor.pi,
                             feed_dict={actor.S: np.reshape(s, (1, -1))})
                a = a[0]

                # Interact with the game engine.
                s, r, _, _ = env.step(a)

            # Postprocessing after each episode.
            avg_reward += r
        avg_reward /= TEST_EPISODE

        # Save the parameters.
        saver = tf.train.Saver(
            [*actor.trainable_variables, *critic.trainable_variables])
        saver.save(sess, SAVE_DIR + file_name)
    tf.contrib.keras.backend.clear_session()
    env.close()

    # Store data in the csv file.
    with open(CSV_DIR + file_name + ".csv", "w") as f:
        fieldnames = ["Episode", "Final Reward", "Average Reward"]
        writer = csv.DictWriter(f, fieldnames=fieldnames, lineterminator="\n")
        writer.writeheader()
        for episode in range(MAX_EPISODE):
            content = {
                "Episode": episode,
                "Final Reward": list_final_reward[episode]
            }
            if episode == MAX_EPISODE - 1:
                content.update({"Average Reward": avg_reward})
            writer.writerow(content)

    # Plot the training process.
    list_episode = list(range(MAX_EPISODE))
    f, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
    ax.plot(list_episode, list_final_reward, "r-", label="Final Reward")
    ax.plot([MAX_EPISODE - 1], [avg_reward], "b.", label="Average Reward")
    ax.set_title("Final Reward")
    ax.set_xlabel("Episode")
    ax.set_ylabel("Reward")
    ax.legend(loc="lower right")
    ax.grid()

    f.savefig(FIGURE_TRAINING_DIR + file_name + ".png")
    plt.close(f)
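
The TensorFlow example above calls a target_update_op(source_vars, target_vars, rate) helper that is not shown. A minimal TensorFlow 1.x sketch of what it plausibly does (an assumption about the helper, not its actual definition); a rate of 1.0 reproduces the hard initial copy:

import tensorflow as tf  # TensorFlow 1.x style, matching the example above

def target_update_op(source_vars, target_vars, rate):
    """Return an op that Polyak-averages source variables into the target variables."""
    return tf.group(*[
        tf.assign(target, rate * source + (1.0 - rate) * target)
        for source, target in zip(source_vars, target_vars)
    ])
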
Example #28
0
class Agent:
    """ The reinforcement learning agent.  """
    
    def __init__(self, state_size: int, action_size: int, n_agents: int, seed: int) -> None:
        """Initializes an Agent object.
         Args:
                state_size (int): The dimension of the state vector.
                action_size (int): The dimension of the action vector.
                n_agents (int): The number of agents.
                seed (int): The initialization value for the random number generator.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.n_agents = n_agents
        self.seed = random.seed(seed)
        
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # An Ornstein Uhlenbeck process is used to generate noise.
        self.noise = OrnsteinUhlenbeckNoise(action_size, seed)
        
        # Replay buffer
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    
    def step(self, state: torch.Tensor, action: torch.Tensor, reward: torch.Tensor,
             next_state: torch.Tensor, done: torch.Tensor) -> None:
        """
            Save the experience within the ReplayBuffer.
                Args:
                    state (torch.Tensor): A state vector.
                    action (torch.Tensor): An action vector.
                    reward (torch.Tensor): A reward vector.
                    next_state (torch.Tensor): A vector containing the states following the given states.
                    done (torch.Tensor): A vector containing done flags.
        """
        for i in range(self.n_agents):
            self.memory.add(state[i,:], action[i,:], reward[i], next_state[i,:], done[i])
            
        """ In case there are enough experiences within ReplayBuffer, start learning. """      
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state: torch.Tensor, add_noise: bool = True):
        """
            Using the actor network the method return a vector of actions given the state vector
            using the current policy.
                Args:
                    state (torch.Tensor): A state vector.
                    add_noise (bool): A flag indicating the use of noise.
                Returns:
                    A vector of actions.
        """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:  # Add Ornstein Uhlenbeck noise.
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self) -> None:
        """ Reset the Ornstein Uhlenbeck process. """
        self.noise.reset()
        
    def learn(self, experiences: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
              gamma: float) -> None:
        """
            Update policy and value parameters using given batch of experience tuples.
            Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
            where:
                actor_target(state) -> action
                critic_target(state, action) -> Q-value
            Args:
                experiences (Tuple[torch.Tensor]): Tuple of (s, a, r, s', done) tuples
                gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))
        # Retrieve the predicted q value
        q_expected = self.critic_local(states, actions)
        # Compute the loss as the mean squared error between the expected and target Q-values.
        critic_loss = f.mse_loss(q_expected, q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)                     

    @staticmethod
    def soft_update(local_model, target_model, tau: float) -> None:
        """
           Update the model parameters according to this formula:
           θ_target = τ*θ_local + (1 - τ)*θ_target

           Args:
               local_model (PyTorch model): weights will be copied from this model
               target_model (PyTorch model): weights will be copied to this model
               tau (float): interpolation parameter, tau = 1 results in complete overwrite
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
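
Example #28 defines the agent but not the loop that drives it. A minimal, hypothetical episode loop under the same conventions (an env object with a Gym-like reset()/step() API returning per-agent arrays, and made-up n_episodes/max_t values) might look like this sketch:

import numpy as np

def train(agent: Agent, env, n_episodes: int = 200, max_t: int = 1000):
    """Hypothetical driver loop tying together Agent.reset/act/step."""
    scores = []
    for episode in range(1, n_episodes + 1):
        states = env.reset()                        # shape: (n_agents, state_size)
        agent.reset()                               # reset the OU noise process
        episode_scores = np.zeros(agent.n_agents)
        for _ in range(max_t):
            actions = agent.act(states)             # noisy actions clipped to [-1, 1]
            next_states, rewards, dones, _ = env.step(actions)
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            episode_scores += rewards
            if np.any(dones):
                break
        scores.append(episode_scores.mean())
    return scores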
Example #29
    def __init__(self,
                 device,
                 state_size,
                 n_agents,
                 action_size,
                 random_seed,
                 buffer_size,
                 batch_size,
                 gamma,
                 TAU,
                 lr_actor,
                 lr_critic,
                 weight_decay,
                 checkpoint_folder='./'):

        self.DEVICE = device

        self.state_size = state_size
        self.n_agents = n_agents
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Hyperparameters
        self.BUFFER_SIZE = buffer_size
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.TAU = TAU
        self.LR_ACTOR = lr_actor
        self.LR_CRITIC = lr_critic
        self.WEIGHT_DECAY = weight_decay

        self.CHECKPOINT_FOLDER = checkpoint_folder

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(self.DEVICE)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(self.DEVICE)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(self.DEVICE)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(self.DEVICE)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.LR_CRITIC,
                                           weight_decay=self.WEIGHT_DECAY)
        '''
        if os.path.isfile(self.CHECKPOINT_FOLDER + 'checkpoint_actor.pth') and os.path.isfile(self.CHECKPOINT_FOLDER + 'checkpoint_critic.pth'):
            self.actor_local.load_state_dict(torch.load(self.CHECKPOINT_FOLDER + 'checkpoint_actor.pth'))
            self.actor_target.load_state_dict(torch.load(self.CHECKPOINT_FOLDER + 'checkpoint_actor.pth'))

            self.critic_local.load_state_dict(torch.load(self.CHECKPOINT_FOLDER + 'checkpoint_critic.pth'))
            self.critic_target.load_state_dict(torch.load(self.CHECKPOINT_FOLDER + 'checkpoint_critic.pth'))
        '''
        # Noise process
        self.noise = OUNoise((n_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(device, action_size, self.BUFFER_SIZE,
                                   self.BATCH_SIZE, random_seed)
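
Example #29 shows only the constructor; the commented-out block above suggests the project also persists weights between runs. A hypothetical companion helper that would produce the files that block expects (the name `checkpoint` is an assumption, not part of the original code) could be:

import torch

def checkpoint(agent):
    """Hypothetical helper writing the files the commented-out loading block
    in the constructor above expects to find on the next run."""
    torch.save(agent.actor_local.state_dict(),
               agent.CHECKPOINT_FOLDER + 'checkpoint_actor.pth')
    torch.save(agent.critic_local.state_dict(),
               agent.CHECKPOINT_FOLDER + 'checkpoint_critic.pth')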
Example #30
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(
            self,
            state_size=None,  # state space size
            action_size=None,  # action size
            memory=None,
            buffer_size=BUFFER_SIZE,  # replay buffer size
            batch_size=BATCH_SIZE,  # minibatch size
            gamma=GAMMA,  # discount factor
            tau=TAU,  # for soft update of target parameters
            lr_actor=LR_ACTOR,  # learning rate of the actor 
            lr_critic=LR_CRITIC,  # learning rate of the critic
            weight_decay=WEIGHT_DECAY,  # L2 weight decay
            random_seed=RANDOM_SEED):
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size  # replay buffer size
        self.batch_size = batch_size  # minibatch size
        self.gamma = gamma  # discount factor
        self.tau = tau  # for soft update of target parameters
        self.lr_actor = lr_actor  # learning rate of the actor
        self.lr_critic = lr_critic  # learning rate of the critic
        self.weight_decay = weight_decay  # L2 weight decay
        self.seed = random.seed(random_seed)
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=self.weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        if not isinstance(memory, ReplayBuffer):
            memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                  random_seed, device)
        self.memory = memory

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
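
The OUNoise class instantiated throughout these examples is never shown. A minimal sketch of a standard Ornstein-Uhlenbeck process (the mu, theta, and sigma defaults are common choices, not values taken from any of the snippets; size may be an int or a tuple such as (n_agents, action_size)) could be:

import copy

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        np.random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state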
Example #31
class Agent():
    """Main DDPG agent that extracts experiences and learns from them"""
    def __init__(self, state_size=8, action_size=2, random_seed=0):
        """
        Initializes Agent object.
        @Param:
        1. state_size: dimension of each state.
        2. action_size: number of actions.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        #Actor network
        self.actor_local = Actor(self.state_size, self.action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        #Critic network
        self.critic_local = Critic(self.state_size, self.action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC)

        #Noise process
        self.noise = OUNoise(action_size,
                             random_seed)  #define Ornstein-Uhlenbeck process

        #Replay memory
        self.memory = ReplayBuffer(
            self.action_size, BUFFER_SIZE, MINI_BATCH,
            random_seed)  #define experience replay buffer object

        self.time_step = 0

    def reset(self):
        """Resets the noise process to mean"""
        self.noise.reset()

    def act(self, state, add_noise=True):
        """
        Returns a deterministic action given current state.
        @Param:
        1. state: current state, S.
        2. add_noise: (bool) add bias to agent, default = True (training mode)
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(
            device)  #typecast to torch.Tensor
        self.actor_local.eval()  #set in evaluation mode
        with torch.no_grad():  #disable gradient tracking
            action = self.actor_local(state).cpu().data.numpy(
            )  #deterministic action based on Actor's forward pass.
        self.actor_local.train()  #set training mode

        #In training mode (add_noise = True), add exploration noise to the action.
        if (add_noise):
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def learn(self, experiences, gamma=GAMMA):
        """
        Learn from a set of experiences picked up from a random sampling of even frequency (not prioritized)
        of experiences when buffer_size = MINI_BATCH.
        Updates policy and value parameters accordingly
        @Param:
        1. experiences: (Tuple[torch.Tensor]) set of experiences, trajectory, tau. tuple of (s, a, r, s', done)
        2. gamma: immediate reward hyper-parameter, 0.99 by default.
        """
        #Extrapolate experience into (state, action, reward, next_state, done) tuples
        states, actions, rewards, next_states, dones = experiences

        #Update Critic network
        actions_next = self.actor_target(
            next_states
        )  # Get predicted next-state actions and Q values from target models
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next *
                               (1 - dones))  #  r + γ * Q-values(a,s)

        # Compute critic loss using MSE
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                 1)  #clip gradients
        self.critic_optimizer.step()

        #Update Actor Network

        # Compute actor loss
        actions_pred = self.actor_local(states)  #gets mu(s)
        actor_loss = -self.critic_local(states,
                                        actions_pred).mean()  #negated mean of Q(s, mu(s))
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters. Copies model τ every experience.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, params, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.n_agents = 1
        self.state_size = params['state_size']
        self.action_size = params['action_size']
        self.batch_size = params['batch_size']
        self.gamma = params['gamma']
        self.tau = params['tau']
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(self.state_size, self.action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=params['lr_actor'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.state_size, self.action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=params['lr_critic'],
                                           weight_decay=params['weight_decay'])

        # Noise process
        self.noise = OUNoise(self.action_size, random_seed)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, random_seed, num_agents):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.num_agents = num_agents

        # Actor Networks 
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Networks
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    
    def step(self, timestep, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Add experience s a r s' d from all agents to replay buffer
        for i in range(self.num_agents):                                                
            self.memory.add(states[i,:], actions[i,:], rewards[i], next_states[i,:], dones[i])   
        
        if timestep % UPDATE_TIMESTEPS == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:
                for i in range(MEMORY_SAMPLE_TIMES):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))

        self.actor_local.eval()
        with torch.no_grad():
            for i in range(self.num_agents):
                actions[i, :] = self.actor_local(states[i]).cpu().data.numpy()       

        self.actor_local.train()
        if add_noise:
            for i in range(len(actions)): 
                actions[i, :] += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)  ## use gradient clipping when training 
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class SAC():
    def __init__(self):
        self.V=V(n_state).to(device)
        self.target_V=V(n_state).to(device)
        self.policy=Actor(n_state,max_action).to(device)
        self.Q=Q(n_state,n_action).to(device)
        
        self.optimV=th.optim.Adam(self.V.parameters(),lr=lr)
        self.optimQ=th.optim.Adam(self.Q.parameters(),lr=lr)
        self.optimP=th.optim.Adam(self.policy.parameters(),lr=lr)

        self.memory=replay_memory(memory_size)

    def choose_action(self,s):
        mu,log_std=self.policy(s)

        dist=Normal(mu,th.exp(log_std))
        action=dist.sample()
        action = th.tanh(action)
        
        return action


    def V_learn(self,batch):
        b_s=th.FloatTensor(batch[:,0].tolist()).to(device)
        b_a=th.FloatTensor(batch[:,2].tolist()).to(device)

        mu,log_std=self.policy(b_s)
        dist=Normal(mu,th.exp(log_std))

        z=dist.sample()
        b_a=th.tanh(z)
        prob=dist.log_prob(z)
        qs=self.Q(b_s,b_a)

        v=self.V(b_s)
        target_v=qs-prob

        loss=(v-target_v.detach())**2
        loss=loss.mean()

        self.optimV.zero_grad()
        loss.backward()
        self.optimV.step()



    def Q_learn(self,batch):
        b_s=th.FloatTensor(batch[:,0].tolist()).to(device)
        b_r=th.FloatTensor(batch[:,1].tolist()).to(device)
        b_a=th.FloatTensor(batch[:,2].tolist()).to(device)
        b_s_=th.FloatTensor(batch[:,3].tolist()).to(device)
        b_d=th.FloatTensor(batch[:,4].tolist()).to(device)

        target_q=b_r+(1-b_d)*gamma*self.target_V(b_s_)

        eval_q=self.Q(b_s,b_a)
        loss=(eval_q-target_q.detach())**2
        loss=loss.mean()
        self.optimQ.zero_grad()
        loss.backward()
        self.optimQ.step()
        

    def P_learn(self,batch):
        b_s=th.FloatTensor(batch[:,0].tolist()).to(device)

        norm=Normal(th.zeros((batchsize,1)),th.ones((batchsize,1)))
        #norm=Normal(0,1)
        mu,log_std=self.policy(b_s)

        z=norm.sample()
        b_a=th.tanh(mu+th.exp(log_std)*z.to(device))

        dist=Normal(mu,th.exp(log_std))
        log_prob=dist.log_prob(mu+th.exp(log_std)*z.to(device))- th.log(1 - b_a.pow(2) + 1e-7)
        qs=self.Q(b_s,b_a)

        loss=alpha*log_prob-qs
        loss=loss.mean()

        self.optimP.zero_grad()
        loss.backward()
        self.optimP.step()


    def soft_update(self):
        for param,target_param in zip(self.V.parameters(),self.target_V.parameters()):
            target_param.data.copy_(tau*param.data+(1-tau)*target_param.data)
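
The SAC snippet above assumes an Actor(n_state, max_action) that returns a mean and log standard deviation, plus V, Q, replay_memory, and module-level constants (n_state, n_action, lr, gamma, alpha, tau, batchsize, memory_size, device) defined elsewhere. A minimal sketch of such a Gaussian policy head (the hidden width, single action dimension, and log-std clamp range are assumptions) is:

import torch as th
import torch.nn as nn

class GaussianActor(nn.Module):
    """Policy network returning (mu, log_std), as expected by SAC.choose_action above."""

    def __init__(self, n_state, max_action, hidden=256):
        super().__init__()
        self.body = nn.Sequential(
            nn.Linear(n_state, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
        )
        self.mu_head = nn.Linear(hidden, 1)        # single continuous action
        self.log_std_head = nn.Linear(hidden, 1)
        self.max_action = max_action               # available for scaling the tanh output

    def forward(self, s):
        h = self.body(s)
        mu = self.mu_head(h)
        log_std = th.clamp(self.log_std_head(h), -20, 2)  # keep the std in a sane range
        return mu, log_std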
Example #35
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, num_agents, params, seed=0):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            params (dict): hyperparameters
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.params = params
        random.seed(seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.params['lr_actor'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.params['lr_critic'],
            weight_decay=self.params['weight_decay'])

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(self.params['buffer_size'],
                                   self.params['batch_size'], seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for i in range(self.num_agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            dones[i])

        # Learn every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > self.params['batch_size']:
                for i in range(LEARN_UPDATES):
                    experiences = self.memory.sample()
                    self.learn(experiences, self.params['gamma'])

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                actions[i, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Update critic
        self.update_critic(states, actions, rewards, next_states, dones, gamma)
        # Update actor
        self.update_actor(states)
        # Update target networks
        self.soft_update(self.critic_local, self.critic_target,
                         self.params['tau'])
        self.soft_update(self.actor_local, self.actor_target,
                         self.params['tau'])

    def update_actor(self, states):
        """Update actor parameters using given batch of experience tuples."""
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def update_critic(self, states, actions, rewards, next_states, dones,
                      gamma):
        """Update critic parameters using given batch of experience tuples."""
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
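
The ReplayBuffer used by these agents is also left out of the snippets, and its constructor signature varies from example to example. A sketch of the common deque-based implementation (the argument order and the device default here are assumptions) is:

import random
from collections import deque, namedtuple

import numpy as np
import torch

class ReplayBuffer:
    """Fixed-size buffer of experience tuples sampled uniformly at random."""

    def __init__(self, buffer_size, batch_size, seed, device="cpu"):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)
        to_t = lambda xs: torch.from_numpy(np.vstack(xs)).float().to(self.device)
        states = to_t([e.state for e in batch])
        actions = to_t([e.action for e in batch])
        rewards = to_t([e.reward for e in batch])
        next_states = to_t([e.next_state for e in batch])
        dones = to_t([np.uint8(e.done) for e in batch])
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)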
Example #36
class Agent:
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.
        Params
        =====
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON_MAX

        # Actor Network
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        self.noise = [
            OUNoise(action_size, random_seed) for i in range(self.num_agents)
        ]

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # self.hard_update(self.actor_target, self.actor_local)
        # self.hard_update(self.critic_target, self.critic_local)

    def step(self, state, action, reward, next_state, done, time_step):
        """Save experience in memory and use random samples from buffer to learn."""
        self.memory.add(state, action, reward, next_state, done,
                        self.num_agents)

        if len(self.memory) > BATCH_SIZE and time_step % UPDATE_EVERY == 0:
            # learn LEARN_NUM times every UPDATE_EVERY time steps
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Return actions for given state as per current policy"""
        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()

        self.actor_local.train()

        if add_noise:
            # Add epsilon-scaled exploration noise to each agent's action in place.
            for i in range(self.num_agents):
                action[i] += self.epsilon * self.noise[i].sample()

        return np.clip(action, -1, 1)

    def reset(self):
        for i in range(self.num_agents):
            self.noise[i].reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
            Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
            where:
                actor_target(state) -> action
                critic_target(state, action) -> Q_value
            Params
            ======
                experiences (Tuple[torch.Tensor]): tuple of (s,a,r,s',done) tuples
                gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------- update critic ---------------------------- #
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + gamma * Q_targets_next * (1 - dones)

        # compute the loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------- update actor ----------------------------- #
        # compute the loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ---------------------- update target network ---------------------- #
        self.soft_update(self.actor_local, self.actor_target, TAU)
        self.soft_update(self.critic_local, self.critic_target, TAU)

        # ---------------------- update noise ------------------------------- #
        if self.epsilon - EPSILON_DECAY > EPSILON_MIN:
            self.epsilon -= EPSILON_DECAY
        else:
            self.epsilon = EPSILON_MIN

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters
                θ_target = τ * θ_local + (1 - τ) * θ_target
                Params
                =====
                    local_model: Network weights to be copied from
                    target_model: Network weights to be copied to
                    tau(float): interpolation parameter
                """
        for local_param, target_param in zip(local_model.parameters(),
                                             target_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #37
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate) 
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)
    
    # load demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)
    
    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size: 
            state = env.reset()
            score = 0

            state = running_state(state)
            
            for _ in range(10000): 
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                train_discrim_flag = False
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:  # save a checkpoint every 100 iterations
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'z_filter_n':running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
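
Example #37 calls get_action and get_reward helpers that are defined elsewhere in that project. A hedged sketch of one common convention (the discriminator is assumed to output the probability that a state-action pair was generated by the learner, so more expert-like pairs receive a larger surrogate reward; the epsilon inside the log is an assumption for numerical safety) is:

import math

import torch

def get_action(mu, std):
    """Sample an action from the diagonal Gaussian policy head."""
    return torch.normal(mu, std).data.numpy()

def get_reward(discrim, state, action):
    """Surrogate IRL reward derived from the discriminator output."""
    state_action = torch.cat([torch.Tensor(state), torch.Tensor(action)])
    with torch.no_grad():
        return -math.log(discrim(state_action).item() + 1e-8)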
Example #38
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0        
        if self.pic:
            self.nb_status = args.pic_status
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'use_bn':args.bn,
            'init_method':args.init_method
        }
        if args.pic:
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor) # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()

    def normalize(self, pic):
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            state_batch = self.cnn(state_batch)
            next_state_batch = np.array([self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn_target(next_state_batch)
            next_q_values = self.critic_target([
                next_state_batch,
                self.actor_target(next_state_batch)
            ])
        else:
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic: self.cnn_optim.step()

        self.actor.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            policy_loss = -self.critic([
                state_batch,
                self.actor(state_batch)
            ])
        else:
            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actor(to_tensor(state_batch))
            ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad))

            if self.writer != None:
                mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()]))
                #print(mean_policy_grad)
                self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time)

        self.actor_optim.step()
        if self.pic: self.cnn_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        if self.pic:
            soft_update(self.cnn_target, self.cnn, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()
        if(self.pic):
            self.cnn.eval()
            self.cnn_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()
        if(self.pic):
            self.cnn.train()
            self.cnn_target.train()

    def cuda(self):
        if self.pic:  # the CNN encoder only exists when pixel observations are used
            self.cnn.cuda()
            self.cnn_target.cuda()
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self, fix=False):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        if self.discrete and fix == False:
            action = action.argmax()
#        if self.pic:
#            action = np.concatenate((softmax(action[:16]), softmax(action[16:])))
        return action
        
    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        self.eval()
        if self.pic:
            s_t = self.normalize(s_t)
            s_t = self.cnn(to_tensor(np.array([s_t])))
        if self.pic:
            action = to_numpy(
                self.actor_target(s_t)
            ).squeeze(0)
        else:
            action = to_numpy(
                self.actor(to_tensor(np.array([s_t])))
            ).squeeze(0)
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)

        if np.random.uniform(0, 1) < noise_level:
            action = self.random_action(fix=True)  # epsilon-greedy exploration

        if decay_epsilon:
            self.epsilon -= self.depsilon
        self.a_t = action
        
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):        
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        if self.use_cuda:
            if self.pic: self.cnn.cpu()
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor{}.pkl'.format(output, num)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            if self.pic: self.cnn.cuda()
            self.actor.cuda()
            self.critic.cuda()
def buscarActores(pkActores):
    actores = Actor.actores(pkActores)

    return actores
Example #40
def actor():
    """Retorna a todos los actores y sus datos"""
    return Actor.all()
Example #41
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.writer = writer
        self.select_time = 0
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'init_method':args.init_method
        }

        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor) # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()

    def update_policy(self, train_actor = True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = nn.MSELoss()(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()

        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad))

            if self.writer != None:
                mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()]))
                #print(mean_policy_grad)
                self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time)

        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        self.eval()
        # print(s_t.shape)
        action = to_numpy(
            self.actor(to_tensor(np.array([s_t])))
        ).squeeze(0)
            
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)
        
        action = action * (1 - noise_level) + (self.random_process.sample() * noise_level)
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):        
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor{}.pkl'.format(output, num)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
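
# A minimal sketch of the `hard_update` / `soft_update` helpers the DDPG class above
# relies on; they are not shown in this snippet, so this only illustrates the usual
# behaviour: a full parameter copy and a Polyak average, respectively.
def hard_update(target, source):
    """Copy every parameter of `source` into `target`."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    """Blend `target` towards `source`: theta_target = tau*theta_source + (1 - tau)*theta_target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)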
示例#42
0
import os
import pickle

import numpy as np
import torch

from constants import *
from environment import Game
from model import Actor, Critic
from ppo import PPO
from utils import plot_data
from running_state import *
from replay_memory import *

env = Game()
n_input = env.state_dim

actor = Actor(n_input, N_HIDDEN)
critic = Critic(n_input, N_HIDDEN)

# retrieve previous saved model if exists
if os.path.exists(ACTOR_SAVE_PATH):
    print("Loading saved actor model...")
    actor.load_state_dict(torch.load(ACTOR_SAVE_PATH))
if os.path.exists(CRITIC_SAVE_PATH):
    print("Loading saved critic model...")
    critic.load_state_dict(torch.load(CRITIC_SAVE_PATH))

ppo_agent = PPO(env, actor, critic)

running_state = ZFilter((2, ), clip=5)

statistics = {
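
# This snippet (and the next one) normalizes observations with `ZFilter` from
# `running_state`, which is not shown here. A minimal sketch of such a filter,
# assuming the interface ZFilter(shape, clip) with call-style usage
# `state = running_state(state)`: it maintains a running mean/variance
# (Welford's algorithm) and returns a clipped z-score.
import numpy as np


class RunningStat:
    def __init__(self, shape):
        self.n = 0
        self.mean = np.zeros(shape)
        self.sum_square = np.zeros(shape)

    def push(self, x):
        x = np.asarray(x)
        self.n += 1
        if self.n == 1:
            self.mean[...] = x
        else:
            old_mean = self.mean.copy()
            self.mean[...] = old_mean + (x - old_mean) / self.n
            self.sum_square[...] = self.sum_square + (x - old_mean) * (x - self.mean)

    @property
    def std(self):
        var = self.sum_square / (self.n - 1) if self.n > 1 else np.ones_like(self.mean)
        return np.sqrt(var)


class ZFilter:
    def __init__(self, shape, clip=10.0):
        self.rs = RunningStat(shape)
        self.clip = clip

    def __call__(self, x):
        self.rs.push(x)
        x = (x - self.rs.mean) / (self.rs.std + 1e-8)
        return np.clip(x, -self.clip, self.clip)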
示例#43
0
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate)

    writer = SummaryWriter(comment="-ppo_iter-" + str(args.max_iter_num))
    
    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0    

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size: 
            state = env.reset()
            score = 0

            state = running_state(state)
            
            for _ in range(10000): 
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train()
        train_model(actor, critic, memory, actor_optim, critic_optim, args)

        # Save a checkpoint every 100 iterations
        if iter % 100 == 0:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'z_filter_n':running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
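
# main() above also calls `get_action` and `save_checkpoint`, which are not shown.
# A minimal sketch, assuming the actor returns the mean and standard deviation of a
# Gaussian policy and that a checkpoint is just a dictionary handed to torch.save:
import torch


def get_action(mu, std):
    """Sample an action from N(mu, std) and return it as a NumPy array."""
    action = torch.normal(mu, std)
    return action.data.numpy()


def save_checkpoint(state, filename):
    """Persist a checkpoint dictionary to disk."""
    torch.save(state, filename)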
示例#44
0
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, env, hyper_params, random_seed=0):
        """Initialize an Agent object.

        Params
        ======
            env : the problem environment
            hyper_params : a dictionary of hyper parameters
            random_seed (int): random seed
        """
        self.state_size = env.state_size
        self.action_size = env.action_space_size
        self.hyper_params = hyper_params
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(self.state_size, self.action_size, random_seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=hyper_params['lr_actor'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.state_size, self.action_size, random_seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=hyper_params['lr_critic'],
                                           weight_decay=hyper_params['weight_decay'])

        # Noise process
        self.noise = OUNoise((env.num_agents, self.action_size), random_seed)
        self.epsilon = hyper_params['epsilon']

        # Replay memory
        self.memory = ReplayBuffer(self.action_size, hyper_params['memory_size'],
                                   hyper_params['batch_size'], random_seed)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.hyper_params['batch_size']:
            experiences = self.memory.sample()
            self.learn(experiences, self.hyper_params['gamma'])

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.FloatTensor(states)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states)

        self.actor_local.train()
        if add_noise:
            noise = torch.FloatTensor(self.noise.sample())
            actions += (self.epsilon * noise)
            self.epsilon = max(0.01, self.epsilon * self.hyper_params['epsilon_decay'])
        actions = torch.clamp(actions, -0.8, 0.8).detach()
        actions = actions.cpu().numpy()
        return actions

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        tau = self.hyper_params['tau']
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
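
# The agent above draws exploration noise from OUNoise((num_agents, action_size), seed),
# which is defined elsewhere. A minimal Ornstein-Uhlenbeck sketch, assuming the usual
# discretisation dx = theta * (mu - x) + sigma * dW with unit time step:
import copy
import random

import numpy as np


class OUNoise:
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the long-running mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state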
示例#45
0
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 n_agents=1,
                 buffer_size=int(1e7),
                 batch_size=256,
                 gamma=.99,
                 tau=1e-3,
                 lr_a=1e-4,
                 lr_c=1e-3,
                 weight_decay=0,
                 update_local=10,
                 n_updates=5,
                 random_seed=1):
        """Initialize an Agent object
        
        Params
        =====
            state_size (int): Dimension of states
            action_size (int): Dimension of actions
            n_agents (int): Number of agents
            buffer_size (int): size of replay buffer
            batch_size (int): size of sample
            gamma (float): discount factor
            tau (float): (soft) update of target parameters
            lr_a (float): learning rate of actor
            lr_c (float): learning rate of critic
            weight_decay (float): L2 weight decay
            update_local (int): update local network every x steps
            n_updates (int): number of updates
            seed (int): random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.n_agents = n_agents

        # Hyperparameters
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_a = lr_a
        self.lr_c = lr_c
        self.weight_decay = weight_decay
        self.update_local = update_local
        self.n_updates = n_updates

        # Actor networks
        self.actor_local = \
            Actor(state_size, action_size, seed=random_seed).to(device)
        self.actor_target = \
            Actor(state_size, action_size, seed=random_seed).to(device)
        self.actor_optimizer = \
            optim.Adam(self.actor_local.parameters(), lr=lr_a)

        # Critic networks
        self.critic_local = \
            Critic(state_size, action_size, seed=random_seed).to(device)
        self.critic_target = \
            Critic(state_size, action_size, seed=random_seed).to(device)
        self.critic_optimizer = \
            optim.Adam(self.critic_local.parameters(), lr=lr_c,
                       weight_decay=weight_decay)

        # Replay buffer
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   random_seed)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Time step
        self.t_step = 0

    def step(self, state, action, reward, next_state, done, learn=True):
        # Save experience in replay buffer
        #for state, action, reward, next_state, done in zip(state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

        self.t_step += 1
        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size and learn:
            if self.t_step % self.update_local == 0:
                for _ in range(self.n_updates):
                    sample = self.memory.sample()
                    self.__learn(sample, self.gamma)

    def act(self, state, add_noise=True):
        """Returns action given a state according to current policy

        Params
        ======
            state (array_like): current state
            add_noise (bool): handles exploration
        """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def __learn(self, sample, gamma):
        """
        Params
        ======
            sample (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = sample

        #----------------- Critic
        # Next actions and actions values
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local Critic network
        Q_expected = self.critic_local(states, actions)

        # Compute loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        #----------------- Actor
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        #----------------- update target networks
        self.__soft_update(self.critic_local, self.critic_target, self.tau)
        self.__soft_update(self.actor_local, self.actor_target, self.tau)

    def __soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param \
            in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.\
                copy_(tau*local_param.data + (1.0 - tau)*target_param.data)
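
# The agents in these examples store transitions in a ReplayBuffer(action_size,
# buffer_size, batch_size, seed) that is not shown. A minimal uniform-sampling sketch
# that returns torch tensors matching the (states, actions, rewards, next_states, dones)
# unpacking used above:
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class ReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Append a single transition to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw a uniform random minibatch and convert it to torch tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)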
示例#46
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        self.eps = EPS_START
        self.eps_decay = 1 / (EPS_EP_END * LEARN_NUM)  # set decay rate based on epsilon end target
        self.timestep = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done, agent_number):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.timestep += 1
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        # Learn, if enough samples are available in memory and at learning interval settings
        if len(self.memory) > BATCH_SIZE and self.timestep % LEARN_EVERY == 0:
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, agent_number)

    def act(self, states, add_noise):
        """Returns actions for both agents as per current policy, given their respective states."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            # get action for each agent and concatenate them
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        # add noise to actions
        if add_noise:
            actions += self.eps * self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        # Construct next actions vector relative to the agent
        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)
        # Compute Q targets for current states (y_i)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        # Construct action prediction vector relative to each agent
        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        # Compute actor loss
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # update noise decay parameter
        self.eps -= self.eps_decay
        self.eps = max(self.eps, EPS_FINAL)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
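
# The agent above references module-level constants defined elsewhere in its source
# file. The values below are only illustrative placeholders for a typical two-agent
# DDPG setup, not the values used by the original code:
import torch

BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # critic L2 weight decay
LEARN_EVERY = 1          # learn every N timesteps
LEARN_NUM = 5            # learning passes per learning step
EPS_START = 1.0          # initial noise scale
EPS_EP_END = 300         # episode by which the noise scale should have decayed
EPS_FINAL = 0.0          # final noise scale
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")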
示例#47
0
    def __init__(self,
                 state_size,
                 action_size,
                 n_agents=1,
                 buffer_size=int(1e7),
                 batch_size=256,
                 gamma=.99,
                 tau=1e-3,
                 lr_a=1e-4,
                 lr_c=1e-3,
                 weight_decay=0,
                 update_local=10,
                 n_updates=5,
                 random_seed=1):
        """Initialize an Agent object
        
        Params
        =====
            state_size (int): Dimension of states
            action_size (int): Dimension of actions
            n_agents (int): Number of agents
            buffer_size (int): size of replay buffer
            batch_size (int): size of sample
            gamma (float): discount factor
            tau (float): (soft) update of target parameters
            lr_a (float): learning rate of actor
            lr_c (float): learning rate of critic
            weight_decay (float): L2 weight decay
            update_local (int): update local network every x steps
            n_updates (int): number of updates
            seed (int): random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.n_agents = n_agents

        # Hyperparameters
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_a = lr_a
        self.lr_c = lr_c
        self.weight_decay = weight_decay
        self.update_local = update_local
        self.n_updates = n_updates

        # Actor networks
        self.actor_local = \
            Actor(state_size, action_size, seed=random_seed).to(device)
        self.actor_target = \
            Actor(state_size, action_size, seed=random_seed).to(device)
        self.actor_optimizer = \
            optim.Adam(self.actor_local.parameters(), lr=lr_a)

        # Critic networks
        self.critic_local = \
            Critic(state_size, action_size, seed=random_seed).to(device)
        self.critic_target = \
            Critic(state_size, action_size, seed=random_seed).to(device)
        self.critic_optimizer = \
            optim.Adam(self.critic_local.parameters(), lr=lr_c,
                       weight_decay=weight_decay)

        # Replay buffer
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   random_seed)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Time step
        self.t_step = 0
示例#48
0
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # interpolation factor for soft update of target parameters
        
        # Score tracker and learning parameters
        self.best_w = None
        self.best_score = -np.inf
        self.score = -np.inf

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
         # Save experience / reward
        self.total_reward += reward
        self.count += 1
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score
            
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)   

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
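
# A typical episode loop driving the Keras-based DDPG agent above, assuming a `task`
# object already exists with reset() and step(action) returning (next_state, reward, done),
# as the constructor expects; this is only an illustrative sketch of how the
# reset_episode/act/step interface fits together.
agent = DDPG(task)
for episode in range(1000):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
    print("episode {:4d}  score {:7.3f}  best {:7.3f}".format(
        episode, agent.score, agent.best_score))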