Example #1
def train_ea(n_episodes=1, debug=False, gen_index=0, render=False):
    """
    Train EA process
    """

    batch_steps = 0
    actor = Actor(nb_states, nb_actions)
    actors_params = ea.ask()
    fitness = []

    # evaluate all actors
    for actor_params in actors_params:
        actor.set_params(actor_params)
        f, steps = evaluate(actor,
                            n_episodes=n_episodes,
                            noise=False,
                            render=render,
                            training=False)
        batch_steps += steps
        fitness.append(f)

        # print scores
        if debug:
            prLightPurple('Generation#{}: EA actor fitness:{}'.format(
                gen_index, f))

    # update ea
    ea.tell(fitness)

    return batch_steps
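
The GA class used above is not part of these examples; the sketch below only illustrates the ask/evaluate/tell loop that train_ea relies on, with a toy population of flat parameter vectors and a dummy fitness function (all names and defaults here are assumptions, not the repository's GA).

import numpy as np

class ToyGA:
    """Toy stand-in for the ask/tell interface (illustrative only)."""

    def __init__(self, param_size, pop_size=6, mut_amp=0.1):
        self.pop = [np.random.randn(param_size) for _ in range(pop_size)]
        self.mut_amp = mut_amp

    def ask(self):
        # current population of candidate parameter vectors
        return self.pop

    def tell(self, fitness):
        # keep the best half, refill with mutated copies of the elites
        order = np.argsort(fitness)[::-1]
        elites = [self.pop[i] for i in order[:len(self.pop) // 2]]
        mutants = [e + self.mut_amp * np.random.randn(e.size) for e in elites]
        self.pop = elites + mutants

# same loop structure as train_ea: evaluate every candidate, then update
toy_ea = ToyGA(param_size=4)
toy_fitness = [-float(np.sum(p ** 2)) for p in toy_ea.ask()]  # dummy fitness
toy_ea.tell(toy_fitness)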
Example #2
def test(n_test, filename, debug=False, render=False):
    """
    Test an agent
    """

    # load weights
    actor = Actor(nb_states, nb_actions)
    actor.load_model(filename)

    # evaluate
    f, _ = evaluate(actor, n_episodes=n_test, noise=False, render=render)

    # print scores
    if debug:
        prLightPurple('Average fitness:{}'.format(f))
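
The Actor class itself is not among these examples; its load_model/save_model methods used here and in Example #5 presumably wrap torch.save / torch.load of the network weights. The class below is a hypothetical stand-in showing only that assumed pattern, not the actual implementation.

import os
import torch
import torch.nn as nn

class TinyActor(nn.Module):
    """Hypothetical stand-in illustrating the save/load pattern assumed above."""

    def __init__(self, nb_states, nb_actions):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(nb_states, 64), nn.Tanh(),
                                 nn.Linear(64, nb_actions), nn.Tanh())

    def forward(self, state):
        return self.net(state)

    def save_model(self, output):
        # store the weights under the given output directory
        torch.save(self.state_dict(), os.path.join(output, 'actor.pkl'))

    def load_model(self, filename):
        # restore weights previously written by save_model
        self.load_state_dict(torch.load(filename))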
Example #3
    def __init__(self, nb_states, nb_actions, memory, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        self.actor = Actor(self.nb_states, self.nb_actions)
        self.actor_target = Actor(self.nb_states, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.actor_lr)

        self.critic = Critic(self.nb_states, self.nb_actions)
        self.critic_target = Critic(self.nb_states, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.critic_lr)

        # scale the actor parameters
        self.actor.scale_params(0.1)

        # Make sure the target networks start with the same weights
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = memory
        self.random_process = OrnsteinUhlenbeckProcess(nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # hyper-parameters
        self.reward_scale = 1.
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount

        if USE_CUDA:
            self.cuda()
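
hard_update and soft_update are not shown in these examples; the two helpers below follow the common PyTorch DDPG pattern and are an assumed implementation, not the repository's.

def hard_update(target, source):
    # copy source parameters into target (target and source are torch.nn.Module);
    # used once at construction so both networks start identical
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)

def soft_update(target, source, tau):
    # Polyak averaging after every training step: target <- (1 - tau) * target + tau * source
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(t_param.data * (1.0 - tau) + s_param.data * tau)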
Example #4
        np.random.seed(args.seed)
        env.seed(args.seed)

    # replay buffer
    memory = Memory(args.mem_size)

    # DDPG agent
    agent = DDPG(nb_states, nb_actions, memory, args)

    # EA process
    ea = GA(agent.get_actor_size(),
            pop_size=args.pop_size,
            mut_amp=args.mut_amp,
            mut_rate=args.mut_rate,
            elite_frac=args.elite_frac,
            generator=lambda: Actor(nb_states, nb_actions).get_params())

    # Tried ES-type algorithms as well, but without much success
    # ea = OpenES(agent.get_actor_size(), pop_size=args.pop_size, mut_amp=args.mut_amp,
    #           generator=lambda: Actor(nb_states, nb_actions).get_params())

    if args.mode == 'train':
        train(n_gen=args.n_gen,
              n_episodes=args.n_episodes,
              omega=args.omega,
              output=args.output,
              debug=args.debug,
              render=args.render)

    elif args.mode == 'test':
        test(args.n_test, args.filename, render=args.render, debug=args.debug)
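
The args namespace is built outside this snippet; a minimal argparse sketch covering the fields referenced in these examples could look as follows (the flag names match the attributes used above, but every default value is an illustrative assumption).

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--mode', default='train', choices=['train', 'test'])
parser.add_argument('--seed', type=int, default=-1)
parser.add_argument('--mem_size', type=int, default=100000)
parser.add_argument('--pop_size', type=int, default=10)
parser.add_argument('--mut_amp', type=float, default=0.1)
parser.add_argument('--mut_rate', type=float, default=0.9)
parser.add_argument('--elite_frac', type=float, default=0.1)
parser.add_argument('--actor_lr', type=float, default=1e-3)
parser.add_argument('--critic_lr', type=float, default=1e-3)
parser.add_argument('--ou_theta', type=float, default=0.15)
parser.add_argument('--ou_mu', type=float, default=0.0)
parser.add_argument('--ou_sigma', type=float, default=0.2)
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--tau', type=float, default=0.005)
parser.add_argument('--discount', type=float, default=0.99)
parser.add_argument('--n_gen', type=int, default=100)
parser.add_argument('--n_episodes', type=int, default=1)
parser.add_argument('--n_test', type=int, default=10)
parser.add_argument('--omega', type=float, default=0.5)
parser.add_argument('--output', default='results')
parser.add_argument('--filename', default='')
parser.add_argument('--debug', action='store_true')
parser.add_argument('--render', action='store_true')
args = parser.parse_args()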
Example #5
class DDPG(object):
    def __init__(self, nb_states, nb_actions, memory, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        self.actor = Actor(self.nb_states, self.nb_actions)
        self.actor_target = Actor(self.nb_states, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.actor_lr)

        self.critic = Critic(self.nb_states, self.nb_actions)
        self.critic_target = Critic(self.nb_states, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.critic_lr)

        # scale the actor parameters
        self.actor.scale_params(0.1)

        # Make sure the target networks start with the same weights
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = memory
        self.random_process = OrnsteinUhlenbeckProcess(nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # hyper-parameters
        self.reward_scale = 1.
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount

        if USE_CUDA:
            self.cuda()

    def train(self):

        # Sample batch
        batch = self.memory.sample(self.batch_size)
        state_batch = to_tensor(batch.states).view(-1, self.nb_states)
        action_batch = to_tensor(batch.actions).view(-1, self.nb_actions)
        reward_batch = to_tensor(batch.rewards).view(-1, 1)
        next_state_batch = to_tensor(batch.next_states).view(
            -1, self.nb_states)
        done_batch = to_tensor(batch.dones).view(-1, 1)

        # Prepare for the target q batch
        next_q_values = self.critic_target(
            [next_state_batch,
             self.actor_target(next_state_batch)]).detach()

        target_q_batch = self.reward_scale * reward_batch + \
            self.discount * (1. - done_batch) * next_q_values

        # Critic update
        self.critic_optim.zero_grad()

        q_batch = self.critic([state_batch, action_batch])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()

        self.critic_optim.step()

        # Actor update
        self.actor_optim.zero_grad()

        policy_loss = -1. * self.critic([state_batch, self.actor(state_batch)])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def get_actor_size(self):
        return np.shape(self.get_actor_params())[0]

    def get_actor(self):
        return deepcopy(self.actor)

    def set_actor(self, actor):
        self.actor = actor

    def get_critic(self):
        return deepcopy(self.critic)

    def get_actor_params(self):
        return self.actor.get_params()

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        return action

    def select_action(self, s_t, noise=True):
        """
        Returns action after seeing state 
        """
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        if noise:
            action += self.is_training * \
                max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        return action

    def reset(self):
        self.random_process.reset_states()

    def load_model(self, filename):
        self.actor.load_model(filename)
        self.critic.load_model(filename)

    def save_model(self, output):
        self.actor.save_model(output)
        self.critic.save_model(output)

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
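
OrnsteinUhlenbeckProcess is another helper that does not appear in these examples. The sketch below is a minimal discrete-time OU noise process with the same constructor arguments and the reset_states/sample methods the agent calls; it is an assumed implementation, not the repository's.

import numpy as np

class OrnsteinUhlenbeckProcess:
    """Temporally correlated exploration noise (minimal sketch, step size dt = 1)."""

    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2):
        self.size = size
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.reset_states()

    def reset_states(self):
        # restart the process at its long-run mean (typically at episode start)
        self.x = np.ones(self.size) * self.mu

    def sample(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, I): drifts back toward mu
        dx = self.theta * (self.mu - self.x) + self.sigma * np.random.randn(self.size)
        self.x = self.x + dx
        return self.x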