Example #1
def main(args):
    device = "cuda" if args.cuda else "cpu"
    mp.set_start_method('spawn')
    # Input Experiment Hyperparameters
    hp = SACHP(EXP_NAME=args.name,
               DEVICE=device,
               ENV_NAME=args.env,
               N_ROLLOUT_PROCESSES=3,
               LEARNING_RATE=0.0001,
               EXP_GRAD_RATIO=10,
               BATCH_SIZE=256,
               GAMMA=0.95,
               REWARD_STEPS=3,
               ALPHA=0.015,
               LOG_SIG_MAX=2,
               LOG_SIG_MIN=-20,
               EPSILON=1e-6,
               REPLAY_SIZE=100000,
               REPLAY_INITIAL=512,
               SAVE_FREQUENCY=100000,
               GIF_FREQUENCY=100000,
               TOTAL_GRAD_STEPS=1000000)
    wandb.init(project='RoboCIn-RL',
               name=hp.EXP_NAME,
               entity='robocin',
               config=hp.to_dict())
    current_time = datetime.datetime.now().strftime('%b-%d_%H-%M-%S')
    tb_path = os.path.join(
        'runs', current_time + '_' + hp.ENV_NAME + '_' + hp.EXP_NAME)
    # Training
    sac = SAC(hp)
    buffer = ReplayBuffer(buffer_size=hp.REPLAY_SIZE,
                          observation_space=hp.observation_space,
                          action_space=hp.action_space,
                          device=hp.DEVICE)

    # Playing
    sac.share_memory()
    exp_queue = mp.Queue(maxsize=hp.EXP_GRAD_RATIO)
    finish_event = mp.Event()
    gif_req_m = mp.Value('i', -1)
    data_proc = mp.Process(target=rollout,
                           args=(sac, device, exp_queue, finish_event,
                                 gif_req_m, hp))
    data_proc.start()

    n_grads = 0
    n_samples = 0
    n_episodes = 0
    best_reward = None
    last_gif = None
    try:
        while n_grads < hp.TOTAL_GRAD_STEPS:
            metrics = {}
            ep_infos = list()
            st_time = time.perf_counter()
            # Collect EXP_GRAD_RATIO samples for each grad step
            new_samples = 0
            while new_samples < hp.EXP_GRAD_RATIO:
                exp = exp_queue.get()
                if exp is None:
                    raise Exception("got None value in exp_queue")
                safe_exp = copy.deepcopy(exp)
                del exp

                # Dict is returned with end of episode info
                if isinstance(safe_exp, dict):
                    logs = {
                        "ep_info/" + key: value
                        for key, value in safe_exp.items()
                        if 'truncated' not in key
                    }
                    ep_infos.append(logs)
                    n_episodes += 1
                else:
                    if safe_exp.last_state is not None:
                        last_state = safe_exp.last_state
                    else:
                        last_state = safe_exp.state
                    buffer.add(obs=safe_exp.state,
                               next_obs=last_state,
                               action=safe_exp.action,
                               reward=safe_exp.reward,
                               done=safe_exp.last_state is None)
                    new_samples += 1
            n_samples += new_samples
            sample_time = time.perf_counter()

            # Only start training after buffer is larger than initial value
            if buffer.size() < hp.REPLAY_INITIAL:
                continue

            # Sample a batch and load it as a tensor on device
            batch = buffer.sample(hp.BATCH_SIZE)
            metrics["train/loss_pi"], metrics["train/loss_Q1"], \
                metrics["train/loss_Q2"], metrics["train/loss_alpha"], \
                metrics["train/alpha"] = sac.update(batch=batch,
                                                    metrics=metrics)

            n_grads += 1
            grad_time = time.perf_counter()
            metrics['speed/samples'] = new_samples / (sample_time - st_time)
            metrics['speed/grad'] = 1 / (grad_time - sample_time)
            metrics['speed/total'] = 1 / (grad_time - st_time)
            metrics['counters/samples'] = n_samples
            metrics['counters/grads'] = n_grads
            metrics['counters/episodes'] = n_episodes
            metrics["counters/buffer_len"] = buffer.size()

            if ep_infos:
                for key in ep_infos[0].keys():
                    metrics[key] = np.mean([info[key] for info in ep_infos])

            # Log metrics
            wandb.log(metrics)
            if hp.SAVE_FREQUENCY and n_grads % hp.SAVE_FREQUENCY == 0:
                save_checkpoint(hp=hp,
                                metrics={
                                    'alpha': sac.alpha,
                                    'n_samples': n_samples,
                                    'n_grads': n_grads,
                                    'n_episodes': n_episodes
                                },
                                pi=sac.pi,
                                Q=sac.Q,
                                pi_opt=sac.pi_opt,
                                Q_opt=sac.Q_opt)

            if hp.GIF_FREQUENCY and n_grads % hp.GIF_FREQUENCY == 0:
                gif_req_m.value = n_grads

    except KeyboardInterrupt:
        print("...Finishing...")
        finish_event.set()

    finally:
        if exp_queue:
            while exp_queue.qsize() > 0:
                exp_queue.get()

        print('queue is empty')

        print("Waiting for threads to finish...")
        data_proc.terminate()
        data_proc.join()

        del exp_queue
        del sac

        finish_event.set()
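
The training loop above consumes two kinds of items from exp_queue: end-of-episode info dicts, and experience objects exposing state, action, reward, and last_state (None on terminal transitions). The class that produces them lives in the rollout code, which is not shown here; purely as an assumption about its shape, a minimal compatible record could look like this:

import collections

# Hypothetical experience record with exactly the fields the loop reads.
# last_state is None when the episode ended on this transition, which the
# loop above turns into done=True before adding it to the replay buffer.
Experience = collections.namedtuple(
    'Experience', ['state', 'action', 'reward', 'last_state'])
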
Example #2
                    if safe_exp.last_state is not None:
                        last_state = safe_exp.last_state
                    else:
                        last_state = safe_exp.state
                    buffer.add(obs=safe_exp.state,
                               next_obs=last_state,
                               action=safe_exp.action,
                               reward=safe_exp.reward,
                               done=safe_exp.last_state is None)
                    new_samples += 1
            n_samples += new_samples
            sample_time = time.perf_counter()

            # Only start training after buffer is larger than initial value
            if buffer.size() < hp.REPLAY_INITIAL:
                continue

            # Sample a batch and load it as a tensor on device
            batch = buffer.sample(hp.BATCH_SIZE)
            pi_loss, Q_loss1, Q_loss2, log_pi = loss_sac(
                alpha, hp.GAMMA**hp.REWARD_STEPS, batch, Q, pi, tgt_Q, device)

            # Train the entropy temperature parameter (alpha)
            alpha_loss = -(log_alpha * (log_pi + target_entropy).detach())
            alpha_loss = alpha_loss.mean()

            alpha_optim.zero_grad()
            alpha_loss.backward()
            alpha_optim.step()
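
This snippet assumes log_alpha is a learnable tensor with its own optimizer and that target_entropy was fixed beforehand; neither setup is shown. Under the common SAC convention of setting the entropy target to minus the action dimensionality, a minimal sketch of that setup (every name below is an assumption, not necessarily this project's) might be:

import torch

# Hypothetical setup for automatic entropy tuning (standard SAC convention):
# the entropy target is usually set to minus the action dimensionality.
action_dim = 2  # example value; normally read from the environment's action space
target_entropy = -float(action_dim)
log_alpha = torch.zeros(1, requires_grad=True, device=device)
alpha = log_alpha.exp().item()  # scalar temperature passed to the loss
alpha_optim = torch.optim.Adam([log_alpha], lr=1e-4)

After each alpha_optim.step(), alpha would typically be refreshed from log_alpha.exp() so that the next loss_sac call sees the updated temperature.
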
Example #3
File: agent.py  Project: shintay/rl
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 self.action_low,
                                 self.action_high)

        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  self.action_low,
                                  self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.0  #0
        self.exploration_theta = 0.125 # 0.14 | 0.1
        self.exploration_sigma = 0.0009 # 0.001 | 0.2 | 0.001
        self.noise = OUNoise(self.action_size,
                             self.exploration_mu,
                             self.exploration_theta,
                             self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size,
                                   self.batch_size)

        # Algorithm parameters
        self.gamma = 0.998  # 0.99 | 0.9 | discount factor
        self.tau = 0.099  # 0.001| 0.01 | 0.1 | 0.05 |  for soft update of target parameters

        # Score tracker
        self.best_score = -np.inf
        self.score = 0

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state,
                        action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if self.memory.size() > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        # Score tracker
        self.score += reward
        if done:
            if self.score > self.best_score:
                self.best_score = self.score
            # self.best_score = max(self.score, self.best_score)

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters
           using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([
            xp.state for xp in experiences if xp is not None])
        actions = np.array([
            xp.action for xp in experiences if xp is not None]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([
            xp.reward for xp in experiences if xp is not None]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([
            xp.done for xp in experiences if xp is not None]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([
            xp.next_state for xp in experiences if xp is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)


    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
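
For context, the agent above could be driven by a loop like the following; the task interface (reset() and step(action) returning (next_state, reward, done), plus the size and bound attributes read in __init__) is assumed here, not shown in agent.py:

# Hypothetical training loop. `task` is assumed to expose state_size,
# action_size, action_low, action_high, reset(), and
# step(action) -> (next_state, reward, done).
num_episodes = 500
agent = DDPG(task)
for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
    print("Episode {:4d}  score: {:8.3f}  best: {:8.3f}".format(
        i_episode, agent.score, agent.best_score))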