Example #1
class action_noise_DDPG(DDPG):
    def __init__(self):
        super(action_noise_DDPG, self).__init__()

    def setup(self, nb_pos, nb_laser, nb_actions):
        super(action_noise_DDPG, self).setup(nb_pos, nb_laser, nb_actions)
        self.nb_pos = nb_pos
        self.nb_laser = nb_laser
        exploration_args = Singleton_arger()['exploration']
        self.noise_decay = exploration_args['noise_decay']
        self.noise_coef = 1
        self.rollout_actor = copy.deepcopy(self.actor)
        self.action_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(nb_actions),
            sigma=float(exploration_args['stddev']) * np.ones(nb_actions))
        if self.with_cuda and self.rollout_actor is not None:
            self.rollout_actor.cuda()

    def reset_noise(self):
        self.action_noise.reset()

    def before_epoch(self):
        self.apply_noise_decay()

    def apply_noise_decay(self):
        if self.noise_decay > 0:
            self.noise_coef = self.noise_decay * self.noise_coef / (
                self.noise_coef + self.noise_decay)

    def select_action(self, s_t, apply_noise):
        # Build the state tensor; move it to the GPU only when CUDA is enabled.
        s_t = torch.tensor(np.vstack(s_t),
                           dtype=torch.float32,
                           requires_grad=False)
        if self.with_cuda:
            s_t = s_t.cuda()
        s_t = s_t.split([self.nb_pos, self.nb_laser], dim=1)
        with torch.no_grad():
            action = self.actor(s_t).cpu().numpy()
        if apply_noise:
            action += max(self.noise_coef, 0) * self.action_noise()
        action = np.clip(action, -1., 1.)
        return action
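
All four examples construct an OrnsteinUhlenbeckActionNoise object, but none of them shows its definition. Below is a minimal sketch of a typical implementation, modelled on the OpenAI Baselines version; the theta, dt, and x0 defaults are assumptions, and the noise() alias is added only because Example #4 calls .noise() instead of __call__.

import numpy as np


class OrnsteinUhlenbeckActionNoise:
    """Temporally correlated exploration noise following an OU process:
    x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)."""

    def __init__(self, mu, sigma, theta=0.15, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) *
             np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    # Example #4 calls .noise(); alias it so both usages work.
    noise = __call__

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)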
Example #2
    def train(self):

        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True

        # Create global step and increment operation
        global_step_tensor = tf.Variable(0,
                                         trainable=False,
                                         name='global_step')
        increment_global_step = tf.assign_add(global_step_tensor, 1)

        # Create model saver
        saver = tf.train.Saver()

        sess = tf.Session(config=config)

        if not self.parameters['restore']:
            sess.run(tf.global_variables_initializer())
        else:
            saver.restore(sess, tf.train.latest_checkpoint('./saves'))

        self.actor_critic.set_moving_to_target(sess)
        run_id = np.random.randint(10000)

        trainwriter = tf.summary.FileWriter(logdir='./logs/' + str(run_id),
                                            graph=sess.graph)

        # Get action noise
        action_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.nA),
            sigma=float(self.parameters['sigma']) * np.ones(self.nA))

        # Fill Replay Memory
        state = self.env.reset()
        fill_amount = 0
        while fill_amount < self.parameters['replay_init_size']:

            action = self.env.action_space.sample()
            next_state, reward, done, _ = self.env.step(action)

            if done:
                state = self.env.reset()
            else:
                fill_amount += 1
                self.memory.add(state, action, reward, done, next_state)
                state = next_state

        # Main Loop
        steps = 0

        for i in range(self.parameters['num_epochs']):

            # Incremental average of episode rewards within this epoch
            avg_epoch_rewards = 0
            ep_count = 1
            for e in range(self.parameters['num_episodes']):

                state = self.env.reset()

                ep_reward = 0

                # Perform rollout
                while True:
                    noise = action_noise()
                    action = self.actor_critic.pi(sess, state[None, ...])
                    action += noise
                    action = np.clip(action, self.env.action_space.low[0],
                                     self.env.action_space.high[0])

                    assert action.shape == self.env.action_space.shape
                    """
					# UNCOMMENT TO PRINT ACTIONS
					a0 = tf.Summary(value=[tf.Summary.Value(tag="action_0", simple_value=action[0,0])])
					trainwriter.add_summary(a0,steps)
					a1 = tf.Summary(value=[tf.Summary.Value(tag="action_1", simple_value=action[0,1])])
					trainwriter.add_summary(a1,steps)
					a2 = tf.Summary(value=[tf.Summary.Value(tag="action_2", simple_value=action[0,2])])
					trainwriter.add_summary(a2,steps)
					steps += 1
					"""

                    next_state, reward, done, _ = self.env.step(action)

                    self.memory.add(state, action, reward, done, next_state)

                    if self.parameters['render_train']:
                        self.env.render()

                    ep_reward += reward

                    if done:

                        reward_summary = tf.Summary(value=[
                            tf.Summary.Value(tag="ep_rewards",
                                             simple_value=ep_reward)
                        ])
                        trainwriter.add_summary(
                            reward_summary,
                            i * self.parameters['num_episodes'] + e)
                        action_noise.reset()
                        break

                    state = next_state

                avg_epoch_rewards = avg_epoch_rewards + (
                    ep_reward - avg_epoch_rewards) / ep_count
                ep_count += 1

                # Perform train
                for t in range(self.parameters['num_train_steps']):
                    s_state, s_action, s_reward, s_next_state, s_terminal = \
                        self.memory.sample()
                    # Train actor critic model
                    self.actor_critic.update(sess=sess,
                                             filewriter=trainwriter,
                                             state_batch=s_state,
                                             next_state_batch=s_next_state,
                                             action_batch=s_action,
                                             reward_batch=s_reward,
                                             done_batch=s_terminal)
                    sess.run(increment_global_step)

            # Print out epoch stats here

            table_data = [['Epoch', 'Average Reward'],
                          [
                              str(i) + "/" +
                              str(self.parameters['num_epochs']),
                              str(avg_epoch_rewards)
                          ]]

            table = AsciiTable(table_data, "Training Run: " + str(run_id))

            save_path = saver.save(sess, "./saves/model.ckpt")

            os.system('clear')
            print("Model saved in path: %s" % save_path + "\n" + table.table)
Example #3
    def train(self):

        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True

        # Create global step and increment operation
        global_step_tensor = tf.Variable(0,
                                         trainable=False,
                                         name='global_step')
        increment_global_step = tf.assign_add(global_step_tensor, 1)

        # Create model saver
        saver = tf.train.Saver(max_to_keep=None)

        sess = tf.Session(config=config)

        if not self.parameters['restore']:
            sess.run(tf.global_variables_initializer())
        else:
            saver.restore(sess, tf.train.latest_checkpoint('./saves'))

        self.actor_critic.set_moving_to_target(sess)
        run_id = np.random.randint(10000)

        trainwriter = tf.summary.FileWriter(logdir='./logs/' + str(run_id),
                                            graph=sess.graph)

        # Get action noise
        action_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.nA),
            sigma=float(self.parameters['sigma']) * np.ones(self.nA))

        # Fill Replay Memory
        state = self.env.reset()
        fill_amount = 0
        while fill_amount < self.parameters['replay_init_size']:

            action = self.env.action_space.sample()
            next_state, reward, done, _ = self.env.step(action)

            if done:
                state = self.env.reset()
            else:
                fill_amount += 1
                self.memory.add(state, action, reward, done, next_state)
                state = next_state

        # Main Loop
        plots = {'critic_loss': [], 'actor_loss': [], 'episode_reward': []}

        plots_dir = './plots/'
        weights_dir = './weights/'
        graph_dir = './graph/'
        # Saver.save and export_meta_graph do not create directories, so make
        # the per-environment subdirectories used below as well.
        for d in (plots_dir,
                  weights_dir + self.parameters['env'],
                  graph_dir + self.parameters['env']):
            if not os.path.exists(d):
                os.makedirs(d)

        saver.export_meta_graph(graph_dir + self.parameters['env'] +
                                '/graph.meta')

        #cumulative step counter
        cumu_step = 0

        for i in range(self.parameters['num_epochs']):

            # Incremental average of episode rewards within this epoch
            avg_epoch_rewards = 0
            ep_count = 1

            for e in range(self.parameters['num_episodes']):

                state = self.env.reset()

                ep_reward = 0
                ep_n_action = 0

                # Perform rollout
                for _ in range(500):
                    noise = action_noise()
                    action = self.actor_critic.pi(sess, state[None, ...])
                    action += noise
                    action = np.clip(action, self.env.action_space.low[0],
                                     self.env.action_space.high[0])

                    assert action.shape == self.env.action_space.shape

                    next_state, reward, done, _ = self.env.step(action)
                    # print(action)
                    # print(next_state)
                    # print(reward)

                    self.memory.add(state, action, reward, done, next_state)

                    if self.parameters['render_train']: self.env.render()

                    ep_reward += reward
                    ep_n_action += 1
                    cumu_step += 1
                    state = next_state

                    # Perform train
                    avg_critic_loss = 0.0
                    avg_actor_loss = 0.0
                    for t in range(self.parameters['num_train_steps']):
                        s_state, s_action, s_reward, s_next_state, s_terminal = \
                            self.memory.sample()
                        # Train actor critic model
                        _, _, critic_loss, actor_loss = self.actor_critic.update(
                            sess=sess,
                            filewriter=trainwriter,
                            state_batch=s_state,
                            next_state_batch=s_next_state,
                            action_batch=s_action,
                            reward_batch=s_reward,
                            done_batch=s_terminal)
                        avg_critic_loss += critic_loss
                        avg_actor_loss += actor_loss

                        sess.run(increment_global_step)

                    avg_critic_loss /= self.parameters['num_train_steps']
                    avg_actor_loss /= self.parameters['num_train_steps']

                    if done:
                        reward_summary = tf.Summary(value=[
                            tf.Summary.Value(tag="ep_rewards",
                                             simple_value=ep_reward)
                        ])
                        trainwriter.add_summary(
                            reward_summary,
                            i * self.parameters['num_episodes'] + e)
                        action_noise.reset()
                        break

                avg_epoch_rewards = avg_epoch_rewards + (
                    ep_reward - avg_epoch_rewards) / ep_count
                ep_count += 1


                print('Epoch: {:d} | Reward: {:d} | Avg_Q_loss: {:.4f} | '
                      'Avg_a_loss: {:.4f} | Episode: {:d} | Step: {:d} | '
                      'Cumu Step: {:d}'.format(i + 1, int(ep_reward),
                                               avg_critic_loss, avg_actor_loss,
                                               e + 1, ep_n_action, cumu_step))

                if e % 19 == 0:
                    save_path = saver.save(
                        sess,
                        weights_dir + self.parameters['env'] + '/model.ckpt',
                        global_step=i * self.parameters['num_episodes'] + e + 1)

                plots['episode_reward'].append(ep_reward)
                plots['critic_loss'].append(critic_loss)
                plots['actor_loss'].append(actor_loss)

                pickle.dump(
                    plots,
                    open(plots_dir + self.parameters['env'] + '_plot.pickle',
                         'wb'))
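
Example #3 additionally pickles a plots dictionary after every episode. A minimal sketch of reading it back for offline plotting is shown below; the 'Pendulum-v0' part of the filename is illustrative and depends on parameters['env'].

import pickle

import matplotlib.pyplot as plt

with open('./plots/Pendulum-v0_plot.pickle', 'rb') as f:
    plots = pickle.load(f)

fig, axes = plt.subplots(3, 1, sharex=True)
for ax, key in zip(axes, ('episode_reward', 'critic_loss', 'actor_loss')):
    ax.plot(plots[key])       # one value per episode
    ax.set_ylabel(key)
axes[-1].set_xlabel('episode')
plt.tight_layout()
plt.show()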
Example #4
class DDPG(object):

    def __init__(self, gamma, tau, num_inputs, env, device, results_path=None):

        self.gamma = gamma
        self.tau = tau
        self.min_action, self.max_action = env.action_range()
        self.device = device
        self.num_actions = env.action_space()
        self.noise_stddev = 0.3

        self.results_path = results_path
        self.checkpoint_path = os.path.join(self.results_path, 'checkpoint/')
        os.makedirs(self.checkpoint_path, exist_ok=True)

        # Define the actor
        self.actor = Actor(num_inputs, self.num_actions).to(device)
        self.actor_target = Actor(num_inputs, self.num_actions).to(device)

        # Define the critic
        self.critic = Critic(num_inputs, self.num_actions).to(device)
        self.critic_target = Critic(num_inputs, self.num_actions).to(device)

        # Define the optimizers for both networks
        self.actor_optimizer = Adam(self.actor.parameters(), lr=1e-4)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=1e-4, weight_decay=0.002)

        self.hard_swap()

        self.ou_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.num_actions),
                                            sigma=float(self.noise_stddev) * np.ones(self.num_actions))
        self.ou_noise.reset()

    def eval_mode(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic_target.eval()
        self.critic.eval()

    def train_mode(self):
        self.actor.train()
        self.actor_target.train()
        self.critic_target.train()
        self.critic.train()


    def get_action(self, state, episode, action_noise=True):
        x = state.to(self.device)

        # Get the continous action value to perform in the env
        self.actor.eval()  # Sets the actor in evaluation mode
        mu = self.actor(x)
        self.actor.train()  # Sets the actor in training mode
        mu = mu.data

        # During training we add exploration noise that is decayed with the
        # episode number and clamped to [0, 0.1]. (Spinning Up notes that
        # uncorrelated Gaussian noise works as well as OU noise:
        # https://spinningup.openai.com/en/latest/algorithms/ddpg.html)
        if action_noise:
            noise = torch.Tensor(self.ou_noise.noise()).to(self.device) * 1.0 / (1.0 + 0.1 * episode)
            noise = noise.clamp(0, 0.1)
            mu = mu + noise

        # Clip the output according to the action space of the env
        mu = mu.clamp(self.min_action,self.max_action)

        return mu

    def update_params(self, batch):
        # Get tensors from the batch
        state_batch = torch.cat(batch.state).to(self.device)
        action_batch = torch.cat(batch.action).to(self.device)
        reward_batch = torch.cat(batch.reward).to(self.device)
        done_batch = torch.cat(batch.done).to(self.device)
        next_state_batch = torch.cat(batch.next_state).to(self.device)

        # Get the actions and the state values to compute the targets
        next_action_batch = self.actor_target(next_state_batch)
        next_state_action_values = self.critic_target(next_state_batch, next_action_batch.detach())

        # Compute the target
        reward_batch = reward_batch.unsqueeze(1)
        done_batch = done_batch.unsqueeze(1)
        expected_values = reward_batch + (1.0 - done_batch) * self.gamma * next_state_action_values

        # Update the critic network
        self.critic_optimizer.zero_grad()
        state_action_batch = self.critic(state_batch, action_batch)
        value_loss = F.mse_loss(state_action_batch, expected_values.detach())
        value_loss.backward()
        self.critic_optimizer.step()

        # Update the actor network
        self.actor_optimizer.zero_grad()
        policy_loss = -self.critic(state_batch, self.actor(state_batch))
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        for param in self.actor.parameters():
            param.grad.data.clamp_(-1, 1)
        self.actor_optimizer.step()

        # Update the target networks
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return value_loss.item(), policy_loss.item()
    
    def hard_swap(self):
        # Make sure both targets are with the same weight
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

    def store_model(self):
        print("Storing model at: ", self.checkpoint_path)
        checkpoint = {
            'actor': self.actor.state_dict(),
            'actor_optim': self.actor_optimizer.state_dict(),
            'critic': self.critic.state_dict(),
            'critic_optim': self.critic_optimizer.state_dict()
        }
        torch.save(checkpoint, os.path.join(self.checkpoint_path, 'checkpoint.pth') )

    def load_model(self):
        files = os.listdir(self.checkpoint_path)
        if files:
            print("Loading models checkpoints!")
            model_dicts = torch.load(os.path.join(self.checkpoint_path, 'checkpoint.pth'),map_location=self.device)
            self.actor.load_state_dict(model_dicts['actor'])
            self.actor_optimizer.load_state_dict(model_dicts['actor_optim'])
            self.critic.load_state_dict(model_dicts['critic'])
            self.critic_optimizer.load_state_dict(model_dicts['critic_optim'])
        else:
            print("Checkpoints not found!")