Example #1
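# NOTE: the imports below are not part of the original excerpt; they are the
# external packages this function appears to depend on (TensorFlow 1.x API).
# Project-local helpers (dynamics_gp, cbf, ReplayBuffer, build_summaries,
# evaluate, evaluate_with_cbf) are assumed to be importable from the same
# package as this file.
import numpy as np
import tensorflow as tf
import tflearn
from scipy.io import savemat
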
def train(sess, env, args, actor, critic, actor_noise, reward_result, agent,
          log_name, log_cbf_name):
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    # Needed to enable BatchNorm.
    # This hurts the performance on Pendulum but could be useful
    # in other environments.
    # tflearn.is_training(True)

    paths = list()

    # Extract the arguments that will be used repeatedly
    episode_length = int(args['max_episode_len'])
    max_episodes = int(args['max_episodes'])
    num_evals = int(args['num_evals'])

    # Evaluate initial performance
    for j in range(num_evals):
        # Without the CBF
        steps, reward, done, _ = evaluate(env, actor, episode_length)
        with open(log_name, "a") as myfile:
            myfile.write(
                str(0) + ', ' + str(steps) + ', ' + str(reward) + ', ' +
                str(done) + '\n')

        # With the CBF
        steps, reward, done, _ = evaluate_with_cbf(env, actor, agent,
                                                   episode_length)
        with open(log_cbf_name, "a") as myfile:
            myfile.write(
                str(0) + ', ' + str(steps) + ', ' + str(reward) + ', ' +
                str(done) + '\n')

    for i in range(max_episodes):

        # Reuse the GP model from the previous iteration while training the
        # current one
        if agent.firstIter != 1:
            agent.GP_model_prev = list(agent.GP_model)
            dynamics_gp.build_GP_model(agent)

        for el in range(5):

            obs, action, rewards, action_bar, action_BAR = [], [], [], [], []

            s = env.reset()
            # Ensure that starting position is in "safe" region
            while not (-0.09 <= env.unwrapped.state[0] <= 0.09
                       and -0.01 <= env.unwrapped.state[1] <= 0.01):
                s = env.reset()

            ep_reward = 0
            ep_ave_max_q = 0

            for j in range(episode_length):

                # env.render()

                # Added exploration noise
                # a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
                a = actor.predict(np.reshape(
                    s, (1, actor.s_dim))) + actor_noise()

                # Incorporate barrier function
                action_rl = a[0]

                # Utilize compensation barrier function
                if agent.firstIter == 1:
                    u_BAR_ = [0]
                else:
                    u_BAR_ = agent.bar_comp.get_action(s)[0]

                action_RL = action_rl + u_BAR_

                # Utilize safety barrier function
                if agent.firstIter == 1:
                    f, g, x, std = dynamics_gp.get_GP_dynamics(
                        agent, s, action_RL)
                else:
                    f, g, x, std = dynamics_gp.get_GP_dynamics_prev(
                        agent, s, action_RL)
                u_bar_ = cbf.control_barrier(agent, np.squeeze(s), action_RL,
                                             f, g, x, std)
                action_ = action_RL + u_bar_

                s2, r, terminal, info = env.step(action_)

                replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                                  np.reshape(a, (actor.a_dim, )), r, terminal,
                                  np.reshape(s2, (actor.s_dim, )))

                # replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(action_, (actor.a_dim,)), r,
                #                  terminal, np.reshape(s2, (actor.s_dim,)))

                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                if replay_buffer.size() > int(args['minibatch_size']):
                    s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                        int(args['minibatch_size']))

                    # Calculate targets
                    target_q = critic.predict_target(
                        s2_batch, actor.predict_target(s2_batch))

                    y_i = []
                    for k in range(int(args['minibatch_size'])):
                        if t_batch[k]:
                            y_i.append(r_batch[k])
                        else:
                            y_i.append(r_batch[k] + critic.gamma * target_q[k])

                    # Update the critic given the targets
                    predicted_q_value, _ = critic.train(
                        s_batch, a_batch,
                        np.reshape(y_i, (int(args['minibatch_size']), 1)))

                    ep_ave_max_q += np.amax(predicted_q_value)

                    # Update the actor policy using the sampled gradient
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    actor.train(s_batch, grads[0])

                    # Update target networks
                    actor.update_target_network()
                    critic.update_target_network()

                obs.append(s)
                rewards.append(r)
                action_bar.append(u_bar_)
                action_BAR.append(u_BAR_)
                action.append(action_)

                s = s2
                ep_reward += r

                if terminal:
                    # writer.add_summary(summary_str, i)
                    # writer.flush()

                    print(
                        '| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                            int(ep_reward), i, (ep_ave_max_q / float(j))))
                    reward_result[i] = ep_reward
                    path = {
                        "Observation": np.concatenate(obs).reshape((200, 3)),
                        "Action": np.concatenate(action),
                        "Action_bar": np.concatenate(action_bar),
                        "Action_BAR": np.concatenate(action_BAR),
                        "Reward": np.asarray(rewards)
                    }
                    paths.append(path)

                    break
            if el <= 3:
                dynamics_gp.update_GP_dynamics(agent, path)

        if (i <= 4):
            agent.bar_comp.get_training_rollouts(paths)
            barr_loss = agent.bar_comp.train()
        else:
            barr_loss = 0.
        agent.firstIter = 0

        # Evaluate performance of trained model after the episode
        for k in range(num_evals):
            # Without the CBF
            steps, reward, done, _ = evaluate(env, actor, episode_length)
            with open(log_name, "a") as myfile:
                myfile.write(
                    str(i * 5 + 5) + ', ' + str(steps) + ', ' + str(reward) +
                    ', ' + str(done) + '\n')

            # With the CBF
            steps, reward, done, _ = evaluate_with_cbf(env, actor, agent,
                                                       episode_length)
            with open(log_cbf_name, "a") as myfile:
                myfile.write(
                    str(i * 5 + 5) + ', ' + str(steps) + ', ' + str(reward) +
                    ', ' + str(done) + '\n')

    # Save the final model as a matlab file
    relu1_vars = tflearn.variables.get_layer_variables_by_name('relu1')
    relu2_vars = tflearn.variables.get_layer_variables_by_name('relu2')
    out_vars = tflearn.variables.get_layer_variables_by_name('out_layer')

    weights = [
        actor.model.get_weights(relu1_vars[0]),
        actor.model.get_weights(relu2_vars[0]),
        actor.model.get_weights(out_vars[0])
    ]
    biases = [
        actor.model.get_weights(relu1_vars[1]),
        actor.model.get_weights(relu2_vars[1]),
        actor.model.get_weights(out_vars[1])
    ]

    savemat(args['log_path'] + '/final_model.mat',
            mdict={
                'W': weights,
                'b': biases
            })

    return [summary_ops, summary_vars, paths]
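
For reference, every hyperparameter this function reads from args can be collected into a plain dict. The keys below are exactly the ones accessed above; the numeric values are illustrative placeholders, not the settings used in the original experiments.

# Illustrative args dict covering every key accessed by train() above.
# Values are placeholders, not the original experiment configuration.
args = {
    'buffer_size': 1000000,   # replay memory capacity
    'random_seed': 1234,      # seed for replay-buffer sampling
    'max_episode_len': 200,   # steps per episode (matches the (200, 3) reshape)
    'max_episodes': 100,      # outer training iterations
    'num_evals': 5,           # evaluation rollouts before/after each iteration
    'minibatch_size': 64,     # batch size for the actor/critic updates
    'log_path': './logs',     # directory for the final_model.mat dump
}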
Example #2
    def rollout(self):
        #Initialize variables
        paths = list()
        timesteps = 0
        self.num_epi = 0

        # Reuse the GP model from the previous iteration while training the
        # current one
        if self.firstIter != 1:
            self.GP_model_prev = self.GP_model.copy()
            dynamics_gp.build_GP_model(self)

        #Iterate through the specified number of episodes
        while timesteps < self.args.timesteps_per_batch:
            self.num_epi += 1

            # Reset the environment and the per-episode storage
            action, rewards, done = [], [], []
            action_dist_mu, action_dist_logstd = [], []
            action_bar, action_BAR = [], []
            action_RL_mu_, action_RL_ = [], []
            prev_obs = self.env.reset()
            obs = np.expand_dims(np.squeeze(prev_obs), 0)

            #Simulate dynamics for specified time
            for i in range(self.args.max_path_length):
                #self.env.render()
                prev_obs_expanded = np.expand_dims(np.squeeze(prev_obs), 0)
                #prev_obs_expanded = prev_obs
                # The agent returns a sampled action and the parameters of its
                # action distribution for this observation; each has shape
                # [1, action size]
                action_rl, action_dist_mu_rl, action_dist_logstd_ = self.act(
                    prev_obs)

                #Utilize compensation barrier function
                u_BAR_ = self.bar_comp.get_action(prev_obs)
                action_RL = action_rl + u_BAR_
                action_dist_mu_RL = action_dist_mu_rl + u_BAR_

                t = 0.05 * i
                # Get GP dynamics
                if self.firstIter == 1:
                    f, g, x, std = dynamics_gp.get_GP_dynamics(
                        self, prev_obs_expanded, action_RL, t)
                else:
                    f, g, x, std = dynamics_gp.get_GP_dynamics_prev(
                        self, prev_obs_expanded, action_RL, t)

                #Utilize safety barrier function
                u_bar_ = cbf.control_barrier(self,
                                             np.squeeze(prev_obs_expanded),
                                             action_dist_mu_RL, f, g, x, std)
                #action_ = action_RL + u_bar_
                action_dist_mu_ = action_dist_mu_RL + u_bar_

                #Stochastic action
                action_ = np.random.normal(loc=action_dist_mu_,
                                           scale=np.exp(action_dist_logstd_))

                #Store observation and action/distribution
                obs = np.append(obs, prev_obs_expanded, axis=0)
                action_RL_mu_.append(action_dist_mu_rl)
                action_RL_.append(action_rl)
                action_bar.append(u_bar_)
                action_BAR.append(u_BAR_)
                action.append(action_)
                action_dist_mu.append(action_dist_mu_)
                action_dist_logstd.append(action_dist_logstd_)

                # Simulate dynamics after action
                next_obs, reward_, done_ = self.env.step(action_)
                reward_ = np.squeeze(reward_)
                #next_obs, reward_, done_, _ = self.env.step(action_)

                #Get results
                done.append(done_)
                rewards.append(reward_)
                prev_obs = next_obs

                if i == self.args.max_path_length - 1:
                    obs = obs[1:self.args.max_path_length + 1, :]
                    path = {
                        "Observation": obs,
                        "Action": np.concatenate(action),
                        "Action_RL_mu": np.concatenate(action_RL_mu_),
                        "Action_RL": np.concatenate(action_RL_),
                        "Action_mu": np.concatenate(action_dist_mu),
                        "Action_bar": np.concatenate(action_bar),
                        "Action_BAR": np.concatenate(action_BAR),
                        "Action_logstd": np.concatenate(action_dist_logstd),
                        "Done": np.asarray(done),
                        "Reward": np.asarray(rewards)
                    }
                    paths.append(path)
                    break

            #For timing purposes, only update GP dynamics for certain number of timesteps
            if (timesteps < 500):
                dynamics_gp.update_GP_dynamics(self, path)
            timesteps += len(rewards)
        #print('%d episodes, %d steps collected for batch' % (self.num_epi, timesteps))
        self.firstIter = 0
        return paths
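
The rollout above returns a list of per-episode path dicts. A minimal sketch of how such a batch might be aggregated before a policy update is shown below; the key names match the dict built in rollout(), but summarize_batch itself is an illustration, not part of the original code.

import numpy as np

def summarize_batch(paths):
    # Stack the per-episode arrays into flat batch arrays and report the
    # mean undiscounted return, using the same keys written by rollout().
    obs = np.concatenate([p["Observation"] for p in paths], axis=0)
    actions = np.concatenate([p["Action"] for p in paths], axis=0)
    mean_return = float(np.mean([p["Reward"].sum() for p in paths]))
    return obs, actions, mean_return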
Example #3
def train(sess, env, args, actor, critic, actor_noise, reward_result, agent):

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    # Needed to enable BatchNorm.
    # This hurts the performance on Pendulum but could be useful
    # in other environments.
    tflearn.is_training(True)

    paths = list()

    for i in range(int(args['max_episodes'])):

        # Reuse the GP model from the previous iteration while training the
        # current one
        if agent.firstIter != 1:
            agent.GP_model_prev = agent.GP_model.copy()
            dynamics_gp.build_GP_model(agent)

        for el in range(5):

            obs, action, rewards, action_bar, action_BAR = [], [], [], [], []

            s1 = env.reset()
            s = np.copy(s1)
            ep_reward = 0
            ep_ave_max_q = 0

            for j in range(int(args['max_episode_len'])):

                #env.render()

                # Added exploration noise
                #a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
                a = actor.predict(np.reshape(
                    s, (1, actor.s_dim))) + actor_noise()

                #Incorporate barrier function
                action_rl = a[0]

                # Utilize compensation barrier function (disabled in this
                # example: the compensator always outputs zero)
                u_BAR_ = [0]
                # u_BAR_ = agent.bar_comp.get_action(s)[0]
                action_RL = action_rl + u_BAR_

                t = 0.05 * j
                # Utilize safety barrier function
                if agent.firstIter == 1:
                    f, g, x, std = dynamics_gp.get_GP_dynamics(
                        agent, s, action_RL, t)
                else:
                    f, g, x, std = dynamics_gp.get_GP_dynamics_prev(
                        agent, s, action_RL, t)
                u_bar_ = cbf.control_barrier(agent, np.squeeze(s), action_RL,
                                             f, g, x, std)
                action_ = action_RL + u_bar_

                s2, r, terminal = env.step(action_)

                #replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r,
                #                  terminal, np.reshape(s2, (actor.s_dim,)))

                replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                                  np.reshape(action_, (actor.a_dim, )), r,
                                  terminal, np.reshape(s2, (actor.s_dim, )))

                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                if replay_buffer.size() > int(args['minibatch_size']):
                    s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                        int(args['minibatch_size']))

                    # Calculate targets
                    target_q = critic.predict_target(
                        s2_batch, actor.predict_target(s2_batch))

                    y_i = []
                    for k in range(int(args['minibatch_size'])):
                        if t_batch[k]:
                            y_i.append(r_batch[k])
                        else:
                            y_i.append(r_batch[k] + critic.gamma * target_q[k])

                    # Update the critic given the targets
                    predicted_q_value, _ = critic.train(
                        s_batch, a_batch,
                        np.reshape(y_i, (int(args['minibatch_size']), 1)))

                    ep_ave_max_q += np.amax(predicted_q_value)

                    # Update the actor policy using the sampled gradient
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    actor.train(s_batch, grads[0])

                    # Update target networks
                    actor.update_target_network()
                    critic.update_target_network()

                obs.append(s)
                rewards.append(r)
                action_bar.append(u_bar_)
                action_BAR.append(u_BAR_)
                action.append(action_)

                s = np.copy(s2)
                ep_reward += r

                if j == 80 - 1:

                    #writer.add_summary(summary_str, i)
                    #writer.flush()

                    print(
                        '| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                            int(ep_reward), i, (ep_ave_max_q / float(j))))
                    reward_result[i] = ep_reward
                    path = {
                        "Observation": np.concatenate(obs).reshape((80, 15)),
                        "Action": np.concatenate(action),
                        "Action_bar": np.concatenate(action_bar),
                        "Action_BAR": np.concatenate(action_BAR),
                        "Reward": np.asarray(rewards)
                    }
                    paths.append(path)
                    break
            if el <= 3:
                dynamics_gp.update_GP_dynamics(agent, path)

        agent.bar_comp.get_training_rollouts(paths)
        barr_loss = agent.bar_comp.train()
        agent.firstIter = 0

    return [summary_ops, summary_vars, paths]
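
The caller is expected to preallocate reward_result with one slot per episode, since the loop writes reward_result[i] directly. A minimal driver sketch is shown below; sess, env, args, actor, critic, actor_noise, and agent are assumed to have been constructed by the project's own setup code, and the output filename is illustrative.

import numpy as np
from scipy.io import savemat

reward_result = np.zeros(int(args['max_episodes']))
summary_ops, summary_vars, paths = train(sess, env, args, actor, critic,
                                         actor_noise, reward_result, agent)

# Persist the per-episode returns and the final rollout's observations
# for offline analysis.
savemat('train_results.mat',
        mdict={'reward': reward_result,
               'final_obs': paths[-1]['Observation']})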