Пример #1
0
    def __init__(self):
        """Load the recorded car trajectories and set up the Gym spaces.

        Reads ``car_dat`` from the ``car_data_formatted_arc`` MAT file,
        shuffles the episode order, and initialises the bookkeeping
        counters, the controlled car's state, the control prior, and the
        action/observation spaces.
        """
        recorded = np.squeeze(loadmat('car_data_formatted_arc')['car_dat'])

        # One random permutation is shared by both views so that episode i
        # of `data` and episode i of `data_orig` are the same recording.
        order = np.arange(len(recorded))
        random.shuffle(order)
        # Integer-array indexing allocates a fresh array on every use, so
        # the two attributes are distinct outer arrays.
        self.data = recorded[order]
        self.data_orig = recorded[order]

        # Episode bookkeeping and simulation constants.
        self.count = 0
        self.episode = -1
        self.L = 100
        self.numCars = 5
        self.dt = 0.1
        self.collision_flag = 0

        # Entries 9:12 of a frame hold the controlled car's own state.
        first_frame = self.data[0][0]
        self.state = np.copy(np.squeeze(first_frame))
        self.bot_state = np.copy(np.squeeze(first_frame[9:12]))

        self.prior = BasePrior()

        # A single scalar action bounded to [-7, 3].
        self.action_space = spaces.Box(low=-7.0, high=3.0, shape=(1,))

        # Six observation components, each bounded only by the float32 range.
        obs_limit = np.array([np.finfo(np.float32).max] * 6)
        self.observation_space = spaces.Box(-obs_limit, obs_limit)
Пример #2
0
    def __init__(self, args, sess):
        """Wire together the control prior, the Gym environment and the TRPO agent.

        Args:
            args: Settings object; ``args.env_name`` selects the Gym
                environment. It is mutated: ``args.max_path_length`` is set
                from the environment spec's ``timestep_limit``.
            sess: Session object handed through to the TRPO agent.
        """
        self.args = args
        self.sess = sess

        # The prior controller is built from the two values returned by
        # get_linear_dynamics().
        state_mat, input_mat = get_linear_dynamics()
        self.prior = BasePrior(state_mat, input_mat)

        env = gym.make(args.env_name)
        self.env = env
        args.max_path_length = env.spec.timestep_limit

        self.agent = TRPO(args, env, sess, self.prior)
Пример #3
0
    def __init__(self):
        """Load the recorded car trajectories and initialise the environment.

        Reads ``car_dat`` from the ``car_data_formatted_arc`` MAT file,
        shuffles the episode order, and initialises the bookkeeping
        counters, the controlled car's state and the control prior.
        """
        recorded = np.squeeze(loadmat('car_data_formatted_arc')['car_dat'])

        # One random permutation is shared by both views so that episode i
        # of `data` and episode i of `data_orig` are the same recording.
        order = np.arange(len(recorded))
        random.shuffle(order)
        # Integer-array indexing allocates a fresh array on every use, so
        # the two attributes are distinct outer arrays.
        self.data = recorded[order]
        self.data_orig = recorded[order]

        # Episode bookkeeping and simulation constants.
        self.count = 0
        self.episode = -1
        self.L = 100
        self.numCars = 5
        self.dt = 0.1
        self.collision_flag = 0

        # Entries 9:12 of a frame hold the controlled car's own state.
        first_frame = self.data[0][0]
        self.state = np.copy(np.squeeze(first_frame))
        self.bot_state = np.copy(np.squeeze(first_frame[9:12]))

        self.prior = BasePrior()
Пример #4
0
    def train(self, replay_buffer, minibatch_size):
        """Run one actor/critic update from a sampled minibatch.

        Samples ``minibatch_size`` transitions, builds the bootstrapped
        targets from the target networks, trains the critic on them, trains
        the actor with the critic's action gradients, and finally updates
        both target networks.

        The previous version accumulated into ``ep_ave_max_q``, a name that
        was never bound inside this method, so every call raised
        ``UnboundLocalError`` right after the critic update and the actor
        and target networks were never touched. The batch maximum is now
        returned so the caller can keep its own running total.

        Args:
            replay_buffer: Object whose ``sample_batch(n)`` returns the
                5-tuple ``(s, a, r, terminal, s2)``.
            minibatch_size: Number of transitions to sample.

        Returns:
            The largest Q-value the critic predicted for the batch.
        """
        # Sample a batch from the replay buffer
        s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
            minibatch_size)

        # Calculate targets: terminal transitions use the raw reward, all
        # others bootstrap from the target critic.
        target_q = self.critic.predict_target(
            s2_batch, self.actor.predict_target(s2_batch))
        y_i = [
            r_batch[k] if t_batch[k]
            else r_batch[k] + self.critic.gamma * target_q[k]
            for k in range(minibatch_size)
        ]

        # Update the critic given the targets
        predicted_q_value, _ = self.critic.train(
            s_batch, a_batch, np.reshape(y_i, (minibatch_size, 1)))

        # Update the actor policy using the sampled gradient
        a_outs = self.actor.predict(s_batch)
        grads = self.critic.action_gradients(s_batch, a_outs)
        self.actor.train(s_batch, grads[0])

        # Update target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

        return np.amax(predicted_q_value)
Пример #5
0
class allCars:
    """Car-following environment replayed from recorded trajectories.

    Each episode is a sequence of recorded frames. Entries 9:12 of a frame
    are replaced by the simulated state of the controlled car, which is
    stored in ``bot_state`` as (position, velocity, acceleration).
    """

    def __init__(self):
        """Load the recording, shuffle the episodes and initialise state."""
        recorded = np.squeeze(loadmat('car_data_formatted_arc')['car_dat'])

        # One random permutation is shared by both views so that episode i
        # of `data` and episode i of `data_orig` are the same recording.
        order = np.arange(len(recorded))
        random.shuffle(order)
        # Integer-array indexing allocates a fresh array on every use, so
        # the two attributes are distinct outer arrays.
        self.data = recorded[order]
        self.data_orig = recorded[order]

        # Episode bookkeeping and simulation constants.
        self.count = 0
        self.episode = -1
        self.L = 100
        self.numCars = 5
        self.dt = 0.1
        self.collision_flag = 0

        self._load_frame(self.data[0][0])

        self.prior = BasePrior()

    def _load_frame(self, frame):
        """Copy a recorded frame into ``state`` and its 9:12 slice into ``bot_state``."""
        self.state = np.copy(np.squeeze(frame))
        self.bot_state = np.copy(np.squeeze(frame[9:12]))

    def getState(self):
        """Return the six-element observation for the controlled car.

        Every third entry starting at index 3 is replaced by the difference
        to the entry three places earlier (the headway to the preceding
        car), then entries 6, 7, 9, 10, 12 and 13 are returned.
        """
        arc = self.getState_arc()
        snapshot = np.copy(arc)
        arc[0] = 0
        for idx in (3, 6, 9, 12):
            arc[idx] = snapshot[idx - 3] - snapshot[idx]
        return arc[[6, 7, 9, 10, 12, 13]]

    def getState_arc(self):
        """Return a copy of the current recorded frame with the simulated car spliced in."""
        frame = self.data[self.episode][self.count]
        arc = np.copy(np.squeeze(frame))
        # Slice assignment copies the values, so bot_state is not aliased.
        arc[9:12] = self.bot_state
        return arc

    def getReward(self, action):
        """Return the reward for applying ``action`` in the current state.

        Positive actions cost ``|s[10]| * action``. Each of the two gaps
        around the controlled car (``s[6] - s[9]`` and ``s[9] - s[12]``)
        costs a flat 50 when below 2 and a further ``|100 / gap|`` when
        below 10. The first sub-2 gap of an episode is also reported on
        stdout and latches ``collision_flag``.

        NOTE(review): a gap of exactly zero is divided by without a guard.
        """
        arc = self.getState_arc()
        front_gap = arc[6] - arc[9]
        rear_gap = arc[9] - arc[12]

        r = -np.abs(arc[10]) * action if action > 0 else 0

        # Flat collision penalties first, front gap before rear gap.
        for gap in (front_gap, rear_gap):
            if gap < 2:
                r = r - 50.
                if self.collision_flag == 0:
                    print("Collision Penalty")
                    print(self.episode)
                self.collision_flag = 1

        # Then the proximity penalties, in the same order.
        for gap in (front_gap, rear_gap):
            if gap < 10:
                r = r - np.abs(100 / gap)
        return r

    def reset_inc(self):
        """Advance to the next episode and reset to its first frame.

        Used for the first pass over an episode (control prior only).
        """
        self.collision_flag = 0
        self.count = 0
        self.episode += 1
        self._load_frame(self.data[self.episode][self.count])
        return self.getState()

    def reset(self):
        """Rewind the current episode to its first frame using ``data_orig``.

        Used for the second pass over an episode (with learning); the
        episode index is left unchanged.
        """
        self.collision_flag = 0
        self.count = 0
        self._load_frame(self.data_orig[self.episode][self.count])
        return self.getState()

    def stepNext(self, a):
        """Integrate the controlled car one time step under acceleration ``a``."""
        self.dt = 0.1
        position, velocity = self.bot_state[0], self.bot_state[1]
        self.bot_state[0] = position + velocity * self.dt
        self.bot_state[1] = velocity + a * self.dt
        self.bot_state[2] = a
        self.state = self.data[self.episode][self.count]

    def step(self, action):
        """Advance one frame and apply ``action`` to the controlled car.

        When observation entry 2 or 4 is at most 6, the action is shifted
        down or up respectively by a draw from ``np.random.normal(2.0)``.
        The result is then limited to [-7, 3].

        Returns:
            Tuple ``(observation, reward, done, applied_action)``; ``done``
            is True when the frame counter reaches ``L - 1``, at which
            point the counter is reset to zero.
        """
        self.count += 1

        before = self.getState()
        if before[2] <= 6.:
            action = action - np.random.normal(2.0)
        if before[4] <= 6.:
            action = action + np.random.normal(2.0)

        # Saturate the action; the two bounds are mutually exclusive.
        if action < -7.:
            action = -7.
        elif action > 3.:
            action = 3.

        self.stepNext(action)
        s = self.getState()
        r = self.getReward(action)

        done = bool(self.count == self.L - 1)
        if done:
            self.count = 0
        return s, r, done, action

    def getPrior(self):
        """Return the control prior's action for the current full state."""
        return self.prior.computePrior(self.getState_arc())
Пример #6
0
def train(sess, env, args, actor, critic, actor_noise, reward_result):
    """Train the actor/critic pair while mixing RL actions with a control prior.

    Every episode is run twice on the same recording: first with the
    environment's control prior alone (``env.reset_inc``) to obtain a
    baseline return, then with the mixed RL/prior action (``env.reset``)
    while the networks are updated from a replay buffer.

    Args:
        sess: Session used to initialise the global variables.
        env: Environment exposing ``reset_inc``, ``reset``, ``step``,
            ``getPrior`` and ``collision_flag``.
        args: Mapping with ``buffer_size``, ``random_seed``,
            ``max_episodes``, ``max_episode_len`` and ``minibatch_size``.
        actor: Actor network wrapper.
        critic: Critic network wrapper.
        actor_noise: Callable returning exploration noise for the action.
        reward_result: Array indexed as ``[row, episode]``. Row 0 gets the
            learning return, row 1 the baseline return, row 2 the mean
            regularization weight and row 3 the maximum collision flag.

    Returns:
        ``[summary_ops, summary_vars, paths]`` where ``paths`` holds one
        dict of recorded arrays per episode that reached a terminal step.
    """
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Loop-invariant settings, converted once instead of on every step.
    max_episode_len = int(args['max_episode_len'])
    minibatch_size = int(args['minibatch_size'])

    # Control prior regularization weight (fixed for the whole run).
    lambda_mix = 15.

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    # Needed to enable BatchNorm.
    tflearn.is_training(True)

    paths = list()

    lambda_store = np.zeros((max_episode_len, 1))

    for i in range(int(args['max_episodes'])):

        s = env.reset_inc()

        ep_reward = 0.
        ep_ave_max_q = 0

        obs, action, act_prior, rewards = [], [], [], []
        obs_ref, prior_ref, collisions = [], [], []

        # Get reward using baseline controller
        ep_reward_opt = 0.
        for kk in range(max_episode_len):
            a = env.getPrior()
            prior_ref.append(np.array([a]))
            s0, r, stop_c, _ = env.step(a)
            ep_reward_opt += r
            obs_ref.append(s0)
            if stop_c:
                break

        # Get reward using regRL algorithm
        s = env.reset()

        for j in range(max_episode_len):

            lambda_store[j] = lambda_mix

            # Get control prior
            a_prior = env.getPrior()

            # Rl control with exploration noise
            ab = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()

            # Mix the actions (RL controller + control prior)
            act = ab[0] / (1 + lambda_mix) + (lambda_mix /
                                              (1 + lambda_mix)) * a_prior

            # Take action and observe next state/reward; the environment
            # returns the action it actually applied.
            s2, r, terminal, act = env.step(act)
            collisions.append(env.collision_flag)
            act = np.array(act, ndmin=1)

            # Add info from time step to the replay buffer. The raw RL
            # action `ab` is stored, not the mixed action.
            replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                              np.reshape(ab, (actor.a_dim, )), r, terminal,
                              np.reshape(s2, (actor.s_dim, )),
                              np.reshape(a_prior, (actor.a_dim, )))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > minibatch_size:

                # Sample a batch from the replay buffer (the stored prior
                # actions are not used by the update).
                s_batch, a_batch, r_batch, t_batch, s2_batch, _ = \
                    replay_buffer.sample_batch(minibatch_size)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))
                y_i = []
                for k in range(minibatch_size):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (minibatch_size, 1)))
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            obs.append(s)
            rewards.append(r)
            action.append(act)
            act_prior.append(np.array([a_prior]))

            # Collect results at end of episode
            if terminal:
                # j is zero-based, so j + 1 steps have been taken. Dividing
                # by j was off by one and failed when j == 0.
                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward-ep_reward_opt), \
                        i, (ep_ave_max_q / float(j + 1))))
                reward_result[0, i] = ep_reward
                reward_result[1, i] = ep_reward_opt
                reward_result[2, i] = np.mean(lambda_store)
                reward_result[3, i] = max(collisions)
                path = {
                    "Observation": np.concatenate(obs).reshape((-1, 6)),
                    "Observation_ref": np.concatenate(obs_ref).reshape(
                        (-1, 6)),
                    "Action": np.concatenate(action),
                    "Action_Prior": np.concatenate(act_prior),
                    "Action_Prior_Ref": np.concatenate(prior_ref),
                    "Reward": np.asarray(rewards)
                }
                paths.append(path)

                break

    return [summary_ops, summary_vars, paths]
Пример #7
0
def train(sess, env, args, actor, critic, actor_noise, reward_result):
    """Train the actor/critic pair while mixing RL actions with a control prior.

    Each episode first rolls out the prior controller alone to obtain a
    baseline return, then restarts from the same initial state and runs
    the mixed RL/prior action while updating the networks from a replay
    buffer.

    Args:
        sess: Session used to initialise the global variables.
        env: Gym-style environment; ``env.unwrapped.reset(state)`` is
            called with the episode's initial state.
        args: Mapping with ``buffer_size``, ``random_seed``,
            ``max_episodes``, ``max_episode_len`` and ``minibatch_size``.
        actor: Actor network wrapper.
        critic: Critic network wrapper.
        actor_noise: Callable returning exploration noise for the action.
        reward_result: Array indexed as ``[row, episode]``. Row 0 gets the
            learning return and row 1 the baseline return.

    Returns:
        ``[summary_ops, summary_vars, paths]`` where ``paths`` holds one
        dict of recorded arrays per episode that reached a terminal step.
    """
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # Get dynamics and initialize prior controller
    [A, B] = get_linear_dynamics()
    prior = BasePrior(A, B)
    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Loop-invariant settings, converted once instead of on every step.
    max_episode_len = int(args['max_episode_len'])
    minibatch_size = int(args['minibatch_size'])

    # Control prior regularization weight (fixed for the whole run).
    lambda_mix = 5.

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    paths = list()

    for i in range(int(args['max_episodes'])):

        s = env.reset()

        ep_reward = 0.
        ep_ave_max_q = 0

        obs, action, rewards = [], [], []

        # Get optimal reward using optimal control
        s0 = np.copy(s)
        ep_reward_opt = 0.
        for kk in range(max_episode_len):
            a_prior = prior.getControl_h(s0)
            s0, r, stop_c, _ = env.step(a_prior)
            ep_reward_opt += r
            if stop_c:
                break

        # Get reward using regRL algorithm, restarting from the same
        # initial state that the baseline rollout used.
        env.reset()
        s = env.unwrapped.reset(s)

        for j in range(max_episode_len):

            # Prior control
            a_prior = prior.getControl_h(s)

            # Rl control with exploration noise
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()

            # Mix the actions (RL controller + control prior)
            act = a[0] / (1 + lambda_mix) + (lambda_mix /
                                             (1 + lambda_mix)) * a_prior

            # Take action and observe next state/reward
            s2, r, terminal, _ = env.step(act)

            # Add info from time step to the replay buffer. The raw RL
            # action `a` is stored together with the weighted prior action.
            replay_buffer.add(
                np.reshape(s, (actor.s_dim, )), np.reshape(a, (actor.a_dim, )),
                r, terminal, np.reshape(s2, (actor.s_dim, )),
                np.reshape((lambda_mix / (1 + lambda_mix)) * a_prior,
                           (actor.a_dim, )))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > minibatch_size:

                # Sample a batch from the replay buffer (the stored prior
                # actions are not used by the update).
                s_batch, a_batch, r_batch, t_batch, s2_batch, _ = \
                    replay_buffer.sample_batch(minibatch_size)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))
                y_i = []
                for k in range(minibatch_size):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (minibatch_size, 1)))
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

                # Calculate TD-Error for each state
                # NOTE(review): neither result is read afterwards; kept in
                # case the forward passes matter -- confirm and remove.
                base_q = critic.predict_target(s_batch,
                                               actor.predict_target(s_batch))
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

            s = s2
            ep_reward += r

            obs.append(s)
            rewards.append(r)
            action.append(a[0])

            # Collect results at end of episode
            if terminal:
                for ii in range(len(obs)):
                    obs[ii] = obs[ii].reshape((4, 1))
                # j is zero-based, so j + 1 steps have been taken. Dividing
                # by j was off by one and failed when j == 0.
                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward - ep_reward_opt), \
                        i, (ep_ave_max_q / float(j + 1))))
                reward_result[0, i] = ep_reward
                reward_result[1, i] = ep_reward_opt
                path = {
                    "Observation": np.concatenate(obs).reshape((-1, 4)),
                    "Action": np.concatenate(action),
                    "Reward": np.asarray(rewards)
                }
                paths.append(path)
                print(ep_reward)
                break

    return [summary_ops, summary_vars, paths]
Пример #8
0
                               TIMESTAMP)

    #env = gym.make(ENVIRONMENT)
    env = allCars()
    #env = wrappers.Monitor(env, os.path.join(SUMMARY_DIR, ENVIRONMENT), video_callable=None)
    ppo = PPO(env, SUMMARY_DIR, gpu=True)

    if MODEL_RESTORE_PATH is not None:
        ppo.restore_model(MODEL_RESTORE_PATH)

    t, terminal = 0, False
    buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []
    rolling_r = RunningStats()

    # Get prior and set tuning parameters for adaptive regularization weight
    prior = BasePrior()
    lambda_store = np.zeros(BATCH + 1)
    lambda_all = np.zeros(EP_MAX + 1)
    lambda_max = 8
    factor = 0.2

    reward_total, reward_diff = [], []

    for episode in range(EP_MAX + 1):

        # Baseline reward using only control prior
        sp = env.reset_inc()
        reward_prior = 0.
        while True:
            a_prior = env.getPrior()
            sp, reward_p, done_p, _ = env.step(a_prior)
Пример #9
0
    TIMESTAMP = datetime.now().strftime("%Y%m%d-%H%M%S")
    SUMMARY_DIR = os.path.join(OUTPUT_RESULTS_DIR, "PPO", ENVIRONMENT, TIMESTAMP)

    env = gym.make(ENVIRONMENT)
    ppo = PPO(env, SUMMARY_DIR, gpu=True)

    if MODEL_RESTORE_PATH is not None:
        ppo.restore_model(MODEL_RESTORE_PATH)

    t, terminal = 0, False
    buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []
    rolling_r = RunningStats()

    # Initialize control prior
    [A,B] = get_linear_dynamics()
    prior = BasePrior(A,B)
    # Set fixed regularization weight
    # lambda_mix = 4.

    reward_total, reward_diff, reward_lqr_prior, reward_h_prior = [], [], [], []

    for episode in range(EP_MAX + 1):

        # Baseline reward using only control prior
        s0 = env.reset()
        sp = np.copy(s0)
        reward_prior = 0.
        while True:
            a_prior = prior.getControl_h(sp)
            a_prior = np.squeeze(np.asarray(a_prior))
            sp, reward_p, done_p, _ = env.step(a_prior)