Example #1
0
def plot_comparison(dyn_model, state_cb, pub_act, pub_cmd, rate):
    """
    Write a function to generate plots comparing the behavior of the model predictions for each element of the state to the actual ground truth, using randomly sampled actions.
    """
    print('Plotting nn dynamics results')
    rand_cont = RandomController()
    s = state_cb.reset(pub_act, pub_cmd)
    env_state_traj = s
    model_state_traj = s
    steps = 100
    for i in range(steps):
        a = rand_cont.get_action(None)
        # Step environment
        env_s, _ = state_cb.step(a, pub_act, pub_cmd)
        env_state_traj = np.vstack((env_state_traj, env_s))
        # Step model
        if i == 0:
            model_s = dyn_model.predict(model_state_traj, a)
        else:
            model_s = dyn_model.predict(model_state_traj[i, :], a)
        model_state_traj = np.vstack((model_state_traj, model_s))

    body = 10
    # for i in range(body*12,(body+1)*12):
    for i in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 108, 109, 110]:
        plt.figure()
        env_state = plt.plot(np.arange(steps + 1),
                             env_state_traj[:, i].reshape((steps + 1)),
                             label='env state')
        model_state = plt.plot(np.arange(steps + 1),
                               model_state_traj[:, i].reshape((steps + 1)),
                               label='model state')
        # Map the index within the body's 12-dimensional state to a readable title.
        state_names = ['x position', 'y position', 'z position',
                       'x angle', 'y angle', 'z angle',
                       'x velocity', 'y velocity', 'z velocity',
                       'x angular velocity', 'y angular velocity', 'z angular velocity']
        plt.title('Body ' + str(body) + ', ' + state_names[i % 12])
        plt.legend()
        plt.draw()
    plt.show()
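# A minimal, self-contained sketch of the open-loop rollout pattern used above:
# after the first step, the dynamics model is fed its own previous prediction
# rather than the environment state. `dyn_model.predict(state, action)` is
# assumed to return the predicted next state, matching its use in the examples;
# `env_step` and `open_loop_rollout` are hypothetical names introduced here for
# illustration, not part of the original code.
import numpy as np

def open_loop_rollout(dyn_model, controller, s0, env_step, steps=100):
    env_traj, model_traj = [s0], [s0]
    for _ in range(steps):
        a = controller.get_action(None)
        env_traj.append(env_step(a))                              # ground-truth next state
        model_traj.append(dyn_model.predict(model_traj[-1], a))   # model rolls forward on its own prediction
    return np.asarray(env_traj), np.asarray(model_traj)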
Example #2
0
def plot_comparison(dyn_model, pub_act, pub_cmd, rate):
    """
    Write a function to generate plots comparing the behavior of the model predictions for each element of the state to the actual ground truth, using randomly sampled actions.
    """
    rand_cont = RandomController()
    s = reset(pub_cmd, rate)
    env_state_traj = s
    model_state_traj = s
    steps = 100
    for i in range(steps):
        a = rand_cont.get_action(None)
        # Step environment
        env_s, _ = step(a, pub_act, pub_cmd, rate)
        env_state_traj = np.vstack((env_state_traj, env_s))
        # Step model
        if i == 0:
            model_s = dyn_model.predict(model_state_traj, a)
        else:
            model_s = dyn_model.predict(model_state_traj[i, :], a)
        model_state_traj = np.vstack((model_state_traj, model_s))

    for i in range(len(s)):
        plt.figure()
        env_state = plt.plot(np.arange(steps + 1),
                             env_state_traj[:, i].reshape((steps + 1)),
                             label='env state')
        model_state = plt.plot(np.arange(steps + 1),
                               model_state_traj[:, i].reshape((steps + 1)),
                               label='model state')
        plt.title('State ' + str(i))
        plt.legend()
        plt.draw()
    plt.show()
Example #3
def plot_comparison(env, dyn_model):
    """
    Write a function to generate plots comparing the behavior of the model predictions for each element of the state to the actual ground truth, using randomly sampled actions. 
    """
    """ YOUR CODE HERE """
    horizon = 100
    ob = env.reset()
    pred = ob[np.newaxis, :]
    obs, next_obs, acs, rewards = [], [], [], []
    preds = []
    steps = 0
    RC = RandomController(env)
    for _ in range(100):
        obs.append(ob)
        preds.append(pred)
        ac = RC.get_action(ob)
        acs.append(ac)
        ob, rew, done, _ = env.step(ac)
        pred = dyn_model.predict(pred, ac[np.newaxis, :])
        next_obs.append(ob)
        rewards.append(rew)
        steps += 1
        if done or steps > horizon:
            break
    path = {
        "observations": np.array(obs),
        "next_observations": np.array(next_obs),
        "rewards": np.array(rewards),
        "actions": np.array(acs),
        "predictions": np.array(preds)
    }

    print(path['observations'].shape)
    print(path['predictions'].shape)
    plt.plot(path['observations'][:, 0])
    plt.plot(path['predictions'][:, 0, 0])
    plt.show()
    pass
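# A hedged extension of the example above: it only plots state dimension 0,
# but the same `path` layout (with `predictions` carrying a singleton batch
# axis) supports plotting every dimension. `plot_all_dims` is a name introduced
# here for illustration only.
import matplotlib.pyplot as plt

def plot_all_dims(path):
    for d in range(path['observations'].shape[1]):
        plt.figure()
        plt.plot(path['observations'][:, d], label='env state')
        plt.plot(path['predictions'][:, 0, d], label='model state')
        plt.title('State ' + str(d))
        plt.legend()
    plt.show()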
Example #4
0
def train_PG(
             exp_name='',
             env_name='',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=False, 
             animate=True, 
             logdir=None, 
             normalize_advantages=False,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32,

             # mb mpc arguments
             model_learning_rate=1e-3,
             onpol_iters=10,
             dynamics_iters=260,
             batch_size=512,
             num_paths_random=10, 
             num_paths_onpol=10, 
             num_simulated_paths=1000,
             env_horizon=1000, 
             mpc_horizon=10,
             m_n_layers=2,
             m_size=500,
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    # env = gym.make(env_name)
    env = HalfCheetahEnvNew()
    cost_fn = cheetah_cost_fn
    activation=tf.nn.relu
    output_activation=None

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    # max_path_length = max_path_length or env.spec.max_episode_steps
    max_path_length = max_path_length

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # Print environment information
    print("-------- env info --------")
    print("Environment name: ", env_name)
    print("Action space is discrete: ", discrete)
    print("Action space dim: ", ac_dim)
    print("Observation space dim: ", ob_dim)
    print("Max_path_length ", max_path_length)




    #========================================================================================#
    # Random data collection
    #========================================================================================#

    random_controller = RandomController(env)
    data_buffer_model = DataBuffer()
    data_buffer_ppo = DataBuffer_general(10000, 4)

    # sample path
    print("collecting random data .....  ")
    paths = sample(env, 
               random_controller, 
               num_paths=num_paths_random, 
               horizon=env_horizon, 
               render=False,
               verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            data_buffer_model.add(path['observations'][n], path['actions'][n], path['next_observations'][n])

    print("data buffer size: ", data_buffer_model.size)

    normalization = compute_normalization(data_buffer_model)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#
    tf_config = tf.ConfigProto() 
    tf_config.allow_soft_placement = True
    tf_config.intra_op_parallelism_threads = 4
    tf_config.inter_op_parallelism_threads = 1
    sess = tf.Session(config=tf_config)

    dyn_model = NNDynamicsModel(env=env, 
                                n_layers=n_layers, 
                                size=size, 
                                activation=activation, 
                                output_activation=output_activation, 
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env, 
                                   dyn_model=dyn_model, 
                                   horizon=mpc_horizon, 
                                   cost_fn=cost_fn, 
                                   num_simulated_paths=num_simulated_paths)


    policy_nn = policy_network_ppo(sess, ob_dim, ac_dim, discrete, n_layers, size, learning_rate)

    if nn_baseline:
        value_nn = value_network(sess, ob_dim, n_layers, size, learning_rate)

    sess.__enter__() # equivalent to `with sess:`

    tf.global_variables_initializer().run()


    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # `MPC` and `PG` (referenced below) are assumed to be module-level boolean
        # flags defined elsewhere in the original script.
        if MPC:
            dyn_model.fit(data_buffer_model)
        returns = []
        costs = []

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []

        while True:
            # print("data buffer size: ", data_buffer_model.size)
            current_path = {'observations': [], 'actions': [], 'reward': [], 'next_observations':[]}

            ob = env.reset()
            obs, acs, mpc_acs, rewards = [], [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            return_ = 0
 
            while True:
                # print("steps ", steps)
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)

                if MPC:
                    mpc_ac = mpc_controller.get_action(ob)
                else:
                    mpc_ac = random_controller.get_action(ob)

                ac = policy_nn.predict(ob, mpc_ac)

                ac = ac[0]

                if not PG:
                    ac = mpc_ac

                acs.append(ac)
                mpc_acs.append(mpc_ac)

                current_path['observations'].append(ob)

                ob, rew, done, _ = env.step(ac)

                current_path['reward'].append(rew)
                current_path['actions'].append(ac)
                current_path['next_observations'].append(ob)

                return_ += rew
                rewards.append(rew)

                steps += 1
                if done or steps > max_path_length:
                    break


            if MPC:
                # cost & return
                cost = path_cost(cost_fn, current_path)
                costs.append(cost)
                returns.append(return_)
                print("total return: ", return_)
                print("costs: ", cost)

                # add into buffers
                for n in range(len(current_path['observations'])):
                    data_buffer_model.add(current_path['observations'][n], current_path['actions'][n], current_path['next_observations'][n])

            for n in range(len(current_path['observations'])):
                data_buffer_ppo.add(current_path['observations'][n], current_path['actions'][n], current_path['reward'][n], current_path['next_observations'][n])
        
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs),
                    "mpc_action" : np.array(mpc_acs)}



            paths.append(path)
            timesteps_this_batch += pathlength(path)
            # print("timesteps_this_batch", timesteps_this_batch)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch


        print("data_buffer_ppo.size:", data_buffer_ppo.size)


        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        mpc_ac_na = np.concatenate([path["mpc_action"] for path in paths])


        # Computing Q-values
     
        if reward_to_go:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        if t_ >= t:
                            q += gamma**(t_-t) * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)

        else:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        q += gamma**t_ * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)


        # Computing Baselines
        if nn_baseline:

            # b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no :ob_no})
            b_n = value_nn.predict(ob_no)
            b_n = normalize(b_n)
            b_n = denormalize(b_n, np.std(q_n), np.mean(q_n))
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        # Advantage Normalization
        if normalize_advantages:
            adv_n = normalize(adv_n)

        # Optimizing Neural Network Baseline
        if nn_baseline:
            b_n_target = normalize(q_n)
            value_nn.fit(ob_no, b_n_target)
                # sess.run(baseline_update_op, feed_dict={sy_ob_no :ob_no, sy_baseline_target_n:b_n_target})


        # Performing the Policy Update

        # policy_nn.fit(ob_no, ac_na, adv_n)
        policy_nn.fit(ob_no, ac_na, adv_n, mpc_ac_na)

        # sess.run(update_op, feed_dict={sy_ob_no :ob_no, sy_ac_na:ac_na, sy_adv_n:adv_n})

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
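# A standalone sketch of the Q-value computation in the training loop above,
# under the same discounting convention: the full discounted trajectory return
# repeated for every timestep when reward_to_go is False, and the discounted
# tail sum from each timestep otherwise. `discounted_q_values` is a name
# introduced here for illustration only.
import numpy as np

def discounted_q_values(rewards, gamma, reward_to_go):
    rewards = np.asarray(rewards, dtype=np.float64)
    if reward_to_go:
        q = np.zeros_like(rewards)
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running   # q_t = r_t + gamma * q_{t+1}
            q[t] = running
        return q
    total = float(np.sum(rewards * gamma ** np.arange(len(rewards))))
    return np.full(len(rewards), total)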