Example no. 1
    def get_test_batch(self,
                       num_tasks,
                       resample=False,
                       task=None,
                       controller='Rand',
                       task_range=(0, 7),
                       task_fun=np.random.randint):

        if controller == 'Rand':
            self.controller = RandomController(self.env)
        elif controller == "MPC":
            self.controller = MPCcontroller(self.env)

        if resample:
            # random sample
            if task is None:
                learner_env_goals = sample_goals(num_tasks, task_range,
                                                 task_fun)
            else:
                learner_env_goals = task

            for i in range(num_tasks):
                task = learner_env_goals[i]
                paths = sample(self.env,
                               task,
                               self.controller,
                               num_paths=self.num_paths_random,
                               horizon=self.env_horizon,
                               ignore_done=True,
                               K=self.K,
                               M=self.M)  # 10
                data_x, data_y = self._data_process(paths)
                data_x = data_x[np.newaxis, :]
                data_y = data_y[np.newaxis, :]

                if i == 0:
                    x = data_x
                    y = data_y
                else:
                    x = np.concatenate([x, data_x], axis=0)
                    y = np.concatenate([y, data_y], axis=0)

        data_x, data_y = [], []
        for t in range(num_tasks):
            for h in range(self.env_horizon):
                data_x.append(x[t, h:(h + self.K + self.M), :])
                data_y.append(y[t, h:(h + self.K + self.M), :])
        data_x = np.array(data_x)
        data_y = np.array(data_y)

        # dataset = tf.data.Dataset.from_tensor_slices((data_x, data_y)).shuffle(
        # 	buffer_size=self.env_horizon * self.num_tasks).batch(
        # 	self.env_horizon).repeat()
        # # create the iterator
        # iter = dataset.make_one_shot_iterator()
        #
        # iterator = iter.get_next()

        return data_x, data_y
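
For reference, a minimal standalone sketch of the (K + M)-step sliding-window slicing performed by the final loop above. The shapes and names below are illustrative assumptions, not taken from the original codebase; the per-task arrays are assumed to hold at least env_horizon + K + M - 1 timesteps so every window fits.

import numpy as np

num_tasks, env_horizon, K, M, dim = 2, 5, 3, 2, 4
# per-task data with enough timesteps for every window to fit
x = np.arange(num_tasks * (env_horizon + K + M - 1) * dim,
              dtype=np.float32).reshape(num_tasks, env_horizon + K + M - 1, dim)

windows = []
for t in range(num_tasks):
    for h in range(env_horizon):
        # each window holds K context steps followed by M prediction steps
        windows.append(x[t, h:h + K + M, :])
windows = np.array(windows)
print(windows.shape)  # (num_tasks * env_horizon, K + M, dim) -> (10, 5, 4)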
Example no. 2
    def get_dataset(self,
                    resample=False,
                    task=None,
                    controller='Rand',
                    task_range=(0, 7),
                    task_fun=np.random.randint):

        if controller == 'Rand':
            self.controller = RandomController(self.env)
        elif controller == "MPC":
            self.controller = MPCcontroller(self.env)

        if resample:
            # random sample
            if task is None:
                learner_env_goals = sample_goals(self.num_tasks, task_range,
                                                 task_fun)
            else:
                learner_env_goals = task

            for i in range(self.num_tasks):
                task = learner_env_goals[i]
                paths = sample(self.env,
                               task,
                               self.controller,
                               num_paths=self.num_paths_random,
                               horizon=self.env_horizon,
                               ignore_done=True,
                               K=self.K,
                               M=self.M)  # 10
                data_x, data_y = self._data_process(paths)
                data_x = data_x[np.newaxis, :]
                data_y = data_y[np.newaxis, :]

                if i == 0:
                    self.x = data_x
                    self.y = data_y
                else:
                    self.x = np.concatenate([self.x, data_x], axis=0)
                    self.y = np.concatenate([self.y, data_y], axis=0)
        # end = time.time()
        # runtime1 = end - start
        # print('time ', runtime1)
        print('env_horizon:', self.env_horizon)
        print('len of x:', len(self.x))
        return len(self.x)
Example no. 3
def train(
    env,
    cost_fn,
    logdir=None,
    render=False,
    learning_rate=1e-3,
    onpol_iters=10,
    dynamics_iters=60,
    batch_size=512,
    num_paths_random=10,
    num_paths_onpol=10,
    num_simulated_paths=10000,
    env_horizon=1000,
    mpc_horizon=15,
    n_layers=2,
    size=500,
    activation=tf.nn.relu,
    output_activation=None,
    clip_param=0.2,
    entcoeff=0.0,
    gamma=0.99,
    lam=0.95,
    optim_epochs=10,
    optim_batchsize=64,
    schedule='linear',
    bc_lr=1e-3,
    ppo_lr=3e-4,
    timesteps_per_actorbatch=1000,
    MPC=True,
    BEHAVIORAL_CLONING=True,
    PPO=True,
):

    start = time.time()

    logz.configure_output_dir(logdir)
    merged_summary, summary_writer, ppo_return_op, mpc_return_op, model_loss_op, reward_loss_op, ppo_std_op, mpc_std_op = build_summary_ops(
        logdir, env)

    print("-------- env info --------")
    print("Environment: ", FLAGS.env_name)
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("action_space low: ", env.action_space.low)
    print("action_space high: ", env.action_space.high)

    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)
    print("MPC-AUG: ", MPC)

    print(" ")

    random_controller = RandomController(env)

    # Create buffers
    model_data_buffer = DataBufferGeneral(FLAGS.MODELBUFFER_SIZE, 5)
    ppo_data_buffer = DataBufferGeneral(10000, 4)
    bc_data_buffer = DataBufferGeneral(2000, 2)

    # Random sample path

    print("collecting random data .....  ")
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add([
                path['observations'][n], path['actions'][n],
                path['rewards'][n], path['next_observations'][n],
                path['next_observations'][n] - path['observations'][n]
            ])

    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)

    #========================================================
    #
    # Build dynamics model and MPC controllers and Behavioral cloning network.
    #
    # tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)

    tf_config = tf.ConfigProto()

    tf_config.gpu_options.allow_growth = True

    sess = tf.Session(config=tf_config)

    policy_nn = MlpPolicy(sess=sess,
                          env=env,
                          hid_size=128,
                          num_hid_layers=2,
                          clip_param=clip_param,
                          entcoeff=entcoeff)

    if FLAGS.LEARN_REWARD:
        print("Learn reward function")
        dyn_model = NNDynamicsRewardModel(env=env,
                                          normalization=normalization,
                                          batch_size=batch_size,
                                          iterations=dynamics_iters,
                                          learning_rate=learning_rate,
                                          sess=sess)

        mpc_ppo_controller = MPCcontrollerPolicyNetReward(
            env=env,
            dyn_model=dyn_model,
            explore=FLAGS.MPC_EXP,
            policy_net=policy_nn,
            self_exp=FLAGS.SELFEXP,
            horizon=mpc_horizon,
            num_simulated_paths=num_simulated_paths)
    else:
        print("Use predefined cost function")
        dyn_model = NNDynamicsModel(env=env,
                                    n_layers=n_layers,
                                    size=size,
                                    activation=activation,
                                    output_activation=output_activation,
                                    normalization=normalization,
                                    batch_size=batch_size,
                                    iterations=dynamics_iters,
                                    learning_rate=learning_rate,
                                    sess=sess)

        mpc_ppo_controller = MPCcontrollerPolicyNet(
            env=env,
            dyn_model=dyn_model,
            explore=FLAGS.MPC_EXP,
            policy_net=policy_nn,
            self_exp=FLAGS.SELFEXP,
            horizon=mpc_horizon,
            cost_fn=cost_fn,
            num_simulated_paths=num_simulated_paths)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)
    # if not PPO:
    #     mpc_ppo_controller = mpc_controller

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()

    checkpoint = tf.train.get_checkpoint_state(logdir)

    if checkpoint and checkpoint.model_checkpoint_path and FLAGS.LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(logdir):
            os.mkdir(logdir)

    #========================================================
    #
    # Prepare for rollouts
    #

    episodes_so_far = 0
    timesteps_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    max_timesteps = num_paths_onpol * env_horizon
    bc = False
    ppo_mpc = False
    mpc_returns = 0
    model_loss = 0
    reward_loss = 0  # default so the summary feed below is defined even when MPC is off
    for itr in range(onpol_iters):

        print(" ")

        print("onpol_iters: ", itr)

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)

        print("bc learning_rate: ", bc_lr)
        print("ppo learning_rate: ", ppo_lr)

        ################## fit mpc model
        if MPC:
            model_loss, reward_loss = dyn_model.fit(model_data_buffer)

        ################## ppo seg data
        ppo_data_buffer.clear()

        # ppo_seg = traj_segment_generator_ppo(policy_nn, env, env_horizon)
        ppo_mpc = False
        mpc = False
        ppo_seg = traj_segment_generator(policy_nn, mpc_controller,
                                         mpc_ppo_controller, bc_data_buffer,
                                         env, mpc, ppo_mpc, env_horizon)

        add_vtarg_and_adv(ppo_seg, gamma, lam)

        ob, ac, rew, nxt_ob, atarg, tdlamret = \
        ppo_seg["ob"], ppo_seg["ac"], ppo_seg["rew"], ppo_seg["nxt_ob"], ppo_seg["adv"], ppo_seg["tdlamret"]

        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        # add into buffer
        for n in range(len(ob)):
            ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]])
            model_data_buffer.add(
                [ob[n], ac[n], rew[n], nxt_ob[n], nxt_ob[n] - ob[n]])

        ppo_std = np.std(ac, axis=0)
        print("ppo_std: ", ppo_std)

        ################## mpc augmented seg data

        if MPC:
            print("MPC AUG PPO")

            ppo_mpc = True
            mpc = True
            mpc_seg = traj_segment_generator(policy_nn, mpc_controller,
                                             mpc_ppo_controller,
                                             bc_data_buffer, env, mpc, ppo_mpc,
                                             env_horizon)
            add_vtarg_and_adv(mpc_seg, gamma, lam)

            ob, ac, mpcac, rew, nxt_ob, atarg, tdlamret = mpc_seg[
                "ob"], mpc_seg["ac"], mpc_seg["mpcac"], mpc_seg[
                    "rew"], mpc_seg["nxt_ob"], mpc_seg["adv"], mpc_seg[
                        "tdlamret"]
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            mpc_returns = mpc_seg["ep_rets"]
            mpc_std = np.std(mpcac)

        if not MPC:
            mpc_std = 0

        ################## mpc random seg data

        if FLAGS.mpc_rand:
            print("MPC Random base policy")

            ppo_mpc = False
            mpc = True
            mpc_random_seg = traj_segment_generator(policy_nn, mpc_controller,
                                                    mpc_ppo_controller,
                                                    bc_data_buffer, env, mpc,
                                                    ppo_mpc, env_horizon)
            add_vtarg_and_adv(mpc_random_seg, gamma, lam)

            ob, ac, mpcac, rew, nxt_ob, atarg, tdlamret = mpc_random_seg[
                "ob"], mpc_random_seg["ac"], mpc_random_seg[
                    "mpcac"], mpc_random_seg["rew"], mpc_random_seg[
                        "nxt_ob"], mpc_random_seg["adv"], mpc_random_seg[
                            "tdlamret"]
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            mpc_rand_returns = mpc_random_seg["ep_rets"]

        ################# PPO deterministic evaluation
        ppo_determinisitc_return = policy_net_eval(sess,
                                                   env,
                                                   policy_nn,
                                                   env_horizon,
                                                   stochastic=False)

        ################## optimization

        print("ppo_data_buffer size", ppo_data_buffer.size)
        print("bc_data_buffer size", bc_data_buffer.size)
        print("model data buffer size: ", model_data_buffer.size)

        # optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(policy_nn, "ob_rms"):
            policy_nn.ob_rms.update(ob)  # update running mean/std for policy
        policy_nn.assign_old_eq_new(
        )  # set old parameter values to new parameter values

        for op_ep in range(optim_epochs):
            # losses = [] # list of tuples, each of which gives the loss for a minibatch
            # for i in range(int(timesteps_per_actorbatch/optim_batchsize)):

            if PPO:
                sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = ppo_data_buffer.sample(
                    optim_batchsize)
                newlosses = policy_nn.lossandupdate_ppo(
                    sample_ob_no, sample_ac_na, sample_adv_n,
                    sample_b_n_target, cur_lrmult, ppo_lr * cur_lrmult)
                # losses.append(newlosses)

            if BEHAVIORAL_CLONING and bc:
                sample_ob_no, sample_ac_na = bc_data_buffer.sample(
                    optim_batchsize)
                # print("sample_ob_no", sample_ob_no.shape)
                # print("sample_ac_na", sample_ac_na.shape)

                policy_nn.update_bc(sample_ob_no, sample_ac_na,
                                    bc_lr * cur_lrmult)

            if op_ep % (100) == 0 and BEHAVIORAL_CLONING and bc:
                print('epoch: ', op_ep)
                policy_net_eval(sess, env, policy_nn, env_horizon)

        ################## print and save data
        seg = ppo_seg

        ep_lengths = seg["ep_lens"]
        returns = seg["ep_rets"]

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values

        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)

        # log ppo
        logz.log_tabular("TimeSoFar", time.time() - start)
        logz.log_tabular("TimeEp", time.time() - tstart)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.log_tabular("Condition", "PPO")
        logz.dump_tabular()

        # log ppo deterministic
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", ppo_determinisitc_return)
        logz.log_tabular("Condition", "PPO_DETERMINISTIC")
        logz.dump_tabular()

        # log mpc
        if MPC:
            logz.log_tabular("TimeSoFar", time.time() - start)
            logz.log_tabular("TimeEp", time.time() - tstart)
            logz.log_tabular("Iteration", itr)
            logz.log_tabular("AverageReturn", np.mean(mpc_returns))
            logz.log_tabular("StdReturn", np.std(mpc_returns))
            logz.log_tabular("MaxReturn", np.max(mpc_returns))
            logz.log_tabular("MinReturn", np.min(mpc_returns))
            logz.log_tabular("EpLenMean", np.mean(ep_lengths))
            logz.log_tabular("EpLenStd", np.std(ep_lengths))
            logz.log_tabular("TimestepsSoFar", timesteps_so_far)
            logz.log_tabular("Condition", "MPC_PPO")
            logz.dump_tabular()

        if FLAGS.mpc_rand:
            logz.log_tabular("TimeSoFar", time.time() - start)
            logz.log_tabular("TimeEp", time.time() - tstart)
            logz.log_tabular("Iteration", itr)
            logz.log_tabular("AverageReturn", np.mean(mpc_rand_returns))
            logz.log_tabular("StdReturn", np.std(mpc_rand_returns))
            logz.log_tabular("MaxReturn", np.max(mpc_rand_returns))
            logz.log_tabular("MinReturn", np.min(mpc_rand_returns))
            logz.log_tabular("EpLenMean", np.mean(ep_lengths))
            logz.log_tabular("EpLenStd", np.std(ep_lengths))
            logz.log_tabular("TimestepsSoFar", timesteps_so_far)
            logz.log_tabular("Condition", "MPC_RAND")
            logz.dump_tabular()

        # logz.pickle_tf_vars()
        tstart = time.time()

        ################### TF Summaries
        summary_str = sess.run(merged_summary,
                               feed_dict={
                                   ppo_return_op: np.mean(returns),
                                   mpc_return_op: np.mean(mpc_returns),
                                   model_loss_op: model_loss,
                                   ppo_std_op: ppo_std,
                                   reward_loss_op: reward_loss,
                                   mpc_std_op: mpc_std,
                               })
        summary_writer.add_summary(summary_str, itr)
        summary_writer.flush()

        ################ TF SAVE
        if itr % FLAGS.SAVE_ITER == 0 and itr != 0:
            save_path = saver.save(sess, logdir + "/model.ckpt")
            print("Model saved in path: %s" % save_path)
Example no. 4
def main(args):
    tf.set_random_seed(args.seed)
    np.random.seed(args.seed)

    env_name = args.env_name  # HalfCheetah-v2  My3LineDirect-v1
    print(env_name)

    if args.env_name == 'HalfCheetahEnvDisableEnv-v0':
        cost_fn = cheetah_cost_fn
        sample_task_fun = np.random.randint
    elif args.env_name == 'HalfCheetahVaryingEnv-v0':
        cost_fn = cheetah_cost_fn
        sample_task_fun = np.random.uniform
    else:
        print('Unsupported env_name!')

    env = gym.make(env_name)
    dim_input = env.observation_space.shape[0] + env.action_space.shape[0]
    dim_output = env.observation_space.shape[0]

    logdir = configure_log_dir(logname=env_name, txt=args.note)
    # save args parameters
    with open(logdir + '/info.txt', 'wt') as f:
        print('Hello World!\n', file=f)
        print(args, file=f)

    mpc_horizon = args.mpc_horizon
    num_simulated_paths = args.simulated_paths  #10000

    dyn_model = Dynamics(
        args.env_name,
        args.NumOfExp,
        args.model_type,
        args.loss_type,
        dim_input,
        dim_output,
        beta=args.beta,  #args.beta,
        max_epochs=args.max_epochs,
        is_train=args.is_train,
        norm=args.norm,
        task_Note=args.note,
        restore_checkpoint=args.restore_checkpoint,
        restore_dir=args.restore_dir,
        logdir=logdir)

    mpc_controller = MPCcontroller(
        env=env,
        dyn_model=dyn_model,
        horizon=mpc_horizon,
        cost_fn=cost_fn,
        num_simulated_paths=num_simulated_paths,
    )
    logger = Logger(logdir, csvname='log')

    num_itr = args.num_itr
    experiences, costs = [], []
    print('MPC is beginning...')
    for itr in range(num_itr):
        reward, model_loss_mean = rollout(
            env,
            mpc_controller,
            task_goal=args.task_goal,
            dyn_model=dyn_model,
            experiences=experiences,
            NumOfExp=args.NumOfExp,
            horizon=args.horizon,
            cost_fn=cheetah_cost_fn,
            render=False,
            verbose=False,
            save_video=False,
            ignore_done=True,
        )

        #print(time.asctime( time.localtime(time.time()) ), ' itr :', itr, 'Average reward :' , cost)
        log.infov(
            "Itr {}/{} Accumulated Reward: {:.4f}  Model loss mean:{:.4f}".
            format(itr, num_itr, reward, model_loss_mean))

        logger.log({
            'itr': itr,
            'Accumulated Reward': reward,
            'Model loss mean': model_loss_mean,
        })

    print('MPC is over....')

    logger.write(display=False)
Example no. 5
def train(env, 
         cost_fn,
         logdir=None,
         render=False,
         learning_rate=1e-3,
         onpol_iters=10,
         dynamics_iters=60,
         batch_size=512,
         num_paths_random=10, 
         num_paths_onpol=10, 
         num_simulated_paths=10000,
         env_horizon=1000, 
         mpc_horizon=15,
         n_layers=2,
         size=500,
         activation=tf.nn.relu,
         output_activation=None,
         clip_param=0.2,
         entcoeff=0.0,
         gamma=0.99,
         lam=0.95,
         optim_epochs=10,
         optim_batchsize=64,
         schedule='linear',
         bc_lr=1e-3,
         ppo_lr=3e-4,
         timesteps_per_actorbatch=1000,
         MPC=True,
         BEHAVIORAL_CLONING=True,
         PPO=True,
         ):

    start = time.time()

    logz.configure_output_dir(logdir)


    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)
    print("MPC-AUG: ", MPC)
    print(" ")


    # initialize buffers
    model_data_buffer = DataBufferGeneral(1000000, 5)
    ppo_data_buffer = DataBufferGeneral(10000, 4)
    bc_data_buffer = DataBufferGeneral(BC_BUFFER_SIZE, 2)

    # random sample path
    print("collecting random data .....  ")
    random_controller = RandomController(env)
    paths = sample(env, 
               random_controller, 
               num_paths=num_paths_random, 
               horizon=env_horizon, 
               render=False,
               verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add([path['observations'][n],
                                 path['actions'][n], 
                                 path['rewards'][n], 
                                 path['next_observations'][n], 
                                 path['next_observations'][n] - path['observations'][n]])


    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)

    #========================================================
    # 
    # Build dynamics model and MPC controllers and Behavioral cloning network.
    # 
    # tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 
    tf_config = tf.ConfigProto() 

    tf_config.gpu_options.allow_growth = True

    sess = tf.Session(config=tf_config)

    dyn_model = NNDynamicsRewardModel(env=env, 
                                    normalization=normalization,
                                    batch_size=batch_size,
                                    iterations=dynamics_iters,
                                    learning_rate=learning_rate,
                                    sess=sess)

    mpc_controller = MPCcontroller(env=env, 
                                   dyn_model=dyn_model, 
                                   horizon=mpc_horizon, 
                                   cost_fn=cost_fn, 
                                   num_simulated_paths=num_simulated_paths)

    policy_nn = MlpPolicy(sess=sess, env=env, hid_size=256, num_hid_layers=2, clip_param=clip_param , entcoeff=entcoeff)

    mpc_ppo_controller = MPCcontrollerPolicyNetReward(env=env, 
                                   dyn_model=dyn_model, 
                                   policy_net=policy_nn,
                                   self_exp=False,
                                   horizon=mpc_horizon, 
                                   num_simulated_paths=num_simulated_paths)



    #========================================================
    # 
    # Tensorflow session building.
    # 
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()

    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(CHECKPOINT_DIR):
          os.mkdir(CHECKPOINT_DIR)  

    #========================================================
    # 
    # Prepare for rollouts
    # 

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards
    max_timesteps = num_paths_onpol * env_horizon
    bc = False
    ppo_mpc = False
    mpc_returns = 0

    for itr in range(onpol_iters):

        print(" ")

        print("onpol_iters: ", itr)

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult =  max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
            

        print("bc learning_rate: ",  bc_lr)
        print("ppo learning_rate: ",  ppo_lr)


        ################## fit mpc model
        if MPC:
            dyn_model.fit(model_data_buffer)


        ################## ppo seg data
        if PPO:
            ppo_data_buffer.clear()

            # ppo_seg = traj_segment_generator_ppo(policy_nn, env, env_horizon)
            mpc = False
            ppo_seg = traj_segment_generator(policy_nn, mpc_controller, mpc_ppo_controller, bc_data_buffer, env, mpc, ppo_mpc, env_horizon)

            add_vtarg_and_adv(ppo_seg, gamma, lam)

            ob, ac, rew, nxt_ob, atarg, tdlamret = \
            ppo_seg["ob"], ppo_seg["ac"], ppo_seg["rew"], ppo_seg["nxt_ob"], ppo_seg["adv"], ppo_seg["tdlamret"]

            atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate

            # add into buffer
            for n in range(len(ob)):
                ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]])

                if MPC:
                    model_data_buffer.add([ob[n], ac[n], rew[n], nxt_ob[n], nxt_ob[n]-ob[n]])


        ################## mpc augmented seg data

        if itr % MPC_AUG_GAP == 0 and MPC:
            print("MPC AUG PPO")

            ppo_mpc = True
            mpc = True
            mpc_seg = traj_segment_generator(policy_nn, mpc_controller, mpc_ppo_controller, bc_data_buffer, env, mpc, ppo_mpc, env_horizon)
            add_vtarg_and_adv(mpc_seg, gamma, lam)

            ob, ac, mpcac, rew, nxt_ob, atarg, tdlamret = mpc_seg["ob"], mpc_seg["ac"], mpc_seg["mpcac"], mpc_seg["rew"], mpc_seg["nxt_ob"], mpc_seg["adv"], mpc_seg["tdlamret"]
            atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate

            # add into buffer
            for n in range(len(ob)):
                # if PPO:
                #     ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]])

                if BEHAVIORAL_CLONING and bc:
                    bc_data_buffer.add([ob[n], mpcac[n]])

                if MPC:
                    model_data_buffer.add([ob[n], mpcac[n], rew[n], nxt_ob[n], nxt_ob[n]-ob[n]])

            mpc_returns = mpc_seg["ep_rets"]

        seg = ppo_seg

        # check if seg is good
        ep_lengths = seg["ep_lens"]
        returns =  seg["ep_rets"]

        # saver.save(sess, CHECKPOINT_DIR)
        if BEHAVIORAL_CLONING:
            if np.mean(returns) > 100:
                bc = True
            else:
                bc = False

            print("BEHAVIORAL_CLONING: ", bc)


            bc_return = behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

            if bc_return > 100:
                ppo_mpc = True
            else:
                ppo_mpc = False


        ################## optimization

        print("ppo_data_buffer size", ppo_data_buffer.size)
        print("bc_data_buffer size", bc_data_buffer.size)
        print("model data buffer size: ", model_data_buffer.size)

        # optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(policy_nn, "ob_rms"): policy_nn.ob_rms.update(ob) # update running mean/std for policy
        policy_nn.assign_old_eq_new() # set old parameter values to new parameter values
        
        for op_ep in range(optim_epochs):
            # losses = [] # list of tuples, each of which gives the loss for a minibatch
            # for i in range(int(timesteps_per_actorbatch/optim_batchsize)):

            if PPO:
                sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = ppo_data_buffer.sample(optim_batchsize)
                newlosses = policy_nn.lossandupdate_ppo(sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target, cur_lrmult, ppo_lr*cur_lrmult)
                # losses.append(newlosses)

            if BEHAVIORAL_CLONING and bc:
                sample_ob_no, sample_ac_na = bc_data_buffer.sample(optim_batchsize)
                # print("sample_ob_no", sample_ob_no.shape)
                # print("sample_ac_na", sample_ac_na.shape)

                policy_nn.update_bc(sample_ob_no, sample_ac_na, bc_lr*cur_lrmult)

            if op_ep % (100) == 0 and BEHAVIORAL_CLONING and bc:
                print('epoch: ', op_ep)
                behavioral_cloning_eval(sess, env, policy_nn, env_horizon)


        ################## print and save data

        lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values


        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1



        # if np.mean(returns) > 1000:
        #     filename = "seg_data.pkl"
        #     pickle.dump(seg, open(filename, 'wb'))
        #     print("saved", filename)


        logz.log_tabular("TimeSoFar", time.time() - start)
        logz.log_tabular("TimeEp", time.time() - tstart)
        logz.log_tabular("Iteration", iters_so_far)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("MpcReturn", np.mean(mpc_returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.dump_tabular()
        logz.pickle_tf_vars()
        tstart = time.time()
Example no. 6
def train(env,
          cost_fn,
          exp_name='test',
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """
    Args:
        onpol_iters: Number of iterations of onpolicy aggregation for the loop
                     to run. 

        dynamics_iters: Number of iterations of training for the dynamics model
        |_              which happen per iteration of the aggregation loop.

        batch_size: Batch size for dynamics training.

        num_paths_random: Number of paths/trajectories/rollouts generated 
        |                 by a random agent. We use these to train our 
        |_                initial dynamics model.
    
        num_paths_onpol: Number of paths to collect at each iteration of
        |_               aggregation, using the MPC policy.

        num_simulated_paths: How many fictitious rollouts the MPC policy
        |                    should generate each time it is asked for an
        |_                   action.

        env_horizon: Number of timesteps in each path.

        mpc_horizon: The MPC policy generates actions by imagining 
        |            fictitious rollouts, and picking the first action
        |            of the best fictitious rollout. This argument is
        |            how many timesteps should be in each fictitious
        |_           rollout.

        n_layers/size/activations: Neural network architecture arguments. 
    """
    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train)[0]
    locals_ = locals()
    locals_['cost_fn'] = 'cost_fn'
    locals_['activation'] = 'activation'
    locals_['env'] = 'env'
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)

    paths = sample(env=env,
                   controller=random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   verbose=False)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    normalization = {
        "observations":
        compute_normalization(paths["observations"]),
        "actions":
        compute_normalization(paths["actions"]),
        "deltas":
        compute_normalization(paths["next_observations"] -
                              paths["observations"])
    }

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation at each iteration
    # refitting the dynamics model to current dataset and then taking onpolicy
    # samples and aggregating to the dataset.
    # TODO: implement mixing ratio for new and old data as described in
    # https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):

        shuffle_indexes = np.random.permutation(paths["observations"].shape[0])
        for key in ['observations', 'actions', 'next_observations', 'rewards']:
            paths[key] = paths[key][shuffle_indexes]

        dyn_model.fit(paths)

        newpaths = sample(env=env,
                          controller=mpc_controller,
                          num_paths=num_paths_onpol,
                          horizon=env_horizon,
                          verbose=False)

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        costs = path_cost(cost_fn, newpaths)
        returns = newpaths["acc_rewards"]

        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory
        # using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))
        logz.dump_tabular()

        for key in ['observations', 'actions', 'next_observations', 'rewards']:
            paths[key] = np.concatenate([paths[key], newpaths[key]])
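
compute_normalization is not included in these snippets; under the assumption that it simply returns per-dimension mean and standard deviation (which is what the comment above describes it being used for), a rough numpy equivalent might look like the sketch below. compute_normalization_sketch is a hypothetical name, not the project's actual helper.

import numpy as np

def compute_normalization_sketch(data, eps=1e-8):
    # per-dimension mean/std, used to normalize dynamics-network inputs and
    # denormalize its predicted deltas; eps guards against zero variance
    data = np.asarray(data)
    return data.mean(axis=0), data.std(axis=0) + eps

obs = np.random.randn(100, 17)  # e.g. HalfCheetah-sized observations (assumed)
mean, std = compute_normalization_sketch(obs)
normalized = (obs - mean) / std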
Example no. 7
                        }

                        for tag, value in info.items():
                            logger.scalar_summary(tag, value, i + 1)

            print('Epoch ', (epoch + 1), '/', epoch_size,
                  'Train loss %.3f' % loss_train.data[0],
                  'Validation loss %.3f' % loss.data[0])


env = gym.make(env_name)

mpc_controller = MPCcontroller(
    env=env,
    dyn_model=dyn_model,
    horizon=mpc_horizon,
    cost_fn=cost_fn,
    num_simulated_paths=num_simulated_paths,
)

dataset = MotionDataset(Trainset_file)
train_loader = DataLoader(dataset=dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=0)

test_dataset = MotionDataset(Testset_file)
test_loader = DataLoader(dataset=test_dataset,
                         batch_size=test_dataset.len,
                         shuffle=True,
                         num_workers=0)
Example no. 8
def main():
    nb_total_steps = 1000
    nb_iterations = 40
    hidden_layers = [256, 256]
    writer = tensorboardX.SummaryWriter()

    args = parse_args(__doc__, ['env'])


    env = gym.make(args.env) 

    ctrl = rand_ctrl = RandomController(env)


    # ipdb.set_trace()
    print('#inputs : %d' % ctrl.nb_inputs())
    print('#actions: %d' % ctrl.nb_actions())

    # f_net = make_net(
    #     [ctrl.nb_inputs() + ctrl.nb_actions()] + hidden_layers + [ctrl.nb_inputs()],
    #     [nn.ReLU() for _ in hidden_layers],
    # )
    f_net = MOENetwork(
        nb_inputs=ctrl.nb_inputs() + ctrl.nb_actions(),
        nb_experts=4,
        gait_layers=[64],
        expert_layers=[64, ctrl.nb_inputs()],
    )

    data = collect_data(env, ctrl, nb_total_steps*10)


    # ipdb.set_trace()

    dynamics = DynamicsModel(env, f_net, data.get_all(), writer=writer)
    # cost_func = lambda s,a,sn: -sn[3].item()  # refers to vx
    cost_func = get_cost(args.env)

    # data.calc_normalizations()
    # dynamics.fit(data)

    mpc_ctrl = MPCcontroller(env, dynamics.predict, cost_func, num_simulated_paths=100, horizon=10, num_mpc_steps=10)
    eval_args = EvaluationArgs(nb_burnin_steps=4, nb_episodes=10, horizons=[1, 2, 4, 8, 16, 32])

    for i in range(nb_iterations):
        print('Iteration', i)
        new_data = collect_data(env, ctrl, nb_total_steps)
        dynamics.fit(*new_data.get_all())
        data.extend(new_data)
        dynamics.fit(*data.sample(sample_size=4*nb_total_steps))
        evaluate_and_log_dynamics(
            dynamics.predict, env, rand_ctrl, writer=writer, i_step=i, args=eval_args
        )
        evaluate_and_log_dynamics(
            dynamics.predict, env, mpc_ctrl, writer=writer, i_step=i, args=eval_args
        )
        # dynamics.fit(*data.get_all())
        if random.random() > 0.5:
            ctrl = rand_ctrl
        else:
            ctrl = mpc_ctrl
    
    env = gym.make(args.env)

    ctrl = MPCcontroller(env, dynamics.predict, cost_func, num_simulated_paths=1000, num_mpc_steps=4)

    # TODO

    env.render(mode='human')
    obs = env.reset()

    for _ in range(100):
        # time.sleep(1. / 60.)
        obs, r, done, _ = env.step(ctrl.get_action(obs))
        # print('  ', cost_func(obs))
        if done:
            print("done:", r, obs)
            time.sleep(1)
            ctrl.reset()
            obs = env.reset()
    ipdb.set_trace()
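
The MPCcontroller class itself is not shown in any of these examples; the docstrings earlier describe it as random-shooting MPC (imagine many fictitious action sequences under the learned dynamics, then return the first action of the cheapest one). A minimal sketch of that idea follows, with hypothetical predict/cost_fn/action_space arguments that stand in for the real interfaces:

import numpy as np

def random_shooting_action(obs, predict, cost_fn, action_space,
                           num_simulated_paths=100, horizon=10):
    # sample num_simulated_paths random action sequences of length horizon
    actions = np.stack([np.stack([action_space.sample() for _ in range(horizon)])
                        for _ in range(num_simulated_paths)])  # (P, H, act_dim)
    states = np.repeat(obs[None, :], num_simulated_paths, axis=0)
    costs = np.zeros(num_simulated_paths)
    for h in range(horizon):
        next_states = predict(states, actions[:, h])  # batched learned-dynamics step
        costs += cost_fn(states, actions[:, h], next_states)
        states = next_states
    return actions[np.argmin(costs), 0]  # first action of the best fictitious rollout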
Example no. 9
def train(
    env,
    cost_fn,
    logdir=None,
    render=False,
    learning_rate=1e-3,
    onpol_iters=10,
    dynamics_iters=60,
    batch_size=512,
    num_paths_random=10,
    num_paths_onpol=10,
    num_simulated_paths=10000,
    env_horizon=1000,
    mpc_horizon=15,
    n_layers=2,
    size=500,
    activation=tf.nn.relu,
    output_activation=None,
    clip_param=0.2,
    entcoeff=0.0,
    gamma=0.99,
    lam=0.95,
    optim_epochs=10,
    optim_batchsize=64,
    schedule='linear',
    optim_stepsize=3e-4,
    timesteps_per_actorbatch=1000,
    BEHAVIORAL_CLONING=True,
    PPO=True,
):

    start = time.time()

    logz.configure_output_dir(logdir)

    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)

    print(" ")

    random_controller = RandomController(env)
    model_data_buffer = DataBuffer()

    ppo_data_buffer = DataBuffer_general(BC_BUFFER_SIZE, 6)
    bc_data_buffer = DataBuffer_general(BC_BUFFER_SIZE, 2)

    # sample path
    print("collecting random data .....  ")
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add(path['observations'][n], path['actions'][n],
                                  path['next_observations'][n])

    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)

    #========================================================
    #
    # Build dynamics model and MPC controllers and Behavioral cloning network.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    policy_nn = MlpPolicy_bc(sess=sess,
                             env=env,
                             hid_size=64,
                             num_hid_layers=2,
                             clip_param=clip_param,
                             entcoeff=entcoeff)

    bc_net = BCnetwork(sess, env, BATCH_SIZE_BC, learning_rate)

    mpc_controller_bc_ppo = MPCcontroller_BC_PPO(
        env=env,
        dyn_model=dyn_model,
        bc_ppo_network=policy_nn,
        horizon=mpc_horizon,
        cost_fn=cost_fn,
        num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()

    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)

    #========================================================
    #
    # Prepare for rollouts
    #

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    max_timesteps = num_paths_onpol * env_horizon

    for itr in range(onpol_iters):

        print("onpol_iters: ", itr)
        dyn_model.fit(model_data_buffer)

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)

        # saver.save(sess, CHECKPOINT_DIR)
        behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

        ppo_data_buffer.clear()
        seg = traj_segment_generator(policy_nn, mpc_controller,
                                     mpc_controller_bc_ppo, bc_data_buffer,
                                     env, env_horizon)
        add_vtarg_and_adv(seg, gamma, lam)

        ob, ac, rew, nxt_ob, atarg, tdlamret = seg["ob"], seg["ac"], seg[
            "rew"], seg["nxt_ob"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        for n in range(len(ob)):
            ppo_data_buffer.add(
                (ob[n], ac[n], rew[n], nxt_ob[n], atarg[n], tdlamret[n]))
            bc_data_buffer.add((ob[n], ac[n]))
            model_data_buffer.add(ob[n], ac[n], nxt_ob[n])

        print("ppo_data_buffer size", ppo_data_buffer.size)
        print("bc_data_buffer size", bc_data_buffer.size)
        print("model data buffer size: ", model_data_buffer.size)

        # optim_batchsize = optim_batchsize or ob.shape[0]

        # behavioral_cloning(sess, env, bc_net, mpc_controller, env_horizon, bc_data_buffer, Training_epoch=1000)

        if hasattr(policy_nn, "ob_rms"):
            policy_nn.ob_rms.update(ob)  # update running mean/std for policy
        policy_nn.assign_old_eq_new(
        )  # set old parameter values to new parameter values

        for op_ep in range(optim_epochs):
            # losses = [] # list of tuples, each of which gives the loss for a minibatch
            # for i in range(int(timesteps_per_actorbatch/optim_batchsize)):

            if PPO:
                sample_ob_no, sample_ac_na, sample_rew, sample_nxt_ob_no, sample_adv_n, sample_b_n_target = ppo_data_buffer.sample(
                    optim_batchsize)
                newlosses = policy_nn.lossandupdate_ppo(
                    sample_ob_no, sample_ac_na, sample_adv_n,
                    sample_b_n_target, cur_lrmult, optim_stepsize * cur_lrmult)
                # losses.append(newlosses)

            if BEHAVIORAL_CLONING:
                sample_ob_no, sample_ac_na = bc_data_buffer.sample(
                    optim_batchsize)
                policy_nn.update_bc(sample_ob_no, sample_ac_na,
                                    optim_stepsize * cur_lrmult)

            if op_ep % 100 == 0:
                print('epoch: ', op_ep)
                behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values

        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        ep_lengths = seg["ep_lens"]
        returns = seg["ep_rets"]

        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", iters_so_far)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.dump_tabular()
        logz.pickle_tf_vars()
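
add_vtarg_and_adv is also not reproduced in these snippets; the surrounding code closely mirrors OpenAI Baselines ppo1, where that helper computes GAE(lambda) advantages and lambda-return value targets. A sketch of that computation, assuming seg additionally carries "new" (episode-end flags), "vpred", and "nextvpred" as in Baselines:

import numpy as np

def add_vtarg_and_adv_sketch(seg, gamma, lam):
    # generalized advantage estimation: adv[t] = delta[t] + gamma*lam*adv[t+1]
    new = np.append(seg["new"], 0)  # 1 where an episode ended at step t
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    T = len(seg["rew"])
    adv = np.zeros(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        adv[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["adv"] = adv
    seg["tdlamret"] = adv + vpred[:-1]  # lambda-return targets for the value function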
Example no. 10
def train(env, 
         cost_fn,
         logdir=None,
         render=False,
         learning_rate=1e-3,
         onpol_iters=10,
         dynamics_iters=60,
         batch_size=512,
         num_paths_random=10, 
         num_paths_onpol=10, 
         num_simulated_paths=10000,
         env_horizon=1000, 
         mpc_horizon=15,
         n_layers=2,
         size=500,
         activation=tf.nn.relu,
         output_activation=None,
         clip_param=0.2,
         entcoeff=0.0,
         gamma=0.99,
         lam=0.95,
         optim_epochs=10,
         optim_batchsize=64,
         schedule='linear',
         bc_lr=1e-3,
         ppo_lr=3e-4,
         timesteps_per_actorbatch=1000,
         MPC=True,
         BEHAVIORAL_CLONING=True,
         PPO=True,
         ):

    start = time.time()


    print("-------- env info --------")
    print("Environment: ", FLAGS.env_name)
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("action_space low: ", env.action_space.low)
    print("action_space high: ", env.action_space.high)

    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)
    print("MPC-AUG: ", MPC)

    print(" ")


    random_controller = RandomController(env)

    # Create buffers
    model_data_buffer = DataBufferGeneral(FLAGS.MODELBUFFER_SIZE, 5)
    ppo_data_buffer = DataBufferGeneral(10000, 4)
    bc_data_buffer = DataBufferGeneral(2000, 2)

    # Random sample path

    print("collecting random data .....  ")
    paths = sample(env, 
               random_controller, 
               num_paths=num_paths_random, 
               horizon=env_horizon, 
               render=False,
               verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add([path['observations'][n],
                                 path['actions'][n], 
                                 path['rewards'][n], 
                                 path['next_observations'][n], 
                                 path['next_observations'][n] - path['observations'][n]])

    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)

    #========================================================
    # 
    # Build dynamics model and MPC controllers and Behavioral cloning network.
    # 
    # tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 

    tf_config = tf.ConfigProto() 

    tf_config.gpu_options.allow_growth = True

    sess = tf.Session(config=tf_config)

    policy_nn = MlpPolicy(sess=sess, env=env, hid_size=128, num_hid_layers=2, clip_param=clip_param , entcoeff=entcoeff)

    if FLAGS.LEARN_REWARD:
        print("Learn reward function")
        dyn_model = NNDynamicsRewardModel(env=env, 
                                        normalization=normalization,
                                        batch_size=batch_size,
                                        iterations=dynamics_iters,
                                        learning_rate=learning_rate,
                                        sess=sess)

        mpc_ppo_controller = MPCcontrollerPolicyNetReward(env=env, 
                                       dyn_model=dyn_model, 
                                       explore=FLAGS.MPC_EXP,
                                       policy_net=policy_nn,
                                       self_exp=FLAGS.SELFEXP,
                                       horizon=mpc_horizon, 
                                       num_simulated_paths=num_simulated_paths)
    else:
        print("Use predefined cost function")
        dyn_model = NNDynamicsModel(env=env, 
                                    n_layers=n_layers, 
                                    size=size, 
                                    activation=activation, 
                                    output_activation=output_activation, 
                                    normalization=normalization,
                                    batch_size=batch_size,
                                    iterations=dynamics_iters,
                                    learning_rate=learning_rate,
                                    sess=sess)

        mpc_ppo_controller = MPCcontrollerPolicyNet(env=env, 
                                       dyn_model=dyn_model, 
                                       explore=FLAGS.MPC_EXP,
                                       policy_net=policy_nn,
                                       self_exp=FLAGS.SELFEXP,
                                       horizon=mpc_horizon, 
                                       cost_fn=cost_fn, 
                                       num_simulated_paths=num_simulated_paths)

    mpc_controller = MPCcontroller(env=env, 
                                   dyn_model=dyn_model, 
                                   horizon=mpc_horizon, 
                                   cost_fn=cost_fn, 
                                   num_simulated_paths=num_simulated_paths)
    # if not PPO:
    #     mpc_ppo_controller = mpc_controller

    #========================================================
    # 
    # Tensorflow session building.
    # 
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()

    checkpoint = tf.train.get_checkpoint_state(FLAGS.model_path)

    print("checkpoint", checkpoint)

    if checkpoint and checkpoint.model_checkpoint_path and FLAGS.LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(FLAGS.model_path):
          os.mkdir(FLAGS.model_path)  

    #========================================================
    # 
    # Prepare for rollouts
    # 

    tstart = time.time()


    states_true = []
    states_predict = []
    rewards_true = []
    rewards_predict = []
    ob = env.reset()
    ob_pre = np.expand_dims(ob, axis=0)

    states_true.append(ob)
    states_predict.append(ob_pre)

    for step in range(100):
        # ac = env.action_space.sample() # not used, just so we have the datatype
        ac, _ = policy_nn.act(ob, stochastic=True)
        ob, rew, done, _ = env.step(ac)
        ob_pre, r_pre = dyn_model.predict(ob_pre, ac)
        states_true.append(ob)
        rewards_true.append(rew)
        states_predict.append(ob_pre)
        rewards_predict.append(r_pre[0][0])

    states_true = np.asarray(states_true)
    states_predict = np.asarray(states_predict)
    states_predict = np.squeeze(states_predict, axis=1)
    rewards_true = np.asarray(rewards_true)
    rewards_predict = np.asarray(rewards_predict)

    print("states_true", states_true.shape)
    print("states_predict", states_predict.shape)
    print("rewards_true", rewards_true.shape)
    print("rewards_predict", rewards_predict.shape)

    np.savetxt('./data/eval_model/states_true.out', states_true, delimiter=',') 
    np.savetxt('./data/eval_model/states_predict.out', states_predict, delimiter=',') 

    np.savetxt('./data/eval_model/rewards_true.out', rewards_true, delimiter=',') 
    np.savetxt('./data/eval_model/rewards_predict.out', rewards_predict, delimiter=',') 
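
The four .out files written above can be checked offline. Below is a minimal sketch (not part of the original script) that reloads the reward dumps with numpy and plots the learned reward model against the true environment reward, assuming matplotlib is available:

import numpy as np
import matplotlib.pyplot as plt

# Reload the comma-delimited dumps written by the evaluation script above.
rewards_true = np.loadtxt('./data/eval_model/rewards_true.out', delimiter=',')
rewards_predict = np.loadtxt('./data/eval_model/rewards_predict.out', delimiter=',')

# Step-by-step comparison of the predicted reward vs. the environment reward.
plt.plot(rewards_true, label='true reward')
plt.plot(rewards_predict, label='predicted reward')
plt.xlabel('step')
plt.ylabel('reward')
plt.legend()
plt.savefig('./data/eval_model/reward_comparison.png')
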
Exemplo n.º 11
0
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run. 

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated 
    |                           by a random agent. We use these to train our 
    |_                          initial dynamics model.
    
    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining 
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments. 

    """

    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    print(params)
    # the three lines below are to override the functions passed in, which aren't serializable
    params["activation"] = "relu"
    params["cost_fn"] = "cheetah_cost_fn"
    params["env"] = "HalfCheetahEnvNew"
    logz.save_params(params)

    returns_file = "returns.csv"
    returns_array = []

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)
    """ YOUR CODE HERE """
    data = sample(env,
                  random_controller,
                  num_paths=num_paths_random,
                  horizon=env_horizon,
                  render=False,
                  verbose=False)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    """ YOUR CODE HERE """
    normalization = compute_normalization(data)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then
    # taking onpolicy samples and aggregating to the dataset.
    # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in
    # https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):
        """ YOUR CODE HERE """

        print(itr)
        # learn/fit dynamics model using the Adam optimization algorithm
        loss = dyn_model.fit(data)
        print(loss)

        # sample a set of on-policy trajectories from the environment
        new_data = sample(env,
                          mpc_controller,
                          num_paths=num_paths_onpol,
                          horizon=env_horizon,
                          render=render,
                          verbose=False)

        # append transition to dataset
        data += new_data

        # compute costs
        costs = np.array([path_cost(cost_fn, path) for path in new_data])
        print(costs)

        # compute returns
        returns = np.array(
            [new_data[i]["returns"] for i in range(len(new_data))])
        print(returns)

        returns_array.append(returns)
        np.array(returns_array).dump(returns_file)

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
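
path_cost, trajectory_cost_fn, and compute_normalization are imported from elsewhere and not reproduced in this listing. The sketch below is an assumption of what they might look like, consistent with how the loop above calls them and with the 6-tuple that some later examples unpack from compute_normalization; it is not the original implementation.

import numpy as np

def trajectory_cost_fn(cost_fn, states, actions, next_states):
    # Sum the per-step cost cost_fn(s, a, s') along one trajectory.
    return sum(cost_fn(s, a, s_next)
               for s, a, s_next in zip(states, actions, next_states))

def path_cost(cost_fn, path):
    # Cost of a single sampled path dict.
    return trajectory_cost_fn(cost_fn, path["observations"],
                              path["actions"], path["next_observations"])

def compute_normalization(paths):
    # Per-dimension mean/std for observations, deltas (o_{t+1} - o_t), and actions.
    obs = np.concatenate([p["observations"] for p in paths])
    acts = np.concatenate([p["actions"] for p in paths])
    next_obs = np.concatenate([p["next_observations"] for p in paths])
    deltas = next_obs - obs
    eps = 1e-9  # keep downstream divisions well defined
    return (obs.mean(axis=0), obs.std(axis=0) + eps,
            deltas.mean(axis=0), deltas.std(axis=0) + eps,
            acts.mean(axis=0), acts.std(axis=0) + eps)
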
Exemplo n.º 12
0
def train(env, 
         cost_fn,
         logdir=None,
         render=False,
         learning_rate=1e-3,
         onpol_iters=10,
         dynamics_iters=60,
         batch_size=512,
         num_paths_random=10, 
         num_paths_onpol=10, 
         num_simulated_paths=10000,
         env_horizon=1000, 
         mpc_horizon=15,
         n_layers=2,
         size=500,
         activation=tf.nn.relu,
         output_activation=None
         ):
    # tracker = SummaryTracker()

    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run. 

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated 
    |                           by a random agent. We use these to train our 
    |_                          initial dynamics model.
    
    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining 
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments. 

    """

    logz.configure_output_dir(logdir)

    #========================================================
    # 
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    """ YOUR CODE HERE """

    # Print env info
    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print(" ")


    random_controller = RandomController(env)
    data_buffer = DataBuffer()
    bc_data_buffer = DataBuffer_SA(BC_BUFFER_SIZE)

    # sample path
    print("collecting random data .....  ")
    paths = sample(env, 
               random_controller, 
               num_paths=num_paths_random, 
               horizon=env_horizon, 
               render=False,
               verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            data_buffer.add(path['observations'][n], path['actions'][n], path['next_observations'][n])



    #========================================================
    # 
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network. 
    # 
    print("data buffer size: ", data_buffer.size)

    normalization = compute_normalization(data_buffer)

    #========================================================
    # 
    # Build the dynamics model, the MPC controllers, and the behavioral cloning network.
    # 
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env, 
                                n_layers=n_layers, 
                                size=size, 
                                activation=activation, 
                                output_activation=output_activation, 
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env, 
                                   dyn_model=dyn_model, 
                                   horizon=mpc_horizon, 
                                   cost_fn=cost_fn, 
                                   num_simulated_paths=num_simulated_paths)

    bc_net = BCnetwork(sess, env, BATCH_SIZE_BC, learning_rate)

    mpc_controller_bc = MPCcontroller_BC(env=env, 
                                   dyn_model=dyn_model, 
                                   bc_network=bc_net,
                                   horizon=mpc_horizon, 
                                   cost_fn=cost_fn, 
                                   num_simulated_paths=num_simulated_paths)


    #========================================================
    # 
    # Tensorflow session building.
    # 
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()

    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)
    #========================================================
    # 
    # Take multiple iterations of onpolicy aggregation: at each iteration,
    # refit the dynamics model to the current dataset, then take onpolicy
    # samples and aggregate them into the dataset.
    # Note: you don't need to use a mixing ratio in this assignment for new
    # and old data, as described in https://arxiv.org/abs/1708.02596
    # 

    for itr in range(onpol_iters):
        """ YOUR CODE HERE """
        print("onpol_iters: ", itr)

        dyn_model.fit(data_buffer)

        # save under a file prefix inside CHECKPOINT_DIR so that
        # tf.train.get_checkpoint_state(CHECKPOINT_DIR) finds it on reload
        saver.save(sess, os.path.join(CHECKPOINT_DIR, 'model'))

        returns = []
        costs = []

        for w in range(num_paths_onpol):
            print("paths_onpol: ", w, " running.....")
            print("data buffer size: ", data_buffer.size)

            st = env.reset_model()
            path = {'observations': [], 'actions': [], 'next_observations':[]}
            # tracker.print_diff()

            return_ = 0

            for i in range(env_horizon):
                if render:
                    env.render()
                # print("env_horizon: ", i)   

                if BEHAVIORAL_CLONING:
                    if bc_data_buffer.size > 2000:
                        at = mpc_controller_bc.get_action(st)
                    else:
                        at = mpc_controller.get_action(st)
                else:
                    at = mpc_controller.get_action(st)
                    # at = random_controller.get_action(st)

                st_next, env_reward, _, _ = env._step(at)
                path['observations'].append(st)
                path['actions'].append(at)
                path['next_observations'].append(st_next)
                st = st_next
                return_ += env_reward

            # cost & return
            cost = path_cost(cost_fn, path)
            costs.append(cost)
            returns.append(return_)
            print("total return: ", return_)
            print("costs: ", cost)

            # add into buffers
            for n in range(len(path['observations'])):
                data_buffer.add(path['observations'][n], path['actions'][n], path['next_observations'][n])
                bc_data_buffer.add(path['observations'][n], path['actions'][n])

        if BEHAVIORAL_CLONING:
            behavioral_cloning(sess, env, bc_net, mpc_controller, env_horizon, bc_data_buffer, Training_epoch=1000)




        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # logz.log_tabular('Average_BC_Return', np.mean(bc_returns))

        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
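
DataBuffer and DataBuffer_SA come from a separate module. Below is a minimal sketch of the interface the loop above relies on (an add(...) method plus a size attribute), under the assumption that DataBuffer_SA is a bounded state-action buffer used for behavioral cloning; the original classes may differ.

import random
from collections import deque

class DataBuffer(object):
    """Unbounded buffer of (observation, action, next_observation) transitions."""
    def __init__(self):
        self.data = []
        self.size = 0

    def add(self, obs, action, next_obs):
        self.data.append((obs, action, next_obs))
        self.size += 1

    def sample(self, batch_size):
        # Uniform sample without replacement, capped at the buffer size.
        return random.sample(self.data, min(batch_size, self.size))


class DataBuffer_SA(object):
    """Bounded buffer of (observation, action) pairs."""
    def __init__(self, max_size):
        self.data = deque(maxlen=max_size)

    @property
    def size(self):
        return len(self.data)

    def add(self, obs, action):
        self.data.append((obs, action))
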
Exemplo n.º 13
0
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run. 

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated 
    |                           by a random agent. We use these to train our 
    |_                          initial dynamics model.
    
    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining 
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments. 

    """

    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    del params['cost_fn']
    del params['activation']
    del params['output_activation']
    del params['env']
    logz.save_params(params)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)
    """ YOUR CODE HERE """
    # Sample from random controller
    paths = sample(env, random_controller, num_paths_random, env_horizon,
                   render, True)
    # Build data set
    data = dict()
    data['observations'] = np.concatenate(
        [path['observations'] for path in paths])
    data['actions'] = np.concatenate([path['actions'] for path in paths])
    next_observations = np.concatenate(
        [path['next_observations'] for path in paths])
    data['deltas'] = next_observations - data['observations']

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    """ YOUR CODE HERE """
    normalization = compute_normalization(data)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation: at each iteration,
    # refit the dynamics model to the current dataset, then take onpolicy
    # samples and aggregate them into the dataset.
    # Note: you don't need to use a mixing ratio in this assignment for new
    # and old data, as described in https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):
        """ YOUR CODE HERE """
        # Refit dynamic model
        dyn_model.fit(data)
        # Sample on-policy trajectories
        paths = sample(env, mpc_controller, num_paths_onpol, env_horizon,
                       render, True)
        # Summarize trajectories
        costs = [path_cost(cost_fn, path) for path in paths]
        returns = [np.sum(path['rewards']) for path in paths]
        # Aggregate data
        onpol_observations = np.concatenate(
            [path['observations'] for path in paths])
        onpol_actions = np.concatenate([path['actions'] for path in paths])
        onpol_next_observations = np.concatenate(
            [path['next_observations'] for path in paths])
        onpol_deltas = onpol_next_observations - onpol_observations
        data['observations'] = np.append(data['observations'],
                                         onpol_observations, 0)
        data['actions'] = np.append(data['actions'], onpol_actions, 0)
        data['deltas'] = np.append(data['deltas'], onpol_deltas, 0)

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
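
The dynamics model in these examples is trained on state deltas (o_{t+1} - o_t) with normalized inputs and outputs, which is why the data dict above stores 'deltas' rather than next observations. A minimal numpy sketch of the corresponding prediction step, using illustrative names (net_forward stands in for the trained network; this is not the original NNDynamicsModel code):

import numpy as np

def predict_next_obs(obs, action, net_forward,
                     mean_obs, std_obs, mean_act, std_act,
                     mean_delta, std_delta, eps=1e-9):
    # Normalize state and action, query the network for a normalized delta,
    # then denormalize the delta and add it to the current observation.
    obs_n = (obs - mean_obs) / (std_obs + eps)
    act_n = (action - mean_act) / (std_act + eps)
    delta_n = net_forward(np.concatenate([obs_n, act_n], axis=-1))
    return obs + mean_delta + std_delta * delta_n
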
Exemplo n.º 14
0
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=1,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=1,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=100,
          activation=tf.nn.relu,
          output_activation=None):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run. 

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated 
    |                           by a random agent. We use these to train our 
    |_                          initial dynamics model.
    
    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining 
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments. 

    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)

    paths, rewards, costs = sample(env, random_controller, num_paths_random)
    obs = np.concatenate([path["observations"] for path in paths])
    acs = np.concatenate([path["actions"] for path in paths])
    n_obs = np.concatenate([path["next_observations"] for path in paths])
    delta = n_obs - obs
    data = {'observations': obs, 'actions': acs, 'delta': delta}

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    mean_obs, std_obs, mean_deltas, std_deltas, mean_actions, std_actions = compute_normalization(
        data)
    normalization = dict()
    normalization['observations'] = [mean_obs, std_obs]
    normalization['actions'] = [mean_actions, std_actions]
    normalization['delta'] = [mean_deltas, std_deltas]
    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation: at each iteration,
    # refit the dynamics model to the current dataset, then take onpolicy
    # samples and aggregate them into the dataset.
    # Note: you don't need to use a mixing ratio in this assignment for new
    # and old data, as described in https://arxiv.org/abs/1708.02596
    #
    print("onpol_iter", onpol_iters)
    for itr in range(onpol_iters):
        """ YOUR CODE HERE """
        print(data['observations'].shape)
        #print(data['observations'].shape)
        dyn_model.fit(data)

        # Generate trajectories from MPC controllers

        pathsM, returns, costs = sample(env, mpc_controller, num_paths_onpol)
        obs = np.concatenate([path["observations"] for path in pathsM])
        acs = np.concatenate([path["actions"] for path in pathsM])
        n_obs = np.concatenate([path["next_observations"] for path in pathsM])
        delta = n_obs - obs
        data = {
            'observations': np.concatenate((data['observations'], obs)),
            'actions': np.concatenate((data['actions'], acs)),
            'delta': np.concatenate((data['delta'], delta))
        }

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
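
MPCcontroller itself is not reproduced in these listings. Below is a sketch of the random-shooting scheme it is expected to implement: sample many random action sequences, roll each one through the learned dynamics model, and return the first action of the cheapest imagined rollout. It assumes a batched dyn_model.predict and a batched cost_fn, and uses illustrative names rather than the original class.

import numpy as np

def mpc_get_action(state, env, dyn_model, cost_fn, horizon, num_simulated_paths):
    # Sample num_simulated_paths random action sequences of length horizon.
    act_dim = env.action_space.shape[0]
    low, high = env.action_space.low, env.action_space.high
    actions = np.random.uniform(low, high,
                                size=(horizon, num_simulated_paths, act_dim))

    # Roll every sequence forward in parallel through the learned model.
    states = [np.tile(state, (num_simulated_paths, 1))]
    for t in range(horizon):
        states.append(dyn_model.predict(states[-1], actions[t]))

    # Score each imagined rollout and keep the first action of the best one.
    costs = np.zeros(num_simulated_paths)
    for t in range(horizon):
        costs += cost_fn(states[t], actions[t], states[t + 1])
    return actions[0, np.argmin(costs)]
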
Exemplo n.º 15
0
def train(
    env,
    cost_fn,
    logdir=None,
    render=False,
    learning_rate=1e-3,
    onpol_iters=10,
    dynamics_iters=60,
    batch_size=512,
    num_paths_random=10,
    num_paths_onpol=10,
    num_simulated_paths=1000,
    env_horizon=1000,
    mpc_horizon=15,
    n_layers=2,
    size=500,
    activation=tf.nn.relu,
    output_activation=None,
):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run. 

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated 
    |                           by a random agent. We use these to train our 
    |_                          initial dynamics model.
    
    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining 
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments. 

    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)
    """ YOUR CODE HERE """
    paths_rand = sample(env,
                        random_controller,
                        num_paths=num_paths_random,
                        horizon=env_horizon,
                        render=render,
                        verbose=False)
    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    """ YOUR CODE HERE """
    normalization = compute_normalization(paths_rand)

    gamma = 0.99

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation: at each iteration,
    # refit the dynamics model to the current dataset, then take onpolicy
    # samples and aggregate them into the dataset.
    # Note: you don't need to use a mixing ratio in this assignment for new
    # and old data, as described in https://arxiv.org/abs/1708.02596
    #
    # prefit dynamic before on policy dagger:
    print("****** Pretrain dynamic Model *******")
    losses = []
    obs_rand = np.concatenate([path["observation"] for path in paths_rand])
    action_rand = np.concatenate([path["action"] for path in paths_rand])
    next_ob_rand = np.concatenate([path["obs_next"] for path in paths_rand])
    data_size_rand = obs_rand.shape[0]
    for i in range(1000):
        # obtain batch size from random policy
        batch_idx_rand = np.random.randint(data_size_rand, size=batch_size)
        batch_ob_rand = obs_rand[batch_idx_rand, :]
        batch_ac_rand = action_rand[batch_idx_rand, :]
        batch_nxt_rand = next_ob_rand[batch_idx_rand, :]
        # obtain batch size from on policy
        batch_ob = np.copy(batch_ob_rand)
        batch_ac = np.copy(batch_ac_rand)
        batch_nxt = np.copy(batch_nxt_rand)
        loss = dyn_model.fit(batch_ob, batch_ac, batch_nxt)
        losses.append(loss)
        if (i % 20 == 0):
            print('loss', loss)

    costs = []
    returns = []
    paths_rl = []
    for itr in range(onpol_iters):
        """ YOUR CODE HERE """
        # fit dynamic model
        if itr > 0:
            obs_rl = np.concatenate([path["observation"] for path in paths_rl])
            action_rl = np.concatenate([path["action"] for path in paths_rl])
            next_ob_rl = np.concatenate(
                [path["obs_next"] for path in paths_rl])
        obs_rand = np.concatenate([path["observation"] for path in paths_rand])
        action_rand = np.concatenate([path["action"] for path in paths_rand])
        next_ob_rand = np.concatenate(
            [path["obs_next"] for path in paths_rand])
        # print obs[128,:].shape
        data_size_rand = obs_rand.shape[0]
        if itr > 0:
            data_size_rl = obs_rl.shape[0]
        # batch_size=128
        losses = []
        # fit model function
        for i in range(dynamics_iters):
            # obtain batch size from random policy
            # integer division keeps the batch size an int under Python 3
            batch_idx_rand = np.random.randint(data_size_rand,
                                               size=batch_size // 20)
            batch_ob_rand = obs_rand[batch_idx_rand, :]
            batch_ac_rand = action_rand[batch_idx_rand, :]
            batch_nxt_rand = next_ob_rand[batch_idx_rand, :]
            # obtain batch size from on policy
            if itr > 0:
                batch_idx_rl = np.random.randint(data_size_rl,
                                                 size=batch_size * 19 // 20)
                batch_ob_rl = obs_rl[batch_idx_rl, :]
                batch_ac_rl = action_rl[batch_idx_rl, :]
                batch_nxt_rl = next_ob_rl[batch_idx_rl, :]
                # mix them
                batch_ob = np.concatenate((batch_ob_rand, batch_ob_rl))
                batch_ac = np.concatenate((batch_ac_rand, batch_ac_rl))
                batch_nxt = np.concatenate((batch_nxt_rand, batch_nxt_rl))
            else:
                batch_ob = np.copy(batch_ob_rand)
                batch_ac = np.copy(batch_ac_rand)
                batch_nxt = np.copy(batch_nxt_rand)
            loss = dyn_model.fit(batch_ob, batch_ac, batch_nxt)
            losses.append(loss)
            # if(i%20==0):
            #     print('loss', loss)
        print("on policy dagger ", itr)
        ob = env.reset()
        # collect per-step discounted returns in a separate list so the outer
        # `returns` accumulator used for logging is not overwritten
        observes, acs, rewards, obs_2, disc_returns = [], [], [], [], []
        steps = 0
        g = 0
        max_path_length = mpc_controller.horizon
        timesteps_this_batch = 0
        while True:
            while True:
                observes.append(ob)
                ac = mpc_controller.get_action(ob)
                # print ac
                acs.append(ac)
                # print ac
                ob, rew, done, _ = env.step(ac)
                g += rew * gamma**steps
                obs_2.append(ob)
                rewards.append(rew)
                disc_returns.append(g)
                steps += 1
                if done or steps > max_path_length:
                    terminated = done
                    break
            path = {
                "observation": np.array(observes),
                "reward": np.array(rewards),
                "action": np.array(acs),
                "obs_next": np.array(obs_2),
                "return": np.array(returns)
            }
            paths_rl.append(path)
            timesteps_this_batch += pathlength(path)
            print(g)
            if timesteps_this_batch > batch_size:
                break
        # use the cost_fn passed into train() so the logged costs match the
        # cost function the MPC controller plans with
        trajectory_cost = trajectory_cost_fn(cost_fn, path["observation"],
                                             path["action"], path["obs_next"])
        costs.append(trajectory_cost)
        returns.append(path["return"][-1])

        # print batch_ob.shape

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
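
Unlike the other examples, this one does apply the data-mixing idea from https://arxiv.org/abs/1708.02596, drawing roughly 1/20 of each training batch from the random dataset and 19/20 from on-policy data. A small standalone sketch of that sampler (hypothetical helper, not from the original code):

import numpy as np

def sample_mixed_batch(rand_data, onpol_data, batch_size, rand_fraction=0.05):
    # rand_data / onpol_data: tuples of (obs, actions, next_obs) arrays.
    n_rand = max(1, int(batch_size * rand_fraction))
    n_onpol = batch_size - n_rand
    idx_rand = np.random.randint(rand_data[0].shape[0], size=n_rand)
    idx_onpol = np.random.randint(onpol_data[0].shape[0], size=n_onpol)
    # Concatenate the random-policy and on-policy slices array by array.
    return tuple(np.concatenate([r[idx_rand], o[idx_onpol]])
                 for r, o in zip(rand_data, onpol_data))
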
Exemplo n.º 16
0
def train(env,
         cost_fn,
         load_model,
         model_path,
         logdir=None,
         render=False,
         learning_rate_dyn=1e-3,
         learning_rate_policy=1e-4,
         onpol_iters=10,
         dynamics_iters=60,
         policy_iters=100,
         batch_size=512,
         num_paths_random=10,
         num_paths_onpol=5,
         num_simulated_paths=10000,
         env_horizon=1000,
         mpc_horizon=15,
         n_layers=2,
         size=500,
         activation=tf.nn.relu,
         output_activation=None,
         ):

    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run.

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated
    |                           by a random agent. We use these to train our
    |_                          initial dynamics model.

    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments.

    """

    #logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)

    """ YOUR CODE HERE """
    data = sample(env, random_controller, num_paths_random, env_horizon)


    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    """ YOUR CODE HERE """
    normalization = compute_normalization(data)


    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate_dyn,
                                sess=sess)

    policy = NNPolicy(env=env,
                    normalization=normalization,
                    batch_size=batch_size,
                    iterations=policy_iters,
                    learning_rate=learning_rate_policy,
                    sess=sess,
                    model_path=model_path,
                    save_path="./policy/",
                    load_model=load_model)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    lqr_controller = LQRcontroller(env=env,
                                   delta=0.005,
                                   T=50,
                                   dyn_model=dyn_model,
                                   cost_fn=cost_fn,
                                   iterations=1)

    comm = MPI.COMM_WORLD
    # number of MPI workers; renamed so it does not shadow the network `size` argument
    comm_size = comm.Get_size()
    rank = comm.Get_rank()

    #========================================================
    #
    # Tensorflow session building.
    #

    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation: at each iteration,
    # refit the dynamics model to the current dataset, then take onpolicy
    # samples and aggregate them into the dataset.
    # Note: you don't need to use a mixing ratio in this assignment for new
    # and old data, as described in https://arxiv.org/abs/1708.02596
    #

    # training the MPC controller as well as dynamics
    for itr in range(onpol_iters):

        print("fitting dynamics for worker ", rank)
        dyn_model.fit(data)
        print("sampling new trajectories from worker ", rank)
        new_data = sample(env, lqr_controller, num_paths_onpol, env_horizon)

        data += new_data
        comm.send(new_data, 0)

        if rank == 0:
            costs, returns = [], []

            for path in data:

                costs.append(path_cost(cost_fn, path))
                returns.append(np.sum(path['rewards']))

            print("returns ",returns)

            for i in range(1, comm_size):
                data += comm.recv(source=i)

            print("fitting policy...")
            policy.fit(data)

            # LOGGING
            # Statistics for performance of MPC policy using
            # our learned dynamics model
            logz.log_tabular('Iteration', itr)
            # In terms of cost function which your MPC controller uses to plan
            logz.log_tabular('AverageCost', np.mean(costs))
            logz.log_tabular('StdCost', np.std(costs))
            logz.log_tabular('MinimumCost', np.min(costs))
            logz.log_tabular('MaximumCost', np.max(costs))
            # In terms of true environment reward of your rolled out trajectory using the MPC controller
            logz.log_tabular('AverageReturn', np.mean(returns))
            logz.log_tabular('StdReturn', np.std(returns))
            logz.log_tabular('MinimumReturn', np.min(returns))
            logz.log_tabular('MaximumReturn', np.max(returns))

            logz.dump_tabular()

    # applying the learned neural policy
    if rank == 0:
        ob = env.reset()

        while True:
            a = policy.get_action(ob.reshape((1, ob.shape[0])))

            # control clipping to be added

            next_ob, reward, done, info = env.step(a[0])
            print("action", a)
            print("predicted ob", dyn_model.predict(ob, a))
            print("actual ob", (next_ob - normalization[0]) / (normalization[1] + 1e-10))
            env.render()
            ob = next_ob
            if done:
                ob = env.reset()
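
The rank-0 aggregation above uses blocking point-to-point comm.send / comm.recv calls. An equivalent pattern with mpi4py's collective gather is sketched below; this is an alternative illustration, not the original code, and the placeholder list stands in for the locally sampled paths.

from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

# Each rank contributes whatever it sampled locally; gather returns the list
# of all per-rank contributions on the root and None on every other rank.
local_paths = ["path-from-rank-%d" % rank]  # placeholder for sampled paths
gathered = comm.gather(local_paths, root=0)

if rank == 0:
    all_new_paths = [p for worker_paths in gathered for p in worker_paths]
    print("aggregated %d paths on rank 0" % len(all_new_paths))
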
Exemplo n.º 17
0
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation
    |_                          for the loop to run.

    dynamics_iters              Number of iterations of training for the
    |                           dynamics model which happen per iteration of
    |_                          the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated
    |                           by a random agent. We use these to train our
    |_                          initial dynamics model.

    num_paths_onpol             Number of paths to collect at each iteration of
    |                           aggregation, using the Model Predictive Control
    |_                          policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining
    |                           fictitious rollouts, and picking the first
    |                           action of the best fictitious rollout. This
    |                           argument is how many timesteps should be in
    |_                          each fictitious rollout.

    n_layers/size/activations   Neural network architecture arguments.

    """

    d("env                         = {}".format(env))
    d("env.observation_space       = {}".format(env.observation_space))
    d("env.action_space            = {}".format(env.action_space))
    d("env.observation_space.shape = {}".format(env.observation_space.shape))
    d("env.action_space.shape      = {}".format(env.action_space.shape))
    d("logdir                      = {}".format(logdir))
    d("render                      = {}".format(render))
    d("learning_rate               = {}".format(learning_rate))
    d("onpol_iters                 = {}".format(onpol_iters))
    d("dynamics_iters              = {}".format(dynamics_iters))
    d("batch_size                  = {}".format(batch_size))
    d("num_paths_random            = {}".format(num_paths_random))
    d("num_paths_onpol             = {}".format(num_paths_onpol))
    d("num_simulated_paths         = {}".format(num_simulated_paths))
    d("env_horizon                 = {}".format(env_horizon))
    d("mpc_horizon                 = {}".format(mpc_horizon))
    d("n_layers                    = {}".format(n_layers))
    d("size                        = {}".format(size))

    logz.configure_output_dir(logdir)

    #===========================================================================
    # First, we need a lot of data generated by a random agent, with which
    # we'll begin to train our dynamics model.
    d("Generating random rollouts.")
    random_controller = RandomController(env)
    random_paths = sample(env=env,
                          controller=random_controller,
                          num_paths=num_paths_random,
                          horizon=env_horizon,
                          render=render)
    d("Done generating random rollouts.")

    #===========================================================================
    # The random data will be used to get statistics (mean and std) for the
    # observations, actions, and deltas (where deltas are o_{t+1} - o_t). These
    # will be used for normalizing inputs and denormalizing outputs from the
    # dynamics network.
    d("Normalizing random rollouts.")
    data = paths_to_data(random_paths)
    normalization = compute_normalization(data)
    d("Done normalizing random rollouts.")

    mean_obs, std_obs, mean_deltas, std_deltas, mean_action, std_action = normalization
    d("mean_obs    = {}".format(mean_obs))
    d("std_obs     = {}".format(std_obs))
    d("mean_deltas = {}".format(mean_deltas))
    d("std_deltas  = {}".format(std_deltas))
    d("mean_action = {}".format(mean_action))
    d("std_action  = {}".format(std_action))

    #===========================================================================
    # Build dynamics model and MPC controllers.
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #===========================================================================
    # Tensorflow session building.
    sess.__enter__()
    tf.global_variables_initializer().run()

    #===========================================================================
    # Take multiple iterations of onpolicy aggregation at each iteration
    # refitting the dynamics model to current dataset and then taking onpolicy
    # samples and aggregating to the dataset.
    #
    # Note: You don't need to use a mixing ratio in this assignment for new and
    # old data as described in https://arxiv.org/abs/1708.02596
    start_time = time.time()
    for itr in range(onpol_iters):
        d("Iteration {}".format(itr))

        # Shuffle data.
        d("Shuffling data.")
        shuffle_indexes = np.random.permutation(data["observations"].shape[0])
        data["observations"] = data["observations"][shuffle_indexes]
        data["actions"] = data["actions"][shuffle_indexes]
        data["next_observations"] = data["next_observations"][shuffle_indexes]
        data["rewards"] = data["rewards"][shuffle_indexes]
        d("Done shuffling data.")

        # Fit the dynamics.
        d("Fitting dynamics.")
        dyn_model.fit(data)
        d("Done fitting dynamics.")

        # Generate on-policy rollouts.
        d("Generating on-policy rollouts.")
        rl_paths = sample(env=env,
                          controller=mpc_controller,
                          num_paths=num_paths_onpol,
                          horizon=env_horizon,
                          render=render)
        d("Done generating on-policy rollouts.")

        # Compute metrics.
        costs = np.array([path_cost(cost_fn, path) for path in rl_paths])
        returns = np.array([sum(path["rewards"]) for path in rl_paths])

        # Update data.
        new_data = paths_to_data(rl_paths)
        data = {
            "observations":
            np.concatenate([data["observations"], new_data["observations"]]),
            "actions":
            np.concatenate([data["actions"], new_data["actions"]]),
            "next_observations":
            np.concatenate(
                [data["next_observations"], new_data["next_observations"]]),
            "rewards":
            np.concatenate([data["rewards"], new_data["rewards"]]),
        }
        # TODO(mwhittaker): Shuffle if we need to.

        # LOGGING
        # Statistics for performance of MPC policy using our learned dynamics
        # model
        logz.log_tabular('Iteration', itr)
        logz.log_tabular('Time', time.time() - start_time)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory
        # using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
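
paths_to_data is defined outside this listing; below is a minimal sketch consistent with the keys the loop above indexes (observations, actions, next_observations, rewards), offered as an assumption rather than the original helper.

import numpy as np

def paths_to_data(paths):
    # Flatten a list of path dicts into one dict of stacked arrays.
    return {
        "observations": np.concatenate([p["observations"] for p in paths]),
        "actions": np.concatenate([p["actions"] for p in paths]),
        "next_observations": np.concatenate([p["next_observations"]
                                             for p in paths]),
        "rewards": np.concatenate([p["rewards"] for p in paths]),
    }
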
Exemplo n.º 18
0
def train(env,
         cost_fn,
         logdir=None,
         render=False,
         learning_rate=1e-3,
         onpol_iters=10,
         dynamics_iters=20,
         batch_size=64,
         num_paths_random=10,
         num_paths_onpol=10,
         num_simulated_paths=10000,
         env_horizon=500,
         mpc_horizon=15,
         n_layers=2,
         size=64,
         activation=tf.nn.relu,
         output_activation=None,
         controller_service=None,
         ):

    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run.

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated
    |                           by a random agent. We use these to train our
    |_                          initial dynamics model.

    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments.

    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)
    ref_controller = RefMPCController(env, lambda state: call_mpc(env, controller_service))

    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False,
                   )


    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    normalization = compute_normalization(paths)
    print(normalization)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)


    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of on-policy aggregation: at each iteration,
    # refit the dynamics model to the current dataset, then take on-policy
    # samples and aggregate them into the dataset.
    #
    # Note: You don't need to use a mixing ratio in this assignment for new
    # and old data as described in https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):

        dyn_model.fit(paths)
        new_paths = sample(env,
                           mpc_controller,
                           num_paths=num_paths_onpol,
                           horizon=env_horizon,
                           render=False,
                           verbose=False)
        costs = []
        returns = []
        for new_path in new_paths:
            cost = path_cost(cost_fn, new_path)
            costs.append(cost)
            returns.append(new_path['return'])
        costs = np.array(costs)
        returns = np.array(returns)
        paths = paths + new_paths # Aggregation
        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
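
The loop above evaluates each rollout with `path_cost(cost_fn, new_path)`. A minimal sketch of such a helper is shown below, assuming the path dictionaries use the keys seen in these examples and that `cost_fn` scores a single (state, action, next_state) transition; the real helper in each project may differ.

import numpy as np

def path_cost(cost_fn, path):
    # Total cost of one rollout: sum the per-transition cost over the path
    # (sketch; assumes aligned 'observations', 'actions', 'next_observations').
    return np.sum([
        cost_fn(s, a, s_next)
        for s, a, s_next in zip(path["observations"],
                                path["actions"],
                                path["next_observations"])
    ])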
Example No. 19
0
def train(state_cb,
          pub_cmd,
          pub_act,
          rate,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run.

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated
    |                           by a random agent. We use these to train our
    |_                          initial dynamics model.

    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments.

    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    rand_controller = RandomController()
    paths = sample(state_cb, pub_cmd, pub_act, rate, rand_controller,
                   num_paths_random, env_horizon, render)
    data = paths_to_array(paths)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    normalization = compute_normalization(data)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of on-policy aggregation: at each iteration,
    # refit the dynamics model to the current dataset, then take on-policy
    # samples and aggregate them into the dataset.
    #
    # Note: You don't need to use a mixing ratio in this assignment for new
    # and old data as described in https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):
        # Fit dynamics model
        print('Training dynamics model...')
        dyn_model.fit(data)
        plot_comparison(dyn_model, state_cb, pub_act, pub_cmd, rate)
        mpc_controller.dyn_model = dyn_model
        costs = []
        returns = []
        # Do MPC
        for i in range(num_paths_onpol):
            print('On policy path: %i' % i)
            obs_t, obs_tp1, acs_t, rews_t = [], [], [], []
            s_t = state_cb.reset(pub_act, pub_cmd)
            total_return = 0

            for j in range(env_horizon):
                # print('Timestep: %i, Return: %g' % (j,total_return))
                a_t = mpc_controller.get_action(s_t)
                s_tp1, _ = state_cb.step(a_t, pub_act, pub_cmd)
                # Per-step reward: summed change in every 12th state entry
                # across the 9 tracked segments.
                r_t = 0
                for seg in range(9):
                    r_t += s_tp1[seg * 12] - s_t[seg * 12]
                total_return += r_t

                if render:
                    env.render()
                    time.sleep(0.05)

                obs_t.append(s_t)
                obs_tp1.append(s_tp1)
                acs_t.append(a_t)
                rews_t.append(r_t)

                s_t = s_tp1

            path = {
                "observations": np.array(obs_t),
                "next_observations": np.array(obs_tp1),
                "actions": np.array(acs_t),
                "rewards": np.array(rews_t)
            }
            total_cost = path_cost(cost_fn, path)

            paths.append(path)
            returns.append(total_return)
            costs.append(total_cost)
            print('Total cost: %g, Total reward: %g' %
                  (total_cost, total_return))

        data = paths_to_array(paths)
        normalization = compute_normalization(data)
        # Set new normalization statistics for dynamics model
        dyn_model.normalization = normalization

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
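
This version aggregates rollouts with `paths_to_array(paths)` before recomputing the normalization statistics. A minimal sketch of what such a helper might do is given below, under the assumption that it simply concatenates the per-path arrays into one flat dataset; the project's actual implementation may differ.

import numpy as np

def paths_to_array(paths):
    # Flatten a list of path dicts into one dataset (sketch, assumed layout).
    return {
        "observations": np.concatenate([p["observations"] for p in paths]),
        "actions": np.concatenate([p["actions"] for p in paths]),
        "next_observations": np.concatenate(
            [p["next_observations"] for p in paths]),
        "rewards": np.concatenate([p["rewards"] for p in paths]),
    }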
Example No. 20
0
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=10,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run. 

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated 
    |                           by a random agent. We use these to train our 
    |_                          initial dynamics model.
    
    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining 
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments. 

    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)
    """ YOUR CODE HERE """
    paths = sample(env, random_controller, num_paths=50)
    first = True
    for path in paths:
        if first:
            data = {
                "observations": path['observations'],
                "next_observations": path['next_observations'],
                "rewards": path['rewards'],
                "actions": path['actions'],
                "returns": path['returns']
            }
            first = False
        else:
            data['observations'] = np.vstack(
                (data['observations'], path['observations']))
            data['next_observations'] = np.vstack(
                (data['next_observations'], path['next_observations']))
            data['rewards'] = np.vstack((data['rewards'], path['rewards']))
            data['actions'] = np.vstack((data['actions'], path['actions']))
            data['returns'] = np.vstack((data['returns'], path['returns']))

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    """ YOUR CODE HERE """
    normalization = compute_normalization(data)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of on-policy aggregation: at each iteration,
    # refit the dynamics model to the current dataset, then take on-policy
    # samples and aggregate them into the dataset.
    #
    # Note: You don't need to use a mixing ratio in this assignment for new
    # and old data as described in https://arxiv.org/abs/1708.02596
    #

    # Open a CSV file for logging the average return at each iteration.
    csv_file = open('results.csv', 'w')
    writer = csv.writer(csv_file, delimiter=',')

    for itr in range(onpol_iters):
        print(itr)
        costs = []
        returns = []
        """ YOUR CODE HERE """
        dyn_model.fit(data)
        #plot_comparison(env,dyn_model)
        mpc_controller.dyn_model = dyn_model  # The controller already references this in-place fitted model; re-assigning is a harmless no-op.
        new_paths = sample(env, mpc_controller)
        for path in new_paths:
            cost = path_cost(cost_fn, path)
            costs.append(cost)
            returns.append(path['returns'][-1])

            data['observations'] = np.vstack(
                (data['observations'], path['observations']))
            data['next_observations'] = np.vstack(
                (data['next_observations'], path['next_observations']))
            data['actions'] = np.vstack((data['actions'], path['actions']))

        dyn_model.normalization = compute_normalization(data)

        writer.writerow([itr, np.mean(returns)])

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
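
Each example normalizes the dynamics network's inputs with statistics over observations, actions, and deltas o_{t+1} - o_t. As a rough sketch, assuming a flat data dictionary like the one aggregated above, `compute_normalization` could look as follows; the signature and return format differ between the projects shown here (some pass a list of paths or a buffer instead).

import numpy as np

def compute_normalization(data, eps=1e-8):
    # Mean/std of observations, actions and state deltas (sketch).
    deltas = data["next_observations"] - data["observations"]
    return {
        "mean_obs": np.mean(data["observations"], axis=0),
        "std_obs": np.std(data["observations"], axis=0) + eps,
        "mean_acs": np.mean(data["actions"], axis=0),
        "std_acs": np.std(data["actions"], axis=0) + eps,
        "mean_deltas": np.mean(deltas, axis=0),
        "std_deltas": np.std(deltas, axis=0) + eps,
    }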
Example No. 21
0
File: main.py Project: Snowstu/MBMF
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation='relu',
          output_activation=None):
    """

	Arguments:

	onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run.

	dynamics_iters              Number of iterations of training for the dynamics model
	|_                          which happen per iteration of the aggregation loop.

	batch_size                  Batch size for dynamics training.

	num_paths_random            Number of paths/trajectories/rollouts generated
	|                           by a random agent. We use these to train our
	|_                          initial dynamics model.

	num_paths_onpol             Number of paths to collect at each iteration of
	|_                          aggregation, using the Model Predictive Control policy.

	num_simulated_paths         How many fictitious rollouts the MPC policy
	|                           should generate each time it is asked for an
	|_                          action.

	env_horizon                 Number of timesteps in each path.

	mpc_horizon                 The MPC policy generates actions by imagining
	|                           fictitious rollouts, and picking the first action
	|                           of the best fictitious rollout. This argument is
	|                           how many timesteps should be in each fictitious
	|_                          rollout.

	n_layers/size/activations   Neural network architecture arguments.

	"""
    logz.configure_output_dir(logdir)
    # ========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.
    """ YOUR CODE HERE """
    random_controller = RandomController(env)

    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   ignore_done=True)  # 10

    # ========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    """ YOUR CODE HERE """
    # concatenate observations & actions to numpy data_rand_x
    # concatenate (next_observations -observations) to numpy data_rand_y
    for i in range(num_paths_random):
        if i == 0:
            data_rand_x = np.concatenate(
                (paths[i]['observations'], paths[i]['actions']), axis=1)
            data_rand_y = paths[i]['next_observations'] - paths[i][
                'observations']
        else:
            x = np.concatenate((paths[i]['observations'], paths[i]['actions']),
                               axis=1)
            data_rand_x = np.concatenate((data_rand_x, x), axis=0)
            y = paths[i]['next_observations'] - paths[i]['observations']
            data_rand_y = np.concatenate((data_rand_y, y), axis=0)

    # Initialize data set D to Drand
    data_x = data_rand_x
    data_y = data_rand_y

    # ========================================================
    #
    # Build dynamics model and MPC controllers.
    #

    # sess = tf.Session()

    # dyn_model = NNDynamicsModel(env=env,
    # 							n_layers=n_layers,
    # 							size=size,
    # 							activation=activation,
    # 							output_activation=output_activation,
    # 							batch_size=batch_size,
    # 							iterations=dynamics_iters,
    # 							learning_rate=learning_rate,
    # 							normalization=normalization
    # 							)
    dyn_model = NNDynamicsModel(
        env=env,
        hidden_size=(500, 500),
        activation=activation,  #'tanh'
    ).cuda()

    mpc_controller = MPCcontroller(
        env=env,
        dyn_model=dyn_model,
        horizon=mpc_horizon,
        cost_fn=cost_fn,
        num_simulated_paths=num_simulated_paths,
    )

    # ========================================================
    #
    # Tensorflow session building.
    #
    # sess.__enter__()
    # tf.global_variables_initializer().run()

    # ========================================================
    #
    # Take multiple iterations of on-policy aggregation: at each iteration,
    # refit the dynamics model to the current dataset, then take on-policy
    # samples and aggregate them into the dataset.
    #
    # Note: You don't need to use a mixing ratio in this assignment for new
    # and old data as described in https://arxiv.org/abs/1708.02596
    #

    # make dirs output
    if not (os.path.exists(logdir)):
        os.makedirs(logdir)
    path = os.path.join(logdir, 'model')
    if not (os.path.exists(path)):
        os.makedirs(path)

    for itr in range(onpol_iters):
        """ YOUR CODE HERE """

        if itr != 0:
            dyn_model.load_state_dict(torch.load(path + '/net_params.pkl'))

        # store data
        # if (itr % 9) == 0 or itr == (onpol_iters-1):
        if itr >= 0:
            logger = Logger(logdir, csvname='log_orig' + str(itr))
            data = np.concatenate((data_x, data_y), axis=1)
            logger.log_table2csv(data)
        if itr == 0:
            data_x += np.random.normal(0, 0.001, size=data_x.shape)
            data_y += np.random.normal(0, 0.001, size=data_y.shape)
        else:
            data_x = best_x + np.random.normal(0, 0.001, size=best_x.shape)
            data_y = best_y + np.random.normal(0, 0.001, size=best_y.shape)

        dyn_model.fit(data_x,
                      data_y,
                      epoch_size=dynamics_iters,
                      batch_size=batch_size,
                      test=True)

        torch.save(dyn_model.state_dict(),
                   path + '/net_params.pkl')  # save only the parameters
        torch.save(dyn_model,
                   path + '/net' + str(itr) + '.pkl')  # save entire net

        print('-------------Itr %d-------------' % itr)
        print('Start time:\n')
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

        start = time.time()  # Start timing the on-policy sampling phase.
        # sample
        if Monitor is True:
            monitor_path = os.path.join(logdir, 'monitor' + str(itr))
            env = wrappers.Monitor(env, monitor_path, force=True)

        paths = sample(env,
                       mpc_controller,
                       num_paths=num_paths_onpol,
                       horizon=env_horizon,
                       render=False,
                       ignore_done=False,
                       MPC=True)

        end = time.time()
        runtime2 = end - start
        print('runtime = ', runtime2)

        print('End time:\n')
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

        # concatenate observations & actions to numpy data_rand_x
        # concatenate (next_observations -observations) to numpy data_rand_y
        for i in range(num_paths_onpol):
            if i == 0:
                data_rl_x = np.concatenate(
                    (paths[i]['observations'], paths[i]['actions']), axis=1)
                data_rl_y = paths[i]['next_observations'] - paths[i][
                    'observations']
            else:
                x = np.concatenate(
                    (paths[i]['observations'], paths[i]['actions']), axis=1)
                data_rl_x = np.concatenate((data_rl_x, x), axis=0)
                y = paths[i]['next_observations'] - paths[i]['observations']
                data_rl_y = np.concatenate((data_rl_y, y), axis=0)

        # Aggregate data
        data_x = np.concatenate((data_x, data_rl_x), axis=0)
        data_y = np.concatenate((data_y, data_rl_y), axis=0)

        costs = np.zeros((num_paths_onpol, 1))
        returns = np.zeros((num_paths_onpol, 1))
        for i in range(num_paths_onpol):
            costs[i] = paths[i]['cost']
            returns[i] = paths[i]['returns'][0]

        if itr == 0:
            best_x = data_rl_x
            best_y = data_rl_y
        else:
            best_x = np.concatenate((best_x, data_rl_x), axis=0)
            best_y = np.concatenate((best_y, data_rl_y), axis=0)
        # store data
        #if (itr % 9) == 0 or itr == (onpol_iters-1):
        if itr >= 0:
            logger = Logger(logdir, csvname='best' + str(itr))
            data = np.concatenate((best_x, best_y), axis=1)
            logger.log_table2csv(data)

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
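
Example No. 21 swaps the TensorFlow dynamics model for a PyTorch one trained on inputs concat(observation, action) and targets next_observation - observation. A minimal PyTorch sketch of such a delta-state network is shown below; it is an assumed stand-in for the project's `NNDynamicsModel`, not its actual implementation.

import torch
import torch.nn as nn

class TinyDynamicsNet(nn.Module):
    # Predicts the state delta from (state, action); sketch only.
    def __init__(self, obs_dim, act_dim, hidden_size=(500, 500)):
        super().__init__()
        layers, in_dim = [], obs_dim + act_dim
        for h in hidden_size:
            layers += [nn.Linear(in_dim, h), nn.ReLU()]
            in_dim = h
        layers.append(nn.Linear(in_dim, obs_dim))  # output: predicted delta
        self.net = nn.Sequential(*layers)

    def forward(self, obs, act):
        return self.net(torch.cat([obs, act], dim=-1))

    def predict_next(self, obs, act):
        # s_{t+1} is approximated as s_t + f(s_t, a_t).
        return obs + self.forward(obs, act)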
Example No. 22
0
def train_PG(
             exp_name='',
             env_name='',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=False, 
             animate=True, 
             logdir=None, 
             normalize_advantages=False,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32,

             # mb mpc arguments
             model_learning_rate=1e-3,
             onpol_iters=10,
             dynamics_iters=260,
             batch_size=512,
             num_paths_random=10, 
             num_paths_onpol=10, 
             num_simulated_paths=1000,
             env_horizon=1000, 
             mpc_horizon=10,
             m_n_layers=2,
             m_size=500,
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    # env = gym.make(env_name)
    env = HalfCheetahEnvNew()
    cost_fn = cheetah_cost_fn
    activation = tf.nn.relu
    output_activation = None

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    # max_path_length = max_path_length or env.spec.max_episode_steps
    max_path_length = max_path_length

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # Print environment information
    print("-------- env info --------")
    print("Environment name: ", env_name)
    print("Action space is discrete: ", discrete)
    print("Action space dim: ", ac_dim)
    print("Observation space dim: ", ob_dim)
    print("Max_path_length ", max_path_length)




    #========================================================================================#
    # Random data collection
    #========================================================================================#

    random_controller = RandomController(env)
    data_buffer_model = DataBuffer()
    data_buffer_ppo = DataBuffer_general(10000, 4)

    # sample path
    print("collecting random data .....  ")
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            data_buffer_model.add(path['observations'][n], path['actions'][n], path['next_observations'][n])

    print("data buffer size: ", data_buffer_model.size)

    normalization = compute_normalization(data_buffer_model)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#
    tf_config = tf.ConfigProto()
    tf_config.allow_soft_placement = True
    tf_config.intra_op_parallelism_threads = 4
    tf_config.inter_op_parallelism_threads = 1
    sess = tf.Session(config=tf_config)

    dyn_model = NNDynamicsModel(env=env, 
                                n_layers=n_layers, 
                                size=size, 
                                activation=activation, 
                                output_activation=output_activation, 
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env, 
                                   dyn_model=dyn_model, 
                                   horizon=mpc_horizon, 
                                   cost_fn=cost_fn, 
                                   num_simulated_paths=num_simulated_paths)


    policy_nn = policy_network_ppo(sess, ob_dim, ac_dim, discrete, n_layers, size, learning_rate)

    if nn_baseline:
        value_nn = value_network(sess, ob_dim, n_layers, size, learning_rate)

    sess.__enter__() # equivalent to `with sess:`

    tf.global_variables_initializer().run()


    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        if MPC:
            dyn_model.fit(data_buffer_model)
        returns = []
        costs = []

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []

        while True:
            # print("data buffer size: ", data_buffer_model.size)
            current_path = {'observations': [], 'actions': [], 'reward': [], 'next_observations':[]}

            ob = env.reset()
            obs, acs, mpc_acs, rewards = [], [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            return_ = 0
 
            while True:
                # print("steps ", steps)
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)

                if MPC:
                    mpc_ac = mpc_controller.get_action(ob)
                else:
                    mpc_ac = random_controller.get_action(ob)

                ac = policy_nn.predict(ob, mpc_ac)

                ac = ac[0]

                if not PG:
                    ac = mpc_ac

                acs.append(ac)
                mpc_acs.append(mpc_ac)

                current_path['observations'].append(ob)

                ob, rew, done, _ = env.step(ac)

                current_path['reward'].append(rew)
                current_path['actions'].append(ac)
                current_path['next_observations'].append(ob)

                return_ += rew
                rewards.append(rew)

                steps += 1
                if done or steps > max_path_length:
                    break


            if MPC:
                # cost & return
                cost = path_cost(cost_fn, current_path)
                costs.append(cost)
                returns.append(return_)
                print("total return: ", return_)
                print("costs: ", cost)

                # add into buffers
                for n in range(len(current_path['observations'])):
                    data_buffer_model.add(current_path['observations'][n], current_path['actions'][n], current_path['next_observations'][n])

            for n in range(len(current_path['observations'])):
                data_buffer_ppo.add(current_path['observations'][n], current_path['actions'][n], current_path['reward'][n], current_path['next_observations'][n])
        
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs),
                    "mpc_action" : np.array(mpc_acs)}



            paths.append(path)
            timesteps_this_batch += pathlength(path)
            # print("timesteps_this_batch", timesteps_this_batch)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch


        print("data_buffer_ppo.size:", data_buffer_ppo.size)


        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        mpc_ac_na = np.concatenate([path["mpc_action"] for path in paths])


        # Computing Q-values
     
        if reward_to_go:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        if t_ >= t:
                            q += gamma**(t_-t) * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)

        else:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        q += gamma**t_ * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)


        # Computing Baselines
        if nn_baseline:

            # b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no :ob_no})
            b_n = value_nn.predict(ob_no)
            b_n = normalize(b_n)
            b_n = denormalize(b_n, np.std(q_n), np.mean(q_n))
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        # Advantage Normalization
        if normalize_advantages:
            adv_n = normalize(adv_n)

        # Optimizing Neural Network Baseline
        if nn_baseline:
            b_n_target = normalize(q_n)
            value_nn.fit(ob_no, b_n_target)
                # sess.run(baseline_update_op, feed_dict={sy_ob_no :ob_no, sy_baseline_target_n:b_n_target})


        # Performing the Policy Update

        # policy_nn.fit(ob_no, ac_na, adv_n)
        policy_nn.fit(ob_no, ac_na, adv_n, mpc_ac_na)

        # sess.run(update_op, feed_dict={sy_ob_no :ob_no, sy_ac_na:ac_na, sy_adv_n:adv_n})

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
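
The reward-to-go branch above recomputes each discounted tail sum with a nested loop, which is quadratic in the path length. A backward recursion produces the same values in linear time; a minimal sketch:

import numpy as np

def reward_to_go(rewards, gamma):
    # q[t] = sum_{t' >= t} gamma**(t' - t) * rewards[t'], computed backwards.
    q = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        q[t] = running
    return q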
Example No. 23
0
File: main.py Project: amoliu/MPC
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate_dyn,
                                sess=sess)

    dyn_model = NNPolicy(env=env,
                        normalization=normalization,
                        batch_size=batch_size,
                        iterations=policy_iters,
                        learning_rate=learning_rate_policy,
                        sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)


    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of on-policy aggregation: at each iteration,
    # refit the dynamics model to the current dataset, then take on-policy
    # samples and aggregate them into the dataset.
    #
    # Note: You don't need to use a mixing ratio in this assignment for new
    # and old data as described in https://arxiv.org/abs/1708.02596
Example No. 24
0
def train(env,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          dagger_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_random_rollouts=10,
          num_onpol_rollouts=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          n_hid_units=500,
          activation=tf.nn.relu,
          output_activation=None):
    """

    Arguments:

    dagger_iters                 Number of iterations of onpolicy aggregation for the loop to run.

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_random_rollouts            Number of paths/trajectories/rollouts generated
    |                           by a random agent. We use these to train our
    |_                          initial dynamics model.

    num_onpol_rollouts          Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/n_hid_units/activations   Neural network architecture arguments.

    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)
    paths = sample(env,
                   random_controller,
                   num_rollouts=num_random_rollouts,
                   horizon=env_horizon)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.

    normalization_stats = compute_normalization_stats(paths)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                n_hid_units=n_hid_units,
                                activation=activation,
                                output_activation=output_activation,
                                normalization_stats=normalization_stats,
                                batch_size=batch_size,
                                num_iter=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation
    # at each iteration refitting the dynamics model to current dataset
    # and then taking on-policy samples and aggregating to the dataset.
    #
    # Note: You don't need to use a mixing ratio in this assignment
    # for new and old data as described in https://arxiv.org/abs/1708.02596
    #
    for i in range(dagger_iters):
        print('********** ITERATION {}/{} ************'.format(
            i + 1, dagger_iters))

        # Fitting dynamics model
        dyn_model.fit(paths)

        # Sampling on-policy
        new_paths = sample(env,
                           mpc_controller,
                           num_rollouts=num_onpol_rollouts,
                           horizon=env_horizon)
        # Keep all new paths plus a small random subset of the old ones
        # (roughly a 90/10 new/old mix), so stale data is gradually forgotten.
        paths = new_paths + random.sample(paths, len(new_paths) // 9)
        # paths += new_paths

        returns = [sum(path['rewards']) for path in new_paths]
        costs = [path_cost(path) for path in new_paths]

        # LOGGING
        # Statistics for performance of MPC policy using our learned dynamics model
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))

        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
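
Across these examples, the docstrings describe the MPC controller the same way: it imagines `num_simulated_paths` fictitious rollouts of length `mpc_horizon` under the learned dynamics and returns the first action of the cheapest one. A minimal random-shooting sketch is given below; the `dyn_model.predict` and batched `cost_fn` signatures are assumptions for illustration, not the interface of any particular project above.

import numpy as np

def random_shooting_action(state, dyn_model, cost_fn, action_space,
                           mpc_horizon=15, num_simulated_paths=1000):
    # Evaluate many random action sequences under the learned dynamics and
    # return the first action of the lowest-cost imagined rollout (sketch).
    states = np.tile(state, (num_simulated_paths, 1))
    total_cost = np.zeros(num_simulated_paths)
    first_actions = None
    for t in range(mpc_horizon):
        actions = np.stack([action_space.sample()
                            for _ in range(num_simulated_paths)])
        next_states = dyn_model.predict(states, actions)      # assumed API
        total_cost += cost_fn(states, actions, next_states)   # assumed batched
        if t == 0:
            first_actions = actions
        states = next_states
    return first_actions[np.argmin(total_cost)]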