Example #1
def learn(env,
          policy,
          value_fn,
          gamma,
          lam,
          timesteps_per_batch,
          num_timesteps,
          animate=False,
          callback=None,
          desired_kl=0.002):
    """
    Trains an ACKTR model.

    :param env: (Gym environment) The environment to learn from
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param value_fn: (Object) The value function model to use (MLP, CNN, LSTM, ...)
    :param gamma: (float) The discount factor
    :param lam: (float) the GAE(lambda) factor, trading off bias and variance in the advantage estimate
    :param timesteps_per_batch: (int) the number of timesteps for each batch
    :param num_timesteps: (int) the total number of timesteps to run
    :param animate: (bool) whether to render the environment
    :param callback: (function) called every iteration, used for logging and saving
    :param desired_kl: (float) the target Kullback-Leibler divergence used to adapt the step size
    """
    obfilter = ZFilter(env.observation_space.shape)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)),
                           name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize,
                               cold_lr=stepsize * (1 - 0.9),
                               momentum=0.9,
                               kfac_update=2,
                               epsilon=1e-2,
                               stats_decay=0.99,
                               async_=1,  # was 'async=1'; 'async' is a reserved keyword in Python 3.7+
                               cold_iter=1,
                               weight_decay_dict=policy.wd_dict,
                               max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss,
                                         loss_sampled,
                                         var_list=pi_var_list)
    do_update = tf_util.function(inputs, update_op)
    tf_util.initialize()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for queue_runner in [q_runner, value_fn.q_runner]:
        assert queue_runner is not None
        enqueue_threads.extend(
            queue_runner.create_threads(tf.get_default_session(),
                                        coord=coord,
                                        start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env,
                           policy,
                           max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0)
                                    and animate),
                           obfilter=obfilter)
            paths.append(path)
            timesteps_this_batch += path["reward"].shape[0]
            timesteps_so_far += path["reward"].shape[0]
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = value_fn.predict(path)
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        value_fn.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)
        # Adjust stepsize
        kl_loss = policy.compute_kl(ob_no, oldac_dist)
        if kl_loss > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize,
                                           stepsize / 1.5)).eval()
        elif kl_loss < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize,
                                           stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        logger.record_tabular(
            "EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular(
            "EpRewSEM",
            np.std([
                path["reward"].sum() / np.sqrt(len(paths)) for path in paths
            ]))
        logger.record_tabular(
            "EpLenMean", np.mean([path["reward"].shape[0] for path in paths]))
        logger.record_tabular("KL", kl_loss)
        if callback:
            callback()
        logger.dump_tabular()
        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
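
For reference, the advantage step above relies on `common.discount`, a reverse discounted cumulative sum, combined with the usual GAE(lambda) recipe. A minimal NumPy sketch of both, using hypothetical helper names rather than the library's own functions:

import numpy as np

def discount(x, gamma):
    # Reverse discounted cumulative sum: y[t] = x[t] + gamma * y[t + 1]
    y = np.zeros_like(x, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        y[t] = running
    return y

def gae_advantages(rewards, values, gamma, lam, terminated):
    # Append the bootstrap value (0 if the episode terminated), form the TD
    # residuals, then discount them by gamma * lam -- same as the loop above.
    vpred = np.append(values, 0.0 if terminated else values[-1])
    deltas = rewards + gamma * vpred[1:] - vpred[:-1]
    return discount(deltas, gamma * lam)
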
Example #2
def learn(env,
          policy,
          vf,
          gamma,
          lam,
          timesteps_per_batch,
          num_timesteps,
          max_path_length,
          animate=False,
          callback=None,
          desired_kl=0.002):

    obfilter = ZFilter(env.observation_space.shape)

    max_pathlength = max_path_length
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)),
                           name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize,
                               cold_lr=stepsize * (1 - 0.9),
                               momentum=0.9,
                               kfac_update=2,
                               epsilon=1e-2,
                               stats_decay=0.99,
                               async_=1,  # was 'async=1'; 'async' is a reserved keyword in Python 3.7+
                               cold_iter=1,
                               weight_decay_dict=policy.wd_dict,
                               max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss,
                                         loss_sampled,
                                         var_list=pi_var_list)
    do_update = U.function(inputs, update_op)
    U.initialize()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner]:
        assert qr is not None
        enqueue_threads.extend(
            qr.create_threads(tf.get_default_session(),
                              coord=coord,
                              start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env,
                           policy,
                           max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0)
                                    and animate),
                           obfilter=obfilter)
            paths.append(path)
            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)
        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        if kl > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize,
                                           stepsize / 1.5)).eval()
        elif kl < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize,
                                           stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        logger.record_tabular("n_timesteps", timesteps_so_far)
        logger.record_tabular(
            "AverageReturn", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular(
            "EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular(
            "EpRewSEM",
            np.std([
                path["reward"].sum() / np.sqrt(len(paths)) for path in paths
            ]))
        logger.record_tabular("EpLenMean",
                              np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)
        if callback:
            callback()
        logger.dump_tabular()
        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
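
Both variants above normalize observations with `ZFilter`, and Example #3 below restores its internal running statistics (`rs._n`, `rs._M`, `rs._S`) straight from .npz files. A Welford-style sketch of such a filter, with the field semantics assumed rather than taken from the library:

import numpy as np

class RunningStat:
    # Welford running mean/variance; `_n`, `_M`, `_S` mirror the fields
    # restored from the .npz files in Example #3 (assumed semantics).
    def __init__(self, shape):
        self._n = 0
        self._M = np.zeros(shape)   # running mean
        self._S = np.zeros(shape)   # running sum of squared deviations
    def push(self, x):
        self._n += 1
        if self._n == 1:
            self._M[...] = x
        else:
            old_m = self._M.copy()
            self._M[...] = old_m + (x - old_m) / self._n
            self._S[...] = self._S + (x - old_m) * (x - self._M)
    @property
    def std(self):
        var = self._S / (self._n - 1) if self._n > 1 else np.square(self._M)
        return np.sqrt(var)

class ZFilterSketch:
    # Normalize observations to roughly zero mean / unit variance.
    def __init__(self, shape, clip=10.0):
        self.rs, self.clip = RunningStat(shape), clip
    def __call__(self, x):
        self.rs.push(x)
        x = (x - self.rs._M) / (self.rs.std + 1e-8)
        return np.clip(x, -self.clip, self.clip)
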
Example #3
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy_sawyer.MlpPolicy(name=name,
                                           ob_space=ob_space,
                                           ac_space=ac_space,
                                           reuse=reuse,
                                           hid_size=args.policy_hidden_size,
                                           num_hid_layers=2)

    #------- Run policy for reaching ---------#
    play_env = robosuite.make(
        args.env_id,
        ignore_done=True,
        use_camera_obs=False,
        has_renderer=True,
        control_freq=100,
        gripper_visualization=True,
        #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
        #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
        box_pos=[0.23522776, 0.2287869, 0.82162434],  #shift3
        box_quat=[0.3775825618903728, 0, 0, 0.679425538604203],  #shift3
        #box_pos = [0.53522776, 0.3287869, 0.82162434], #shift4
        #box_quat=[0.5775825618903728, 0, 0, 0.679425538604203], #shift4
        #box_pos = [0.53522776, 0.1287869, 0.82162434], #shift5
        #box_quat=[0.4775825618903728, 0, 0, 0.679425538604203], #shift5
        #box_pos = [0.48522776, -0.187869, 0.82162434], #shift6
        #box_quat=[0.8775825618903728, 0, 0, 0.679425538604203], #shift6
        #box_pos = [0.43522776, -0.367869, 0.82162434], #shift7
        #box_quat=[0.2775825618903728, 0, 0, 0.679425538604203], #shift7
    )

    play_env = GymWrapper(play_env, keys=None,
                          generalized_goal=False)  # false for loading prevs

    #Weights are loaded from reach model grasp_strange

    #play_env.viewer.set_camera(camera_id=2) # Switch views for eval

    # Setup network
    # ----------------------------------------
    ob_space = play_env.observation_space
    ac_space = play_env.action_space
    pi_reach = policy_fn("pi", ob_space, ac_space, reuse=False)

    # Hack for loading policies using tensorflow
    init_op = tf.compat.v1.global_variables_initializer()
    saver = tf.compat.v1.train.Saver(max_to_keep=5)
    with tf.compat.v1.Session() as sess:
        sess.run(init_op)
        # Load Checkpoint
        #ckpt_path = './reach_and_grasp_weights/reach_one/trpo_gail.transition_limitation_2100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        #ckpt_path = './reach_and_grasp_weights/reach_shift2/trpo_gail.transition_limitation_2500.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        ckpt_path = './reach_and_grasp_weights/reach_3_almost/trpo_gail.transition_limitation_1100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        #ckpt_path = './reach_and_grasp_weights/reach_4/trpo_gail.transition_limitation_2400.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' # problem child 2
        #ckpt_path = './reach_and_grasp_weights/reach_5/trpo_gail.transition_limitation_2000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' #problem child 1
        #ckpt_path = './reach_and_grasp_weights/reach_6/trpo_gail.transition_limitation_2500.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        #ckpt_path = './reach_and_grasp_weights/reach_7/trpo_gail.transition_limitation_3000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        ckpt = tf.compat.v1.train.get_checkpoint_state(ckpt_path)
        saver.restore(sess, ckpt.model_checkpoint_path)

        # Create the playback environment

        _, _, last_ob, last_jpos, obs_array, jpos_array = runner_1_traj(
            play_env,
            pi_reach,
            None,
            timesteps_per_batch=3500,
            number_trajs=1,
            stochastic_policy=args.stochastic_policy,
            save=False)

    # Gripping load + setting up the last observation
    play_ob_dim = play_env.observation_space.shape[0]
    play_ac_dim = play_env.action_space.shape[0]
    grip_policy = GaussianMlpPolicy(play_ob_dim, play_ac_dim)
    grip_vf = NeuralNetValueFunction(play_ob_dim, play_ac_dim)
    grip_saver = tf.compat.v1.train.Saver(max_to_keep=5)

    unchanged_ob = np.float32(np.zeros(play_ob_dim))

    with tf.compat.v1.Session() as sess2:
        sess2.run(init_op)
        # Load Checkpoint
        #ckpt_path = './reach_and_grasp_weights/reach_one/trpo_gail.transition_limitation_2100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        #grip_ckpt_path = './reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift'
        grip_ckpt_path = './reach_and_grasp_weights/grasp_3/grasp_acktr_rl.transition_limitation_1000.SawyerLift/'  #3rd grasp
        #ckpt_path = './reach_and_grasp_weights/reach_4/trpo_gail.transition_limitation_2400.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' # problem child 2
        #ckpt_path = './reach_and_grasp_weights/reach_5/trpo_gail.transition_limitation_2000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' #problem child 1
        #ckpt_path = './reach_and_grasp_weights/reach_6/trpo_gail.transition_limitation_2500.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        #ckpt_path = './reach_and_grasp_weights/reach_7/trpo_gail.transition_limitation_3000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        grip_ckpt = tf.compat.v1.train.get_checkpoint_state(grip_ckpt_path)

        #print(grip_ckpt)

        grip_saver.restore(sess2, grip_ckpt.model_checkpoint_path)

        tt = 0

        cum_rew = 0

        #ob = last_ob
        #prev_ob = np.float32(np.zeros(ob.shape)) # check if indeed starts at all zeros

        obfilter = ZFilter(play_env.observation_space.shape)

        #statsu = np.load("./reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22953000.npz") # shift 2, is a problem?

        #statsu = np.load("./reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22953000.npz") # shift 2

        statsu = np.load(
            "./reach_and_grasp_weights/grasp_3/grasp_acktr_rl.transition_limitation_1000.SawyerLift/filter_stats_21002000.npz"
        )  # shift 3

        #statsu = np.load("./reach_and_grasp_weights/grasp_4/grasp_acktr_rl.transition_limitation_1200.SawyerLift/filter_stats_20162400.npz") # shift 4
        #statsu = np.load("./reach_and_grasp_weights/grasp_5/grasp_acktr_rl.transition_limitation_1200.SawyerLift/filter_stats_26066400.npz") #shift 5
        #statsu = np.load("./reach_and_grasp_weights/grasp_and_then_throws_somehow_6/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_27363000.npz") #shift 6
        #statsu = np.load("./reach_and_grasp_weights/grasp_pickup_7/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22773000.npz") #shift 7

        print("load n: ", statsu["n"])
        print("load M: ", statsu["M"])
        print("load S: ", statsu["S"])

        obfilter.rs._n = statsu["n"]
        obfilter.rs._M = statsu["M"]
        obfilter.rs._S = statsu["S"]

        print("obf n: ", obfilter.rs._n)
        print("obf M: ", obfilter.rs._M)
        print("obf S: ", obfilter.rs._S)

        #env.set_robot_joint_positions(last_jpos)
        #ob = np.concatenate((last_ob,env.box_end),axis=0)
        ob = last_ob

        # Will this change the behavior of loading?
        play_env.set_robot_joint_positions(last_jpos)

        prev_ob = np.float32(np.zeros(
            ob.shape))  # check if indeed starts at all zeros

        unchanged_ob = ob

        ob = obfilter(ob)

        while True:
            s = np.concatenate([ob, prev_ob], -1)
            ac, _, _ = grip_policy.act(s)

            prev_ob = np.copy(ob)

            scaled_ac = play_env.action_space.low + (ac + 1.) * 0.5 * (
                play_env.action_space.high - play_env.action_space.low)
            scaled_ac = np.clip(scaled_ac, play_env.action_space.low,
                                play_env.action_space.high)

            ob, rew, new, _ = play_env.step(scaled_ac)

            unchanged_ob = ob

            ob = obfilter(ob)

            cum_rew += rew

            play_env.render()  # check the running in for the first part
            #logger.log("rendering for reach policy")

            if new or tt >= 200:
                break
            tt += 1

        print("Cumulative reward over session: " + str(cum_rew))

    #obs_array = None
    #jpos_array = None

    #print(last_jpos)
    #print(last_ob)

    last_joint_after_grip = play_env._joint_positions
    last_after_grip = unchanged_ob
    # Change environment box position to start from the last position on playenv
    #env.model.box_pos_array = np.array(play_env.sim.data.body_xpos[play_env.cube_body_id])
    #env.model.box_quat_array = convert_quat(
    #        np.array(play_env.sim.data.body_xquat[play_env.cube_body_id]), to="xyzw"
    #    )
    #env.box_pos = play_env.model.box_pos_array
    #env.box_quat = play_env.model.box_quat_array

    # set up the environment for loading or training
    env = robosuite.make(
        args.env_id,
        ignore_done=True,
        use_camera_obs=False,
        has_renderer=True,
        control_freq=100,
        gripper_visualization=True,
        reward_shaping=True,
        rob_init=last_joint_after_grip,
        box_pos=np.array(
            play_env.sim.data.body_xpos[play_env.cube_body_id]),  #shift3
        box_quat=convert_quat(np.array(
            play_env.sim.data.body_xquat[play_env.cube_body_id]),
                              to="xyzw"),  #shift3
        #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
        #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
        #box_pos = [0.23522776, 0.2287869, 0.82162434], #shift3
        #box_quat=[0.3775825618903728, 0, 0, 0.679425538604203], #shift3
        box_end=[0.3, 0.1, 1.0]  # shift 3
        #box_pos = [0.53522776, 0.3287869, 0.82162434], #shift4, try to increase traj limit to 2000
        #box_quat=[0.5775825618903728, 0, 0, 0.679425538604203], #shift4
        #box_pos = [0.53522776, 0.1287869, 0.82162434], #shift5
        #box_quat=[0.4775825618903728, 0, 0, 0.679425538604203], #shift5
        #box_pos = [0.48522776, -0.187869, 0.82162434], #shift6
        #box_quat=[0.8775825618903728, 0, 0, 0.679425538604203], #shift6
        #box_pos = [0.43522776, -0.367869, 0.82162434], #shift7
        #box_quat=[0.2775825618903728, 0, 0, 0.679425538604203], #shift7
    )  # Switch from gym to robosuite, also add reward shaping to see reach goal

    env = GymWrapper(env, keys=None,
                     generalized_goal=True)  # wrap in the gym environment

    task = 'train'
    #task = 'evaluate'
    #task = 'cont_training'

    #env = bench.Monitor(env, logger.get_dir() and
    #                    osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True)

    # Note: taking away the bench monitor wrapping allows rendering

    #env.seed(args.seed) # Sawyer does not have seed

    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    logger.log("log_directories: ", args.log_dir)
    logger.log("environment action space range: ",
               env.action_space)  #logging the action space

    if task == 'train':
        play_env.close()

        #init_op2 = tf.compat.v1.global_variables_initializer()

        #sess2 = tf.compat.v1.Session(config=tf.ConfigProto())
        #with tf.compat.v1.Session(config=tf.ConfigProto()) as sess2:
        #sess2.run(init_op)
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        lift_vf = NeuralNetValueFunction(ob_dim, ac_dim, name="lift_vf_aktr")
        lift_policy = GaussianMlpPolicy(ob_dim, ac_dim, name="lift_pi_aktr")

        #sess2.run(init_op2)

        old_acktr_learn(
            env,
            policy=lift_policy,
            vf=lift_vf,
            gamma=0.99,
            lam=0.97,
            timesteps_per_batch=1500,
            desired_kl=0.001,
            num_timesteps=args.num_timesteps,
            save_per_iter=args.save_per_iter,
            ckpt_dir=args.checkpoint_dir,
            traj_limitation=args.traj_limitation,
            last_ob=[last_after_grip],
            last_jpos=[last_joint_after_grip],
            animate=True,
            pi_name="lift_pi_aktr",
        )

        env.close()

    elif task == 'cont_training':

        play_env.close()

        # with tf.compat.v1.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]

        #with tf.compat.v1.variable_scope("vf_aktr"):
        cont_vf = NeuralNetValueFunction(ob_dim, ac_dim, name="lift_vf_aktr")
        #with tf.compat.v1.variable_scope("pi_aktr"):
        cont_policy = GaussianMlpPolicy(ob_dim, ac_dim, name="lift_pi_aktr")

        ckpt_path_cont = './checkpoint/grasp_acktr_rl.transition_limitation_1500.SawyerLift'

        stat_cont = "./checkpoint/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_19713000.npz"

        old_acktr_learn(
            env,
            policy=cont_policy,
            vf=cont_vf,
            gamma=0.99,
            lam=0.97,
            timesteps_per_batch=1500,
            desired_kl=0.001,
            num_timesteps=args.num_timesteps,
            save_per_iter=args.save_per_iter,
            ckpt_dir=args.checkpoint_dir,
            traj_limitation=args.traj_limitation,
            last_ob=obs_array,
            last_jpos=jpos_array,
            animate=True,
            cont_training=True,
            load_path=ckpt_path_cont,
            obfilter_load=stat_cont,
            pi_name="lift_pi_aktr",
        )

        env.close()

    elif task == 'evaluate':

        play_env.close()

        # with tf.compat.v1.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]

        eval_lift_vf = NeuralNetValueFunction(ob_dim,
                                              ac_dim,
                                              name="lift_vf_aktr")
        eval_lift_policy = GaussianMlpPolicy(ob_dim,
                                             ac_dim,
                                             name="lift_pi_aktr")

        saver_2 = tf.compat.v1.train.Saver(max_to_keep=5)

        #sess2 = tf.compat.v1.Session(config=tf.ConfigProto())
        #with tf.compat.v1.Session(config=tf.ConfigProto()) as sess3:
        with tf.compat.v1.Session() as sess3:
            sess3.run(init_op)

            ckpt_path_2 = './checkpoint/grasp_acktr_rl.transition_limitation_1500.SawyerLift'
            #ckpt_path_2 = './reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift' # shift 2
            #ckpt_path_2 = './reach_and_grasp_weights/grasp_3/grasp_acktr_rl.transition_limitation_1000.SawyerLift' # shift 3
            #ckpt_path_2 = './reach_and_grasp_weights/grasp_4/grasp_acktr_rl.transition_limitation_1200.SawyerLift' # shift 4
            #ckpt_path_2 = './reach_and_grasp_weights/grasp_5/grasp_acktr_rl.transition_limitation_1200.SawyerLift' # shift 5
            #ckpt_path_2 = './reach_and_grasp_weights/grasp_and_then_throws_somehow_6/grasp_acktr_rl.transition_limitation_1500.SawyerLift' #shift 6
            #ckpt_path_2 = './reach_and_grasp_weights/grasp_pickup_7/grasp_acktr_rl.transition_limitation_1500.SawyerLift' #shift 7
            ckpt_2 = tf.compat.v1.train.get_checkpoint_state(ckpt_path_2)
            saver_2.restore(sess3, ckpt_2.model_checkpoint_path)

            tt = 0

            cum_rew = 0

            #ob = last_ob
            #prev_ob = np.float32(np.zeros(ob.shape)) # check if indeed starts at all zeros

            obfilter = ZFilter(env.observation_space.shape)

            statsu = np.load(
                "./checkpoint/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_23493000.npz"
            )

            #statsu = np.load("./reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22953000.npz") # shift 2
            #statsu = np.load("./reach_and_grasp_weights/grasp_3/grasp_acktr_rl.transition_limitation_1000.SawyerLift/filter_stats_21002000.npz") # shift 3
            #statsu = np.load("./reach_and_grasp_weights/grasp_4/grasp_acktr_rl.transition_limitation_1200.SawyerLift/filter_stats_20162400.npz") # shift 4
            #statsu = np.load("./reach_and_grasp_weights/grasp_5/grasp_acktr_rl.transition_limitation_1200.SawyerLift/filter_stats_26066400.npz") #shift 5
            #statsu = np.load("./reach_and_grasp_weights/grasp_and_then_throws_somehow_6/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_27363000.npz") #shift 6
            #statsu = np.load("./reach_and_grasp_weights/grasp_pickup_7/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22773000.npz") #shift 7

            print("load n: ", statsu["n"])
            print("load M: ", statsu["M"])
            print("load S: ", statsu["S"])

            obfilter.rs._n = statsu["n"]
            obfilter.rs._M = statsu["M"]
            obfilter.rs._S = statsu["S"]

            print("obf n: ", obfilter.rs._n)
            print("obf M: ", obfilter.rs._M)
            print("obf S: ", obfilter.rs._S)

            env.set_robot_joint_positions(last_jpos)
            ob = np.concatenate((last_ob, env.box_end), axis=0)
            prev_ob = np.float32(np.zeros(
                ob.shape))  # check if indeed starts at all zeros

            ob = obfilter(ob)

            while True:
                s = np.concatenate([ob, prev_ob], -1)
                ac, _, _ = eval_lift_policy.act(s)

                prev_ob = np.copy(ob)

                scaled_ac = env.action_space.low + (ac + 1.) * 0.5 * (
                    env.action_space.high - env.action_space.low)
                scaled_ac = np.clip(scaled_ac, env.action_space.low,
                                    env.action_space.high)

                ob, rew, new, _ = env.step(scaled_ac)

                ob = obfilter(ob)

                cum_rew += rew

                env.render()  # check the running in for the first part
                #logger.log("rendering for reach policy")

                if new or tt >= args.traj_limitation:
                    break
                tt += 1

            print("Cumulative reward over session: " + str(cum_rew))

    env.close()
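
The playback loops above map the policy's raw actions in [-1, 1] onto the environment's action bounds before stepping. As a standalone helper (illustrative, not part of the original script):

import numpy as np

def scale_action(ac, low, high):
    # Affine map from [-1, 1] to [low, high], then clip for safety,
    # exactly as done before play_env.step(...) / env.step(...) above.
    scaled = low + (np.asarray(ac) + 1.0) * 0.5 * (high - low)
    return np.clip(scaled, low, high)

# e.g. scale_action(np.zeros(8), play_env.action_space.low, play_env.action_space.high)
# returns the midpoint of the action range.
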
Example #4
def learn(env,
          policy,
          vf,
          gamma,
          lam,
          timesteps_per_batch,
          num_timesteps,
          animate=False,
          callback=None,
          desired_kl=0.002,
          lr=0.03,
          momentum=0.9):
    ob_dim, ac_dim = policy.ob_dim, policy.ac_dim
    dbpi = GaussianMlpPolicy(ob_dim, ac_dim, 'dbp')
    oldpi = GaussianMlpPolicy(ob_dim, ac_dim, 'oe')
    dboldpi = GaussianMlpPolicy(ob_dim, ac_dim, 'doi')
    # with tf.variable_scope('dbp'):
    # with tf.variable_scope('oe'):
    # with tf.variable_scope('doi'):

    pi = policy

    do_std = U.function([], [pi.std_1a, pi.logstd_1a])

    kloldnew = oldpi.pd.kl(pi.pd)
    dbkloldnew = dboldpi.pd.kl(dbpi.pd)
    dist = meankl = tf.reduce_mean(kloldnew)
    dbkl = tf.reduce_mean(dbkloldnew)
    obfilter = ZFilter(env.observation_space.shape)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(lr)),
                           name='stepsize')
    inputs, loss, loss_sampled = policy.update_info

    var_list = [v for v in tf.global_variables() if "pi" in v.name]
    db_var_list = [v for v in tf.global_variables() if "dbp" in v.name]
    old_var_list = [v for v in tf.global_variables() if "oe" in v.name]
    db_old_var_list = [v for v in tf.global_variables() if "doi" in v.name]
    print(len(var_list), len(db_var_list), len(old_var_list),
          len(db_old_var_list))
    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(old_var_list, var_list)
        ])
    assign_db = U.function(
        [], [],
        updates=[
            tf.assign(db, o) for (db, o) in zipsame(db_var_list, var_list)
        ] + [
            tf.assign(dbold, dbnew)
            for (dbold, dbnew) in zipsame(db_old_var_list, old_var_list)
        ])

    assign_old_eq_newr = U.function(
        [], [],
        updates=[
            tf.assign(newv, oldv)
            for (oldv, newv) in zipsame(old_var_list, var_list)
        ])
    # assign_dbr = U.function([], [], updates=
    # [tf.assign(o, db) for (db, o) in zipsame(db_var_list, var_list)] +
    # [tf.assign(dbnew, dbold) for (dbold, dbnew) in zipsame(db_old_var_list, old_var_list)])

    klgrads = tf.gradients(dist, var_list)
    dbklgrads = tf.gradients(dbkl, db_var_list)
    p_grads = [tf.ones_like(v) for v in dbklgrads]

    get_flat = U.GetFlat(var_list)
    get_old_flat = U.GetFlat(old_var_list)
    set_from_flat = U.SetFromFlat(var_list)

    flat_tangent2 = tf.placeholder(dtype=tf.float32,
                                   shape=[None],
                                   name="flat_tan2")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents2 = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents2.append(tf.reshape(flat_tangent2[start:start + sz], shape))
        start += sz
    gvp2 = tf.add_n([
        tf.reduce_sum(g * tangent2)
        for (g, tangent2) in zipsame(dbklgrads, tangents2)
    ])
    gvp2_grads = tf.gradients(gvp2, db_var_list)

    neg_term = tf.add_n([
        tf.reduce_sum(g * tangent2)
        for (g, tangent2) in zipsame(gvp2_grads, tangents2)
    ]) / 2.
    ng1 = tf.gradients(neg_term, db_var_list)
    ng2 = tf.gradients(neg_term, db_old_var_list)

    neg_term_grads = [
        a + b for (a, b) in zip(tf.gradients(neg_term, db_var_list),
                                tf.gradients(neg_term, db_old_var_list))
    ]
    neg_term = neg_term_grads
    # neg_term = tf.concat(axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in neg_term_grads])

    pos_term = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(gvp2_grads, p_grads)
    ])
    pos_term_grads = [
        a + b for (a, b) in zip(tf.gradients(pos_term, db_var_list),
                                tf.gradients(pos_term, db_old_var_list))
    ]
    pos_term_sum = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(pos_term_grads, tangents2)
    ])
    pos_term_grads = tf.gradients(pos_term_sum, p_grads)
    pos_term = pos_term_grads
    # pos_term = tf.concat(axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in pos_term_grads])
    geo_term = [(p - n) * 0.5 for p, n in zip(pos_term, neg_term)]

    optim = kfac.KfacOptimizer(learning_rate=stepsize,
                               cold_lr=stepsize * (1 - 0.9),
                               momentum=momentum,
                               kfac_update=2,
                               epsilon=1e-2,
                               stats_decay=0.99,
                               async_=1,  # was 'async=1'; 'async' is a reserved keyword in Python 3.7+
                               cold_iter=1,
                               weight_decay_dict=policy.wd_dict,
                               max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    grads = optim.compute_gradients(loss, var_list=pi_var_list)
    update_op, q_runner = optim.minimize(loss,
                                         loss_sampled,
                                         var_list=pi_var_list)
    geo_term = [g1 + g2[0] for g1, g2 in zip(geo_term, grads)]
    geo_grads = list(zip(geo_term, var_list))
    update_geo_op, q_runner_geo = optim.apply_gradients(geo_grads)
    do_update = U.function(inputs, update_op)
    inputs_tangent = list(inputs) + [flat_tangent2]
    do_update_geo = U.function(inputs_tangent, update_geo_op)
    do_get_geo_term = U.function(inputs_tangent, [ng1, ng2])
    U.initialize()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner, q_runner_geo]:
        assert qr is not None
        enqueue_threads.extend(
            qr.create_threads(tf.get_default_session(),
                              coord=coord,
                              start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env,
                           policy,
                           max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0)
                                    and animate),
                           obfilter=obfilter)
            paths.append(path)
            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        assign_old_eq_new()  # set old parameter values to new parameter values
        assign_db()

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)
        # ft2 = get_flat() - get_old_flat()

        # assign_old_eq_newr() # assign back
        # gnp = do_get_geo_term(ob_no, action_na, standardized_adv_n, ft2)

        # def check_nan(bs):
        #     return [~np.isnan(b).all() for b in bs]

        # print(gnp[0])
        # print('.....asdfasdfadslfkadsjfaksdfalsdkfjaldskf')
        # print(gnp[1])
        # do_update_geo(ob_no, action_na, standardized_adv_n, ft2)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)
        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        # if kl > desired_kl * 2:
        #     logger.log("kl too high")
        #     tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval()
        # elif kl < desired_kl / 2:
        #     logger.log("kl too low")
        #     tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval()
        # else:
        #     logger.log("kl just right!")

        logger.record_tabular(
            "EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular(
            "EpRewSEM",
            np.std([
                path["reward"].sum() / np.sqrt(len(paths)) for path in paths
            ]))
        logger.record_tabular("EpLenMean",
                              np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)
        print(do_std())
        if callback:
            callback()
        logger.dump_tabular()
        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
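
The `gvp2` / `gvp2_grads` construction in Example #4 is the standard double-backward trick for Hessian-vector products of the KL term. A self-contained toy version of that trick (TF1-style graph mode as in the examples; the quadratic stand-in for the KL is purely illustrative):

import numpy as np
import tensorflow as tf

tf.compat.v1.disable_eager_execution()

theta = tf.Variable(np.array([1.0, 2.0], dtype=np.float32), name="theta")
kl = tf.reduce_sum(tf.square(theta))      # stand-in scalar; its Hessian is 2*I
tangent = tf.constant(np.array([1.0, 0.0], dtype=np.float32))

grad = tf.gradients(kl, [theta])[0]       # first backward pass: dKL/dtheta
gvp = tf.reduce_sum(grad * tangent)       # scalar gradient-vector product
hvp = tf.gradients(gvp, [theta])[0]       # second backward pass yields H @ tangent

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    print(sess.run(hvp))                  # -> [2., 0.]
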
Example #5
def learn(env,
          policy,
          vf,
          gamma,
          lam,
          timesteps_per_batch,
          num_timesteps,
          animate=False,
          callback=None,
          desired_kl=0.002,
          save_model_with_prefix=None,
          restore_model_from_file=None,
          outdir="/tmp/rosrl/experiments/continuous/acktr/"):

    obfilter = ZFilter(env.observation_space.shape)
    # Risto change
    max_pathlength = env.max_episode_steps
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)),
                           name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize,
                               cold_lr=stepsize * (1 - 0.9),
                               momentum=0.9,
                               kfac_update=2,
                               epsilon=1e-2,
                               stats_decay=0.99,
                               async_=1,
                               cold_iter=1,
                               weight_decay_dict=policy.wd_dict,
                               max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss,
                                         loss_sampled,
                                         var_list=pi_var_list)
    do_update = U.function(inputs, update_op)
    U.initialize()
    """
    Optionally resume training from a previously saved model if a checkpoint file is provided.
    """
    if restore_model_from_file:
        saver = tf.train.Saver()
        saver.restore(tf.get_default_session(), restore_model_from_file)
        logger.log("Loaded model from {}".format(restore_model_from_file))

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner]:
        assert qr is not None
        enqueue_threads.extend(
            qr.create_threads(tf.get_default_session(),
                              coord=coord,
                              start=True))

    i = 0
    timesteps_so_far = 0

    if save_model_with_prefix:
        # basePath = '/tmp/rosrl/' + str(env.__class__.__name__) +'/acktr/'
        summary_writer = tf.summary.FileWriter(outdir,
                                               graph=tf.get_default_graph())

    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env,
                           policy,
                           max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0)
                                    and animate),
                           obfilter=obfilter)
            paths.append(path)
            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)
        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        if kl > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize,
                                           stepsize / 1.5)).eval()
        elif kl < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize,
                                           stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        logger.record_tabular(
            "EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular(
            "EpRewSEM",
            np.std([
                path["reward"].sum() / np.sqrt(len(paths)) for path in paths
            ]))
        logger.record_tabular("EpLenMean",
                              np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)

        if callback:
            callback()
        logger.dump_tabular()
        """
        Save the model at every itteration
        """
        if save_model_with_prefix:
            if np.mean([path["reward"].sum() for path in paths]) > -50.0:
                # basePath = '/tmp/rosrl/' + str(env.__class__.__name__) +'/acktr/'
                summary = tf.Summary(value=[
                    tf.Summary.Value(tag="EpRewMean",
                                     simple_value=np.mean([
                                         path["reward"].sum() for path in paths
                                     ]))
                ])
                summary_writer.add_summary(summary, i)
                if not os.path.exists(outdir):
                    os.makedirs(outdir)
                modelF = outdir + '/' + save_model_with_prefix + "_afterIter_" + str(
                    i) + ".model"
                U.save_state(modelF)
                logger.log("Saved model to file :{}".format(modelF))

        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
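
Every variant in this section adapts the learning rate with the same KL heuristic; stripped of the TensorFlow assignments, it reduces to the following plain-Python rule (illustrative helper name):

def adjust_stepsize(stepsize, kl, desired_kl,
                    factor=1.5, min_stepsize=1e-8, max_stepsize=1.0):
    # Shrink when the policy moved too far, grow when it barely moved,
    # and leave it alone inside the [desired_kl / 2, 2 * desired_kl] band.
    if kl > desired_kl * 2:
        return max(min_stepsize, stepsize / factor)
    elif kl < desired_kl / 2:
        return min(max_stepsize, stepsize * factor)
    return stepsize
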
Example #6
def learn(env,
          policy,
          vf,
          gamma,
          lam,
          timesteps_per_batch,
          num_timesteps,
          animate=False,
          callback=None,
          desired_kl=0.002,
          fname='./training.ckpt'):

    mean_logger = setup_logger("Mean Logger", "log/episode_mean.txt")

    # print("Filter shape:  ", env.observation_space.shape)
    space = (env.observation_space.shape[0] * 2, )
    obfilter = ZFilter(space)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)),
                           name='stepsize')  #0.03
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize,
                               cold_lr=stepsize * (1 - 0.9),
                               momentum=0.9,
                               kfac_update=2,
                               epsilon=1e-2,
                               stats_decay=0.99,
                               async_=1,  # was 'async=1'; 'async' is a reserved keyword in Python 3.7+
                               cold_iter=1,
                               weight_decay_dict=policy.wd_dict,
                               max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss,
                                         loss_sampled,
                                         var_list=pi_var_list)
    do_update = U.function(inputs, update_op)
    U.initialize()

    #changes
    if fname is not None and tf.train.checkpoint_exists(fname):
        saver = tf.train.Saver()
        saver.restore(tf.get_default_session(), fname)
        logger.log("Model loaded from file {}".format(fname))

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner]:
        assert qr is not None, "QR is None"
        enqueue_threads.extend(
            qr.create_threads(tf.get_default_session(),
                              coord=coord,
                              start=True))

    i = 0
    timesteps_so_far = 0
    total_reward = float()
    while True:
        print("Timestep Number: %d of %d" % (timesteps_so_far, num_timesteps))
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        #Save model every 100 iterations
        if fname is not None and (i % 100 == 0):
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            saver = tf.train.Saver()
            saver.save(tf.get_default_session(), fname)
            logger.log("Model saved to file {}".format(fname))
            env.seed()

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        terminal_rew = []
        while True:
            path, temp_rew = rollout(env,
                                     policy,
                                     max_pathlength,
                                     animate=(len(paths) == 0 and (i % 10 == 0)
                                              and animate),
                                     obfilter=obfilter)
            paths.append(path)
            terminal_rew.append(np.array(temp_rew))
            n = pathlength(path)
            timesteps_this_batch += n
            if timesteps_this_batch > timesteps_per_batch:
                break
        timesteps_so_far += 1

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)
        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        if kl > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize,
                                           stepsize / 1.5)).eval()
        elif kl < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize,
                                           stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        terminal_rew = np.array(terminal_rew)
        rew_mean = np.mean([path.sum() for path in terminal_rew])
        rew_sem = np.std(
            [path.sum() / np.sqrt(len(terminal_rew)) for path in terminal_rew])
        len_mean = np.mean([path.shape[0] for path in terminal_rew])

        # rewList = []
        # for path in paths:
        #     trew = []
        #     rew_i = 0
        #     while True:
        #         trew.append(path["reward"][rew_i])
        #         rew_i += 11
        #         if rew_i > (len(path["reward"])-1):
        #             break
        #     rewList.append( np.array(trew) )
        # rewList = np.array(rewList)

        # rew_mean = np.mean([path.sum() for path in rewList])
        # rew_sem = np.std([path.sum()/np.sqrt(len(rewList)) for path in rewList])
        # len_mean = np.mean([path.shape[0] for path in rewList])

        # rew_mean = np.mean([path["reward"].sum() for path in paths])
        # rew_sem = np.std([path["reward"].sum()/np.sqrt(len(paths)) for path in paths])
        # len_mean = np.mean([pathlength(path) for path in paths])

        total_reward += rew_mean

        logger.record_tabular("EpRewMean", rew_mean)
        logger.record_tabular("EpRewSEM", rew_sem)
        logger.record_tabular("EpLenMean", len_mean)
        logger.record_tabular("TotalRewardMean", total_reward)
        logger.record_tabular("KL", kl)
        if callback:
            callback()
        logger.dump_tabular()

        mean_logger.info(
            "Result for episode {} of {}: Mean: {}, SEM: {}, Length: {}".
            format(timesteps_so_far, num_timesteps, rew_mean, rew_sem,
                   len_mean))

        i += 1

    if fname is not None:
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        saver = tf.train.Saver()
        saver.save(tf.get_default_session(), fname)
        logger.log("Model saved to file {}".format(fname))
        env.seed()
    coord.request_stop()
    coord.join(enqueue_threads)
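
Example #6 persists and restores the session with `tf.train.Saver` (at start-up, every 100 iterations, and at the end). A minimal standalone sketch of that save/restore pattern, using a hypothetical checkpoint path and TF1-style sessions:

import os
import tensorflow as tf

tf.compat.v1.disable_eager_execution()

ckpt_path = './training_sketch/model.ckpt'   # hypothetical location
w = tf.compat.v1.get_variable('w', shape=[2],
                              initializer=tf.compat.v1.zeros_initializer())
saver = tf.compat.v1.train.Saver()

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)
    saver.save(sess, ckpt_path)              # writes the model.ckpt.* files

with tf.compat.v1.Session() as sess:
    saver.restore(sess, ckpt_path)           # reload the weights into a new session
    print(sess.run(w))
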