Example No. 1
def test_function():
    tf.reset_default_graph()
    x = tf.placeholder(tf.int32, (), name="x")
    y = tf.placeholder(tf.int32, (), name="y")
    z = 3 * x + 2 * y
    lin = function([x, y], z, givens={y: 0})

    with single_threaded_session():
        initialize()

        assert lin(2) == 6
        assert lin(x=3) == 9
        assert lin(2, 2) == 10
        assert lin(x=2, y=3) == 12
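
A note on Example No. 1: function([x, y], z, givens={y: 0}) builds a Python callable that wraps sess.run, feeding the supplied arguments into the placeholders and falling back to the givens defaults for anything left out. A minimal sketch of such a wrapper, handling positional arguments only (make_function is an illustrative name, not the library's API):

import tensorflow as tf

def make_function(inputs, output, givens=None):
    givens = givens or {}
    def call(*args):
        feed = dict(givens)                   # start from the defaults
        feed.update(dict(zip(inputs, args)))  # positional args override them
        return tf.get_default_session().run(output, feed_dict=feed)
    return call

Used inside the session above, make_function([x, y], z, givens={y: 0})(2) would likewise return 6.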
Example No. 2
def test_set_value():
    a = tf.Variable(42.)
    with single_threaded_session():
        set_value(a, 5)
        assert a.eval() == 5
        g = tf.get_default_graph()
        g.finalize()
        set_value(a, 6)
        assert a.eval() == 6

        # test the test
        try:
            assert a.eval() == 7
        except AssertionError:
            pass
        else:
            assert False, "assertion should have failed"
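
A note on Example No. 2: the test calls set_value again after g.finalize(), so the helper cannot be creating new assign ops on every call. One way to get that behavior is to cache a placeholder and an assign op per variable on first use; a sketch of that idea (an illustration, not necessarily the library's exact implementation):

import tensorflow as tf

_value_setters = {}

def set_value_sketch(var, value):
    # the first call builds the ops; later calls only run them, so a finalized
    # graph is fine as long as the first call happened before finalize()
    if var not in _value_setters:
        ph = tf.placeholder(var.dtype, shape=var.get_shape())
        _value_setters[var] = (var.assign(ph), ph)
    assign_op, ph = _value_setters[var]
    tf.get_default_session().run(assign_op, feed_dict={ph: value})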
Example No. 3
def test_multikwargs():
    tf.reset_default_graph()
    x = tf.placeholder(tf.int32, (), name="x")
    with tf.variable_scope("other"):
        x2 = tf.placeholder(tf.int32, (), name="x")
    z = 3 * x + 2 * x2

    lin = function([x, x2], z, givens={x2: 0})
    with single_threaded_session():
        initialize()
        assert lin(2) == 6
        assert lin(2, 2) == 10
        expt_caught = False
        try:
            lin(x=2)
        except AssertionError:
            expt_caught = True
        assert expt_caught
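
A note on Example No. 3: both placeholders share the base name "x" (x.name is "x:0", x2.name is "other/x:0" because of the variable scope), so a keyword call like lin(x=2) is ambiguous and the wrapper is expected to reject it; that is what the AssertionError check verifies, while positional calls such as lin(2, 2) remain unambiguous.

print(x.name)   # "x:0"
print(x2.name)  # "other/x:0"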
Example No. 4
def run():
    # NOTE: this snippet relies on module-level names defined elsewhere
    # (env, env_id, observation_shape, in_size, out_size, log_dir, restore_dir,
    # dataset_size, commit_for); see the fuller variant in Example No. 5.
    with U.single_threaded_session() as sess:
        actor_model = DDPGSkill(observation_shape=(observation_shape, ),
                                skill_name="skill",
                                nb_actions=env.action_space.shape[-1])

        print("Assumption: Goal is 3d target location")

        pred_model = classifier(in_shape=in_size,
                                out_shape=out_size,
                                name="suc_pred_model",
                                sess=sess,
                                log_dir=log_dir)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)

        # restore actor
        actor_model.restore_skill(path=get_home_path(restore_dir), sess=sess)

        generate_data(env, env_id, log_dir, actor_model, dataset_size, commit_for)
Example No. 5
def run(env_id,
        render,
        log_dir,
        restore_dir,
        commit_for,
        train_epoch,
        batch_size=32,
        lr=1e-3,
        seed=0,
        dataset_size=2000):

    env = gym.make(env_id)
    observation_shape = env.observation_space.shape[-1]
    global in_size, out_size
    in_size = observation_shape
    out_size = observation_shape - 3

    set_global_seeds(seed)
    env.seed(seed)

    with U.single_threaded_session() as sess:
        actor_model = DDPGSkill(observation_shape=(observation_shape, ),
                                skill_name="skill",
                                nb_actions=env.action_space.shape[-1],
                                restore_path=restore_dir)

        print("Assumption: Goal is 3d target location")

        pred_model = regressor(in_shape=in_size,
                               out_shape=out_size,
                               name="suc_pred_model",
                               sess=sess,
                               log_dir=log_dir)

        init_op = tf.group(
            tf.global_variables_initializer(),
            tf.local_variables_initializer(),
            # train_iter.initializer, test_iter.initializer
        )
        sess.run(init_op)

        # restore actor
        actor_model.restore_skill(path=get_home_path(
            osp.expanduser(restore_dir)),
                                  sess=sess)

        generate_data(env, env_id, log_dir, actor_model, dataset_size,
                      commit_for, render)

        exit(1)
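        # NOTE: the exit(1) above stops execution here, so the dataset creation
        # and model-training code below is unreachable as written.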
        ## creating dataset tensors
        csv_filename = osp.join(log_dir, "%s.csv" % env_id)
        # base_dataset = tf.data.TextLineDataset(csv_filename)

        # train_dataset = base_dataset.filter(in_training_set).map(decode_line).shuffle(buffer_size=5*batch_size, seed =seed).repeat().batch(batch_size)
        # train_iter = train_dataset.make_initializable_iterator()
        # train_el = train_iter.get_next()

        # test_dataset = base_dataset.filter(in_test_set).map(decode_line).batch(batch_size)
        # test_iter = test_dataset.make_initializable_iterator()
        # test_el = test_iter.get_next()

        ##
        base_dataset = pd.read_csv(csv_filename)
        train, test = train_test_split(base_dataset, test_size=0.2)
        # print(train.shape, test.shape)

        # whiten
        train_mean = np.mean(train, axis=0)
        train_std = np.std(train, axis=0)

        # save mean and var
        statistics = np.concatenate((train_mean, train_std))
        with open(osp.join(log_dir, "%s_stat.npy" % env_id), 'wb') as f:
            np.save(f, statistics)
        # create pd
        train_dataset = ((train - train_mean) / train_std)
        test_dataset = ((test - train_mean) / train_std)
        test_dataset = test_dataset.values
        test_dataset = [test_dataset[:, :in_size], test_dataset[:, in_size:]]
        ####

        print(train_dataset.shape, test_dataset[0].shape)
        pred_model.train(train_epoch, batch_size, lr, train_dataset,
                         test_dataset)
        pred_model.save()
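
A note on the whitening step above: the "%s_stat.npy" % env_id file stores the training mean followed by the training standard deviation, concatenated into a single array. A small hypothetical loader for reusing those statistics later (raw_row stands in for one unnormalized data row):

stats = np.load(osp.join(log_dir, "%s_stat.npy" % env_id))
train_mean, train_std = np.split(stats, 2)   # equal-length halves
normalized_row = (raw_row - train_mean) / train_std
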
def test(env,
         render_eval,
         reward_scale,
         param_noise,
         actor,
         critic,
         normalize_returns,
         normalize_observations,
         critic_l2_reg,
         actor_lr,
         critic_lr,
         action_noise,
         popart,
         gamma,
         clip_norm,
         nb_eval_steps,
         batch_size,
         memory,
         tau=0.01,
         eval_env=None,
         param_noise_adaption_interval=50,
         **kwargs):

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high

    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 False,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)

    saver = tf.train.Saver()

    writer = imageio.get_writer('/tmp/0.mp4', fps=10)  # apparently unused; vid_writer/cam_writer below do the actual writing

    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        ## restore
        restore_dir = osp.join(kwargs["restore_dir"], "model")
        if (restore_dir is not None):
            print('Restore path : ', restore_dir)
            checkpoint = tf.train.get_checkpoint_state(restore_dir)
            if checkpoint and checkpoint.model_checkpoint_path:

                saver.restore(U.get_session(),
                              checkpoint.model_checkpoint_path)
                print("checkpoint loaded:", checkpoint.model_checkpoint_path)
                tokens = checkpoint.model_checkpoint_path.split("-")[-1]
                # set global step
                global_t = int(tokens)
                print(">>> global step set:", global_t)
            else:
                print(">>>no checkpoint file found")

        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []

        # Evaluate.
        eval_episode_rewards = []
        eval_episode_rewards_history = []
        eval_episode_success = []

        logdir = logger.get_dir()
        assert logdir is not None
        # create output directories if they do not exist yet
        os.makedirs(osp.join(logdir, 'vis'), exist_ok=True)
        os.makedirs(osp.join(logdir, 'cam'), exist_ok=True)

        #logdir = logger.get_dir()
        vidpath = osp.join(logdir, 'vis/0.mp4')
        campath = osp.join(logdir, 'cam/0.mp4')
        vid_writer = imageio.get_writer(vidpath, fps=10)
        cam_writer = imageio.get_writer(campath, fps=10)

        for i in range(100):
            print("Evaluating:%d" % (i + 1))
            eval_episode_reward = 0.
            eval_obs = eval_env.reset()
            eval_done = False

            while (not eval_done):
                eval_action, eval_q = agent.pi(eval_obs,
                                               apply_noise=False,
                                               compute_Q=True)
                eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                    max_action * eval_action
                )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])

                # print(eval_obs, max_action*eval_action, eval_info)

                #if render_eval:
                #    eval_env.render()
                if render_eval:
                    eval_env.render(writer=vid_writer)

                    #let's draw the bounding box...
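                    # assumed box layout: [x_min, x_max, y_min, y_max] in pixel
                    # coordinates, inferred from the slicing below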
                    box = eval_env.last_box
                    box = np.minimum(box, 63)
                    box = box.astype(np.int32)

                    img = eval_obs[1]
                    img[box[2], box[0]:box[1], :] = 0
                    img[box[3], box[0]:box[1], :] = 0
                    img[box[2]:box[3], box[0], :] = 0
                    img[box[2]:box[3], box[1], :] = 0
                    img = (img * 255.0).astype(np.uint8)

                    cam_writer.append_data(img)
                    #eval_env.render(writer = writer)
                    #sleep(0.1)

                eval_episode_reward += eval_r

            print("episode reward::%f" % eval_episode_reward)

            eval_episode_rewards.append(eval_episode_reward)
            eval_episode_rewards_history.append(eval_episode_reward)
            eval_episode_success.append(eval_info["done"] == "goal reached")
            eval_episode_reward = 0.

        print("episode reward - mean:%.4f, var:%.4f, success:%.4f" %
              (np.mean(eval_episode_rewards), np.var(eval_episode_rewards),
               np.mean(eval_episode_success)))

    cam_writer.close()
    vid_writer.close()
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_episodes,
          batch_size,
          memory,
          tau=0.05,
          eval_env=None,
          param_noise_adaption_interval=50,
          **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high

    if "dologging" in kwargs:
        dologging = kwargs["dologging"]
    else:
        dologging = True

    if "tf_sum_logging" in kwargs:
        tf_sum_logging = kwargs["tf_sum_logging"]
    else:
        tf_sum_logging = False

    if "invert_grad" in kwargs:
        invert_grad = kwargs["invert_grad"]
    else:
        invert_grad = False

    if "actor_reg" in kwargs:
        actor_reg = kwargs["actor_reg"]
    else:
        actor_reg = False

    if dologging:
        logger.debug(
            'scaling actions by {} before executing in env'.format(max_action))

    if kwargs['look_ahead']:
        look_ahead = True
        look_ahead_planner = Planning_with_memories(
            skillset=kwargs['my_skill_set'],
            env=env,
            num_samples=kwargs['num_samples'])
        exploration = LinearSchedule(schedule_timesteps=int(nb_epochs *
                                                            nb_epoch_cycles),
                                     initial_p=1.0,
                                     final_p=kwargs['exploration_final_eps'])
    else:
        look_ahead = False

    if kwargs['skillset']:
        action_shape = (kwargs['my_skill_set'].len +
                        kwargs['my_skill_set'].num_params, )
    else:
        action_shape = env.action_space.shape

    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 action_shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale,
                 inverting_grad=invert_grad,
                 actor_reg=actor_reg)

    if dologging and MPI.COMM_WORLD.Get_rank() == 0:
        logger.debug('Using agent with the following configuration:')
        logger.debug(str(agent.__dict__.items()))

    # should have saver for all thread to restore. But dump only using 1 saver
    saver = tf.train.Saver(keep_checkpoint_every_n_hours=2,
                           max_to_keep=20,
                           save_relative_paths=True)
    save_freq = kwargs["save_freq"]

    # step = 0
    global_t = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)

    ## get the session with the current graph => identical graph is used for each session
    with U.single_threaded_session() as sess:
        # Set summary saver
        if dologging and tf_sum_logging and rank == 0:
            tf.summary.histogram("actor_grads", agent.actor_grads)
            tf.summary.histogram("critic_grads", agent.critic_grads)
            actor_trainable_vars = actor.trainable_vars
            for var in actor_trainable_vars:
                tf.summary.histogram(var.name, var)
            critic_trainable_vars = critic.trainable_vars
            for var in critic_trainable_vars:
                tf.summary.histogram(var.name, var)

            tf.summary.histogram("actions_out", agent.actor_tf)
            tf.summary.histogram("critic_out", agent.critic_tf)
            tf.summary.histogram("target_Q", agent.target_Q)

            summary_var = tf.summary.merge_all()
            writer_t = tf.summary.FileWriter(
                osp.join(logger.get_dir(), 'train'), sess.graph)
        else:
            summary_var = tf.no_op()

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        ## restore
        if kwargs['skillset']:
            ## restore skills
            my_skill_set = kwargs['my_skill_set']
            my_skill_set.restore_skillset(sess=sess)
        else:
            my_skill_set = None  # avoids a NameError in the rollout loop below
        ## restore current controller
        if kwargs["restore_dir"] is not None:
            restore_dir = osp.join(kwargs["restore_dir"], "model")
            if (restore_dir is not None) and rank == 0:
                print('Restore path : ', restore_dir)
                model_checkpoint_path = read_checkpoint_local(restore_dir)
                if model_checkpoint_path:
                    saver.restore(U.get_session(), model_checkpoint_path)
                    logger.info("checkpoint loaded:" +
                                str(model_checkpoint_path))
                    tokens = model_checkpoint_path.split("-")[-1]
                    # set global step
                    global_t = int(tokens)
                    print(">>> global step set:", global_t)

        agent.reset()
        obs = env.reset()

        # maintained across epochs
        episodes = 0
        t = 0
        start_time = time.time()

        # creating vars. this is done to keep the syntax for deleting the list simple a[:] = []
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_actions = []
        epoch_actor_losses = []
        epoch_critic_losses = []
        if param_noise is not None:
            epoch_adaptive_distances = []

        eval_episode_rewards = []
        eval_episode_success = []

        # for each episode
        done = False
        episode_reward = 0.
        episode_step = 0

        ## containers for hierarchical hindsight
        if kwargs["her"]:
            logger.debug("-" * 50 + '\nWill create HER\n' + "-" * 50)
            # per episode
            states, pactions, sub_states = [], [], []

        print("Ready to go!")
        for epoch in range(global_t, nb_epochs):

            # stat containers
            epoch_episodes = 0.
            epoch_start_time = time.time()

            epoch_episode_rewards[:] = []
            epoch_episode_steps[:] = []
            epoch_actions[:] = []  # action mean: don't know if this indicates anything
            epoch_actor_losses[:] = []
            epoch_critic_losses[:] = []

            if param_noise is not None:
                epoch_adaptive_distances[:] = []

            eval_episode_rewards[:] = []
            eval_episode_success[:] = []

            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(
                        int(nb_rollout_steps / MPI.COMM_WORLD.Get_size())):
                    # print(rank, t_rollout)

                    # Predict next action.
                    # exploration check
                    if kwargs['look_ahead'] and (np.random.rand(
                    ) < exploration.value(epoch * nb_epoch_cycles + cycle)):
                        paction, planner_info = look_ahead_planner.create_plan(
                            obs)
                    else:
                        paction, _ = agent.pi(obs,
                                              apply_noise=True,
                                              compute_Q=True)

                    if (my_skill_set):
                        ## break actions into primitives and their params
                        primitives_prob = paction[:kwargs['my_skill_set'].len]
                        primitive_id = np.argmax(primitives_prob)

                        # print("skill chosen", primitive_id)
                        r = 0.
                        skill_obs = obs.copy()

                        if kwargs['her']:
                            curr_sub_states = [skill_obs.copy()]

                        for _ in range(kwargs['commit_for']):
                            action = my_skill_set.pi(
                                primitive_id=primitive_id,
                                obs=skill_obs.copy(),
                                primitive_params=paction[my_skill_set.len:])
                            # Execute next action.
                            if rank == 0 and render:
                                sleep(0.1)
                                env.render()
                            assert max_action.shape == action.shape
                            new_obs, skill_r, done, info = env.step(
                                max_action * action
                            )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                            r += skill_r

                            if kwargs['her']:
                                curr_sub_states.append(new_obs.copy())

                            skill_obs = new_obs
                            if done or my_skill_set.termination(
                                    new_obs,
                                    primitive_id,
                                    primitive_params=paction[my_skill_set.
                                                             len:]):
                                break

                        # assuming the skill is trained from different reward signal
                        r = skill_r

                    else:
                        action = paction
                        # Execute next action.
                        if rank == 0 and render:
                            env.render()
                        assert max_action.shape == action.shape
                        new_obs, r, done, info = env.step(
                            max_action * action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])

                    assert action.shape == env.action_space.shape

                    t += 1

                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(paction)
                    agent.store_transition(obs, paction, r, new_obs, done)

                    # storing info for hindsight
                    if kwargs['her']:
                        states.append(obs.copy())
                        pactions.append(paction.copy())
                        sub_states.append(curr_sub_states)

                    # print(planner_info['next_state'][:6], new_obs[:6])

                    obs = new_obs

                    if done:
                        # Episode done.
                        # update stats
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        epoch_episodes += 1
                        episodes += 1
                        # reinit
                        episode_reward = 0.
                        episode_step = 0
                        agent.reset()
                        obs = env.reset()

                        if kwargs["her"]:
                            # logger.info("-"*50 +'\nCreating HER\n' + "-"*50)

                            # create hindsight experience replay
                            if kwargs['skillset']:
                                her_states, her_rewards = env.apply_hierarchical_hindsight(
                                    states, pactions, new_obs.copy(),
                                    sub_states)
                            else:
                                her_states, her_rewards = env.apply_hindsight(
                                    states, pactions, new_obs.copy())

                            ## store her transitions: her_states: n+1, her_rewards: n
                            for her_i in range(len(her_states) - 2):
                                agent.store_transition(her_states[her_i],
                                                       pactions[her_i],
                                                       her_rewards[her_i],
                                                       her_states[her_i + 1],
                                                       False)
                            #store last transition
                            agent.store_transition(her_states[-2],
                                                   pactions[-1],
                                                   her_rewards[-1],
                                                   her_states[-1], True)

                            ## refresh the storage containers
                            states[:], pactions[:] = [], []
                            if kwargs['skillset']:
                                sub_states[:] = []

                # print(rank, "Training!")
                # Train.
                for t_train in range(nb_train_steps):
                    # print(rank, t_train)
                    # Adapt param noise, if necessary.
                    if (memory.nb_entries >= batch_size) and (
                            t % param_noise_adaption_interval
                            == 0) and (param_noise is not None):
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al, current_summary = agent.train(summary_var)
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                    if dologging and tf_sum_logging and rank == 0:
                        writer_t.add_summary(
                            current_summary,
                            epoch * nb_epoch_cycles * nb_train_steps +
                            cycle * nb_train_steps + t_train)

            # print("Evaluating!")
            # Evaluate after training is done.
            if (eval_env is not None) and rank == 0:
                for _ in range(nb_eval_episodes):
                    eval_episode_reward = 0.
                    eval_obs = eval_env.reset()
                    eval_obs_start = eval_obs.copy()
                    eval_done = False
                    while (not eval_done):
                        eval_paction, _ = agent.pi(eval_obs,
                                                   apply_noise=False,
                                                   compute_Q=False)

                        if (kwargs['skillset']):
                            ## break actions into primitives and their params
                            eval_primitives_prob = eval_paction[:kwargs[
                                'my_skill_set'].len]
                            eval_primitive_id = np.argmax(eval_primitives_prob)

                            eval_r = 0.
                            eval_skill_obs = eval_obs.copy()
                            for _ in range(kwargs['commit_for']):
                                eval_action = my_skill_set.pi(
                                    primitive_id=eval_primitive_id,
                                    obs=eval_skill_obs.copy(),
                                    primitive_params=eval_paction[my_skill_set.
                                                                  len:])

                                eval_new_obs, eval_skill_r, eval_done, eval_info = eval_env.step(
                                    max_action * eval_action
                                )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])

                                if render_eval:
                                    eval_env.render()

                                eval_r += eval_skill_r
                                # check for skill termination or episode termination
                                eval_terminate_skill = my_skill_set.termination(
                                    eval_new_obs,
                                    eval_primitive_id,
                                    primitive_params=eval_paction[my_skill_set.
                                                                  len:])
                                if eval_done or eval_terminate_skill:
                                    break

                                eval_skill_obs = eval_new_obs

                            # hack assuming the skills are trained from diff reward signal
                            eval_r = eval_skill_r

                        else:
                            eval_action = eval_paction  # eval_pq is never computed above (compute_Q=False)
                            eval_new_obs, eval_r, eval_done, eval_info = eval_env.step(
                                max_action * eval_action)

                        eval_episode_reward += eval_r
                        eval_obs = eval_new_obs

                    eval_episode_rewards.append(eval_episode_reward)
                    eval_episode_rewards_history.append(eval_episode_reward)
                    eval_episode_success.append(
                        eval_info["done"] == "goal reached")
                    if (eval_info["done"] == "goal reached"):
                        logger.info(
                            "success, training epoch:%d,starting config:" %
                            epoch, eval_obs_start, 'final state', eval_obs)

            if dologging and rank == 0:
                print("Logging!")
                # Log stats.
                epoch_train_duration = time.time() - epoch_start_time
                duration = time.time() - start_time
                stats = agent.get_stats()
                combined_stats = {}
                for key in sorted(stats.keys()):
                    combined_stats[key] = normal_mean(stats[key])

                # Rollout statistics.
                combined_stats['rollout/return'] = normal_mean(
                    epoch_episode_rewards)
                if len(episode_rewards_history) > 0:
                    combined_stats['rollout/return_history'] = normal_mean(
                        np.mean(episode_rewards_history))
                else:
                    combined_stats['rollout/return_history'] = 0.
                combined_stats['rollout/episode_steps'] = normal_mean(
                    epoch_episode_steps)
                combined_stats['rollout/episodes'] = np.sum(epoch_episodes)
                combined_stats['rollout/actions_mean'] = normal_mean(
                    epoch_actions)
                combined_stats['rollout/actions_std'] = normal_std(
                    epoch_actions)

                # Train statistics.
                combined_stats['train/loss_actor'] = normal_mean(
                    epoch_actor_losses)
                combined_stats['train/loss_critic'] = normal_mean(
                    epoch_critic_losses)
                if param_noise is not None:
                    combined_stats['train/param_noise_distance'] = normal_mean(
                        epoch_adaptive_distances)

                if kwargs['look_ahead']:
                    combined_stats['train/exploration'] = exploration.value(
                        epoch * nb_epoch_cycles + cycle)

                # Evaluation statistics.
                if eval_env is not None:
                    combined_stats['eval/return'] = normal_mean(
                        eval_episode_rewards)
                    combined_stats['eval/success'] = normal_mean(
                        eval_episode_success)
                    if len(eval_episode_rewards_history) > 0:
                        combined_stats['eval/return_history'] = normal_mean(
                            np.mean(eval_episode_rewards_history))
                    else:
                        combined_stats['eval/return_history'] = 0.
                    combined_stats['eval/episodes'] = normal_mean(
                        len(eval_episode_rewards))

                # Total statistics.
                combined_stats['total/duration'] = normal_mean(duration)
                combined_stats['total/rollout_per_second'] = normal_mean(
                    float(t) / float(duration))
                combined_stats['total/episodes'] = normal_mean(episodes)
                combined_stats['total/epochs'] = epoch + 1
                combined_stats['total/steps'] = t

                for key in sorted(combined_stats.keys()):
                    logger.record_tabular(key, combined_stats[key])
                logger.dump_tabular()
                logger.info('')
                logdir = logger.get_dir()

                # if rank == 0 and logdir:
                #     print("Dumping progress!")
                #     if hasattr(env, 'get_state'):
                #         with open(osp.join(logdir, 'env_state.pkl'), 'wb') as f:
                #             pickle.dump(env.get_state(), f)
                #     if eval_env and hasattr(eval_env, 'get_state'):
                #         with open(osp.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                #             pickle.dump(eval_env.get_state(), f)

                ## save tf model
                if rank == 0 and (epoch + 1) % save_freq == 0:
                    print("Saving the model!")
                    os.makedirs(osp.join(logdir, "model"), exist_ok=True)
                    saver.save(U.get_session(),
                               logdir + "/model/ddpg",
                               global_step=epoch)
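
A note on the exploration schedule used in train() above: LinearSchedule(schedule_timesteps, initial_p, final_p) linearly anneals the exploration probability from initial_p to final_p over schedule_timesteps steps and then holds it constant. A minimal sketch with equivalent semantics (baselines ships its own class; the name here is illustrative):

class LinearScheduleSketch:
    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # fraction of the schedule elapsed, capped at 1.0 once annealing is done
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)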
Example No. 8
        # use successor model
        feed_dict = {
            self.obs0: [self.get_obs(obs=obs, params=primitive_params)]
        }
        next_obs = self.sess.run(self.next_state_pred, feed_dict)[0]

        # print("nn", next_obs_nn[:6])
        # print("pred", next_obs[:6])

        # append target
        target = obs[-3:]
        next_state = np.concatenate((next_obs, target))
        next_full_state = self.get_full_state(next_state, prev_obs=obs)

        return next_full_state, min_dist_idx


if __name__ == "__main__":
    from HER.skills import set10
    import numpy as np

    with U.single_threaded_session() as sess:
        sset = SkillSet(set10.skillset)
        sess.run(tf.global_variables_initializer())

        obs = np.random.rand(set10.skillset[0]['observation_shape'][0])
        sset.restore_skillset(sess)
        action, q = sset.pi(primitive_id=0, obs=obs)
        print(action.shape)
Example No. 9
def run(env_id,
        render,
        log_dir,
        train_epoch,
        batch_size=32,
        lr=1e-3,
        seed=0,
        whiten=False):

    env = gym.make(env_id)
    observation_shape = env.observation_space.shape[-1]
    global in_size, out_size
    in_size = observation_shape
    out_size = observation_shape - 3

    set_global_seeds(seed)
    # env.seed(seed)

    with U.single_threaded_session() as sess:

        ## creating dataset tensors
        csv_filename = osp.join(log_dir, "%s.csv" % env_id)

        ##
        base_dataset = np.loadtxt(csv_filename, delimiter=',')
        train, test = train_test_split(base_dataset, test_size=0.2)

        # NN error
        nn_error = get_nn_error(train, test, in_size)

        print("memory based nn error", nn_error)

        # whiten
        if whiten:
            train_feat_mean = np.mean(train, axis=0)
            train_feat_std = np.std(train, axis=0)

            # save mean and var
            statistics = np.concatenate((train_feat_mean, train_feat_std))
            with open(osp.join(log_dir, "%s_stat.npy" % env_id), 'wb') as f:
                np.save(f, statistics)

            # create pd
            train_dataset = ((train - train_feat_mean) /
                             (train_feat_std + eps))
            # print(train_dataset.shape, train_labels[:, np.newaxis].shape)
            train_dataset = pd.DataFrame(train_dataset)

            test_dataset = ((test - train_feat_mean) / (train_feat_std + eps))
            ####

            print(train_dataset.shape, test_dataset[0].shape)
            whiten_data = [train_feat_mean[in_size:], train_feat_std[in_size:]]
        else:
            # train_dataset = pd.DataFrame(np.concatenate((train_feat, train_labels[:, np.newaxis]),axis=1))
            train_dataset = pd.DataFrame(train)
            test_dataset = test  #pd.DataFrame(test)#[test[:, :-1], test[:,[-1]]]
            whiten_data = None

        pred_model = regressor(in_shape=in_size,
                               out_shape=out_size,
                               name="succmodel",
                               sess=sess,
                               log_dir=log_dir,
                               whiten_data=whiten_data)

        init_op = tf.group(
            tf.global_variables_initializer(),
            tf.local_variables_initializer(),
        )
        sess.run(init_op)

        pred_model.train(train_epoch, batch_size, lr, train_dataset,
                         test_dataset)
        pred_model.save()
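
A note on the get_nn_error(train, test, in_size) call above: it reports a memory-based nearest-neighbour baseline for the regressor. A hypothetical version of such a baseline, assuming the first in_size columns of each row are the inputs and the remaining columns are the regression targets:

import numpy as np
from scipy.spatial import cKDTree

def nn_baseline_error(train, test, in_size):
    tree = cKDTree(train[:, :in_size])            # index the training inputs
    _, idx = tree.query(test[:, :in_size], k=1)   # nearest training row per test row
    pred = train[idx, in_size:]                   # neighbour's targets as the prediction
    return np.mean(np.square(pred - test[:, in_size:]))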
Example No. 10
def run(env_id,
        render,
        log_dir,
        train_epoch,
        batch_size=32,
        lr=1e-3,
        seed=0,
        whiten=False):

    env = gym.make(env_id)
    observation_shape = env.observation_space.shape[-1]
    global in_size, out_size
    in_size = observation_shape
    out_size = 1

    set_global_seeds(seed)
    env.seed(seed)

    with U.single_threaded_session() as sess:

        pred_model = classifier(in_shape=in_size,
                                out_shape=out_size,
                                name="suc_pred_model",
                                sess=sess,
                                log_dir=log_dir)

        init_op = tf.group(
            tf.global_variables_initializer(),
            tf.local_variables_initializer(),
        )
        sess.run(init_op)

        ## creating dataset tensors
        csv_filename = osp.join(log_dir, "%s_data.csv" % env_id)

        ##
        base_dataset = np.loadtxt(csv_filename, delimiter=',')
        train, test = train_test_split(base_dataset, test_size=0.2)
        train_feat = train[:, :-1]
        train_labels = train[:, -1]
        # print(train.shape, test.shape)

        # whiten
        if whiten:
            train_feat_mean = np.mean(train_feat, axis=0)
            train_feat_std = np.std(train_feat, axis=0)

            # save mean and var
            statistics = np.concatenate((train_feat_mean, train_feat_std))
            with open(osp.join(log_dir, "%s_stat.npy" % env_id), 'wb') as f:
                np.save(f, statistics)

            # create pd
            train_feat_dataset = ((train_feat - train_feat_mean) /
                                  train_feat_std)
            print(train_feat_dataset.shape, train_labels[:, np.newaxis].shape)
            train_dataset = pd.DataFrame(
                np.concatenate(
                    (train_feat_dataset, train_labels[:, np.newaxis]), axis=1))

            test_feat_dataset = ((test[:, :-1] - train_feat_mean) /
                                 train_feat_std)
            test_dataset = [test_feat_dataset, test[:, [-1]]]
            ####

            print(train_dataset.shape, test_dataset[0].shape)
        else:
            train_dataset = pd.DataFrame(
                np.concatenate((train_feat, train_labels[:, np.newaxis]),
                               axis=1))
            test_dataset = [test[:, :-1], test[:, [-1]]]

        pred_model.train(train_epoch, batch_size, lr, train_dataset,
                         test_dataset)
        pred_model.save()
Example No. 11
def test(env, render_eval, reward_scale, param_noise, actor, critic,
    normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
    popart, gamma, clip_norm, nb_eval_steps, batch_size, memory,
    tau=0.01, eval_env=None, param_noise_adaption_interval=50, **kwargs):
    
    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    
       
    saver = tf.train.Saver()

    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        ## restore
        restore_dir = osp.join(kwargs["restore_dir"], "model")
        if (restore_dir is not None):
            print('Restore path : ', restore_dir)
            checkpoint = tf.train.get_checkpoint_state(restore_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                saver.restore(U.get_session(), checkpoint.model_checkpoint_path)
                print("checkpoint loaded:", checkpoint.model_checkpoint_path)
                tokens = checkpoint.model_checkpoint_path.split("-")[-1]
                # set global step
                global_t = int(tokens)
                print(">>> global step set:", global_t)
            else:
                print(">>>no checkpoint file found")
        
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
                
        # Evaluate.
        eval_episode_rewards = []
        eval_episode_rewards_history = []
        eval_episode_success = []
        for i in range(10):
            print("Evaluating:%d" % (i + 1))
            eval_episode_reward = 0.
            eval_obs = eval_env.reset()
            eval_done = False

            while not eval_done:
                eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                
                # print(eval_obs, max_action*eval_action, eval_info)
                if render_eval:
                    eval_env.render()
                    sleep(0.001)
                    
                eval_episode_reward += eval_r

            print("ended",eval_info["done"])
                
            print("episode reward::%f"%eval_episode_reward)
            
            eval_episode_rewards.append(eval_episode_reward)
            eval_episode_rewards_history.append(eval_episode_reward)
            eval_episode_success.append(eval_info["done"]=="goal reached")
            eval_episode_reward = 0.
            
        print("episode reward - mean:%.4f, var:%.4f, success:%.4f"%(np.mean(eval_episode_rewards), np.var(eval_episode_rewards), np.mean(eval_episode_success)))
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          additional_critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.05,
          eval_env=None,
          param_noise_adaption_interval=50,
          nb_eval_episodes=20,
          **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high

    if "dologging" in kwargs:
        dologging = kwargs["dologging"]
    else:
        dologging = True

    if "tf_sum_logging" in kwargs:
        tf_sum_logging = kwargs["tf_sum_logging"]
    else:
        tf_sum_logging = False

    if "invert_grad" in kwargs:
        invert_grad = kwargs["invert_grad"]
    else:
        invert_grad = False

    if "actor_reg" in kwargs:
        actor_reg = kwargs["actor_reg"]
    else:
        actor_reg = False

    if dologging:
        logger.info(
            'scaling actions by {} before executing in env'.format(max_action))
    agent = CDQ(actor,
                critic,
                additional_critic,
                memory,
                env.observation_space.shape,
                env.action_space.shape,
                gamma=gamma,
                tau=tau,
                normalize_returns=normalize_returns,
                normalize_observations=normalize_observations,
                batch_size=batch_size,
                action_noise=action_noise,
                param_noise=param_noise,
                critic_l2_reg=critic_l2_reg,
                actor_lr=actor_lr,
                critic_lr=critic_lr,
                enable_popart=popart,
                clip_norm=clip_norm,
                reward_scale=reward_scale,
                inverting_grad=invert_grad,
                actor_reg=actor_reg)
    if dologging: logger.debug('Using agent with the following configuration:')
    if dologging: logger.debug(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank != -1:
        saver = tf.train.Saver(keep_checkpoint_every_n_hours=2,
                               max_to_keep=5,
                               save_relative_paths=True)
        save_freq = kwargs["save_freq"]
    else:
        saver = None

    # step = 0
    global_t = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)

    with U.single_threaded_session() as sess:

        # Set summary saver
        if dologging and tf_sum_logging and rank == 0:
            tf.summary.histogram("actor_grads", agent.actor_grads)
            tf.summary.histogram("critic_grads", agent.critic_grads)
            actor_trainable_vars = actor.trainable_vars
            for var in actor_trainable_vars:
                tf.summary.histogram(var.name, var)
            critic_trainable_vars = critic.trainable_vars
            for var in critic_trainable_vars:
                tf.summary.histogram(var.name, var)

            tf.summary.histogram("actions_out", agent.actor_tf)
            tf.summary.histogram("critic_out", agent.critic_tf)
            tf.summary.histogram("target_Q", agent.target_Q)

            summary_var = tf.summary.merge_all()
            writer_t = tf.summary.FileWriter(
                osp.join(logger.get_dir(), 'train'), sess.graph)
        else:
            summary_var = tf.no_op()

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        #set_trace()
        ## restore
        if kwargs["restore_dir"] is not None:
            restore_dir = osp.join(kwargs["restore_dir"], "model")
            if (restore_dir is not None):
                print('Restore path : ', restore_dir)
                # checkpoint = tf.train.get_checkpoint_state(restore_dir)
                # if checkpoint and checkpoint.model_checkpoint_path:
                model_checkpoint_path = read_checkpoint_local(restore_dir)
                if model_checkpoint_path:
                    saver.restore(U.get_session(), model_checkpoint_path)
                    print("checkpoint loaded:", model_checkpoint_path)
                    logger.info("checkpoint loaded:" +
                                str(model_checkpoint_path))
                    tokens = model_checkpoint_path.split("-")[-1]
                    # set global step
                    global_t = int(tokens)
                    print(">>> global step set:", global_t)

        agent.reset()
        obs = env.reset()

        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0

        ## containers for hindsight
        if kwargs["her"]:
            # logger.info("-"*50 +'\nWill create HER\n' + "-"*50)
            states, actions = [], []

        print("Ready to go!")
        for epoch in range(global_t, nb_epochs):

            # stat containers
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []

            eval_episode_rewards = []
            eval_qs = []
            eval_episode_success = []

            for cycle in range(nb_epoch_cycles):
                # print("cycle:%d"%cycle)
                # Perform rollouts.
                for t_rollout in range(
                        int(nb_rollout_steps / MPI.COMM_WORLD.Get_size())):
                    # print(rank, t_rollout)
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    #if((t+1)%100) == 0:
                    #    print(max_action*action, new_obs, r)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                        sleep(0.1)
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)

                    ## storing info for hindsight
                    if kwargs["her"]:
                        states.append(obs.copy())
                        actions.append(action.copy())

                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        if kwargs["her"]:
                            # logger.info("-"*50 +'\nCreating HER\n' + "-"*50)

                            ## create hindsight experience replay
                            her_states, her_rewards = env.env.apply_hindsight(
                                states, actions, new_obs.copy())

                            ## store her transitions: her_states: n+1, her_rewards: n
                            for her_i in range(len(her_states) - 2):
                                agent.store_transition(her_states[her_i],
                                                       actions[her_i],
                                                       her_rewards[her_i],
                                                       her_states[her_i + 1],
                                                       False)
                            #store last transition
                            agent.store_transition(her_states[-2], actions[-1],
                                                   her_rewards[-1],
                                                   her_states[-1], True)

                            ## refresh the storage containers
                            del states, actions
                            states, actions = [], []

                        agent.reset()
                        obs = env.reset()
                        #print(obs)

                # print(rank, "Training!")
                # Train.

                for t_train in range(nb_train_steps):
                    # print(rank, t_train)
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al, current_summary = agent.train(summary_var)
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                    if dologging and tf_sum_logging and rank == 0:

                        writer_t.add_summary(
                            current_summary,
                            epoch * nb_epoch_cycles * nb_train_steps +
                            cycle * nb_train_steps + t_train)

                # print("Evaluating!")
                # Evaluate.

            if (eval_env is not None) and rank == 0:
                for _ in range(nb_eval_episodes):
                    eval_episode_reward = 0.
                    eval_obs = eval_env.reset()
                    eval_obs_start = eval_obs.copy()
                    eval_done = False
                    while (not eval_done):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            sleep(0.1)
                            print("Render!")

                            eval_env.render()
                            print("rendered!")
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)

                    eval_episode_rewards.append(eval_episode_reward)
                    eval_episode_rewards_history.append(eval_episode_reward)
                    eval_episode_success.append(
                        eval_info["done"] == "goal reached")
                    if (eval_info["done"] == "goal reached"):
                        logger.info(
                            "success, training epoch:%d,starting config:" %
                            epoch, eval_obs_start, 'final state', eval_obs)

            if dologging and rank == 0:
                print("Logging!")
                # Log stats.
                epoch_train_duration = time.time() - epoch_start_time
                duration = time.time() - start_time
                stats = agent.get_stats()
                combined_stats = {}
                for key in sorted(stats.keys()):
                    combined_stats[key] = normal_mean(stats[key])

                # Rollout statistics.
                combined_stats['rollout/return'] = normal_mean(
                    epoch_episode_rewards)
                if len(episode_rewards_history) > 0:
                    combined_stats['rollout/return_history'] = normal_mean(
                        np.mean(episode_rewards_history))
                else:
                    combined_stats['rollout/return_history'] = 0.
                combined_stats['rollout/episode_steps'] = normal_mean(
                    epoch_episode_steps)
                combined_stats['rollout/episodes'] = np.sum(epoch_episodes)
                combined_stats['rollout/actions_mean'] = normal_mean(
                    epoch_actions)
                combined_stats['rollout/actions_std'] = normal_std(
                    epoch_actions)
                combined_stats['rollout/Q_mean'] = normal_mean(epoch_qs)

                # Train statistics.
                combined_stats['train/loss_actor'] = normal_mean(
                    epoch_actor_losses)
                combined_stats['train/loss_critic'] = normal_mean(
                    epoch_critic_losses)
                combined_stats['train/param_noise_distance'] = normal_mean(
                    epoch_adaptive_distances)

                # Evaluation statistics.
                if eval_env is not None:
                    combined_stats['eval/return'] = normal_mean(
                        eval_episode_rewards)
                    combined_stats['eval/success'] = normal_mean(
                        eval_episode_success)
                    if len(eval_episode_rewards_history) > 0:
                        combined_stats['eval/return_history'] = normal_mean(
                            np.mean(eval_episode_rewards_history))
                    else:
                        combined_stats['eval/return_history'] = 0.
                    combined_stats['eval/Q'] = normal_mean(eval_qs)
                    combined_stats['eval/episodes'] = normal_mean(
                        len(eval_episode_rewards))

                # Total statistics.
                combined_stats['total/duration'] = normal_mean(duration)
                combined_stats['total/steps_per_second'] = normal_mean(
                    float(t) / float(duration))
                combined_stats['total/episodes'] = normal_mean(episodes)
                combined_stats['total/epochs'] = epoch + 1
                combined_stats['total/steps'] = t

                for key in sorted(combined_stats.keys()):
                    logger.record_tabular(key, combined_stats[key])
                logger.dump_tabular()
                logger.info('')
                logdir = logger.get_dir()
                if rank == 0 and logdir:
                    print("Dumping progress!")
                    if hasattr(env, 'get_state'):
                        with open(os.path.join(logdir, 'env_state.pkl'),
                                  'wb') as f:
                            pickle.dump(env.get_state(), f)
                    if eval_env and hasattr(eval_env, 'get_state'):
                        with open(os.path.join(logdir, 'eval_env_state.pkl'),
                                  'wb') as f:
                            pickle.dump(eval_env.get_state(), f)

                ## save tf model
                if rank == 0 and (epoch + 1) % save_freq == 0:
                    print("Saving the model!")
                    os.makedirs(osp.join(logdir, "model"), exist_ok=True)
                    saver.save(U.get_session(),
                               osp.join(logdir, "model", "cdq"),
                               global_step=epoch)
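
For reference, the epoch-end bookkeeping above follows the usual baselines pattern: gather every statistic into one flat dict, emit it with logger.record_tabular/dump_tabular, and checkpoint the TensorFlow session every save_freq epochs. A minimal sketch of that pattern is given below; the helper name log_and_checkpoint and its default save_freq are illustrative and not taken from the snippet, and saver is assumed to be an existing tf.train.Saver.

import os

from baselines import logger


def log_and_checkpoint(sess, saver, stats, epoch, save_freq=10):
    # Emit one tabular row per statistic, then flush the row to stdout/log files.
    for key in sorted(stats.keys()):
        logger.record_tabular(key, stats[key])
    logger.dump_tabular()

    # Checkpoint the session every `save_freq` epochs; global_step suffixes the file.
    logdir = logger.get_dir()
    if logdir and (epoch + 1) % save_freq == 0:
        os.makedirs(os.path.join(logdir, "model"), exist_ok=True)
        saver.save(sess, os.path.join(logdir, "model", "cdq"), global_step=epoch)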
Exemplo n.º 13
0
def test(env,
         render_eval,
         reward_scale,
         param_noise,
         actor,
         critic,
         normalize_returns,
         normalize_observations,
         critic_l2_reg,
         actor_lr,
         critic_lr,
         action_noise,
         popart,
         gamma,
         clip_norm,
         nb_eval_steps,
         batch_size,
         memory,
         tau=0.01,
         eval_env=None,
         param_noise_adaption_interval=50,
         **kwargs):

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high

    if kwargs['skillset']:
        action_shape = (kwargs['my_skill_set'].len +
                        kwargs['my_skill_set'].params, )
    else:
        action_shape = env.action_space.shape

    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 action_shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)


    # Restore only the actor and observation-normalization variables, selected by scope.
    var_list_restore = (
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="actor") +
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="obs_rms"))

    print(var_list_restore)
    saver = tf.train.Saver(var_list=var_list_restore)

    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        ## restore
        if kwargs['skillset']:
            ## restore skills
            my_skill_set = kwargs['my_skill_set']
            my_skill_set.restore_skillset(sess=sess)

        ## restore meta-controller weights
        if kwargs["restore_dir"] is not None:
            restore_dir = osp.join(kwargs["restore_dir"], "model")
            print('Restore path:', restore_dir)
            model_checkpoint_path = read_checkpoint_local(restore_dir)
            if model_checkpoint_path:
                saver.restore(U.get_session(), model_checkpoint_path)
                print("checkpoint loaded:", model_checkpoint_path)
                # recover the global step from the checkpoint suffix (e.g. "cdq-42")
                global_t = int(model_checkpoint_path.split("-")[-1])
                print(">>> global step set:", global_t)
            else:
                print(">>> no checkpoint file found")

        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []

        # Evaluate.
        eval_episode_rewards = []
        eval_episode_rewards_history = []
        eval_episode_success = []
        for i in range(100):
            print("Evaluating:%d" % (i + 1))
            eval_episode_reward = 0.
            eval_obs = eval_env.reset()
            print("start obs", eval_obs[:6], eval_obs[-3:])

            eval_done = False

            while (not eval_done):
                eval_paction, eval_pq = agent.pi(eval_obs,
                                                 apply_noise=False,
                                                 compute_Q=True)
                if (kwargs['skillset']):
                    ## break actions into primitives and their params
                    eval_primitives_prob = eval_paction[:my_skill_set.len]
                    eval_primitive_id = np.argmax(eval_primitives_prob)

                    print("skill chosen%d" % eval_primitive_id)
                    eval_r = 0.
                    eval_skill_obs = eval_obs.copy()
                    for _ in range(kwargs['commit_for']):
                        eval_action = my_skill_set.pi(
                            primitive_id=eval_primitive_id,
                            obs=eval_skill_obs.copy(),
                            primitive_params=eval_paction[
                                kwargs['my_skill_set'].len:])
                        eval_skill_new_obs, eval_skill_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])

                        eval_skill_obs = eval_skill_new_obs
                        eval_r += eval_skill_r
                        if render_eval:
                            eval_env.render()
                            sleep(0.1)

                        if eval_done or my_skill_set.termination(
                                eval_skill_new_obs,
                                eval_primitive_id,
                                primitive_params=eval_paction[my_skill_set.len:]):
                            break

                    eval_new_obs = eval_skill_new_obs

                else:
                    eval_action, q = eval_paction, eval_pq
                    eval_new_obs, eval_r, eval_done, eval_info = eval_env.step(
                        max_action * eval_action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])

                eval_episode_reward += eval_r
                eval_obs = eval_new_obs

            print("ended", eval_info["done"])

            print("episode reward::%f" % eval_episode_reward)

            eval_episode_rewards.append(eval_episode_reward)
            eval_episode_rewards_history.append(eval_episode_reward)
            eval_episode_success.append(eval_info["done"] == "goal reached")
            eval_episode_reward = 0.

        print("episode reward - mean:%.4f, var:%.4f, success:%.4f" %
              (np.mean(eval_episode_rewards), np.var(eval_episode_rewards),
               np.mean(eval_episode_success)))
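
For clarity, the skill-selection step inside the evaluation loop can be summarized on its own: the meta-controller emits one flat action vector whose first my_skill_set.len entries score the available skills and whose remaining entries are the parameters handed to the chosen skill. The helper below is a minimal illustrative sketch, not part of the original code; the real skill-set object exposes len and params itself.

import numpy as np


def decode_meta_action(paction, num_skills):
    # First `num_skills` entries: unnormalized skill preferences.
    skill_scores = paction[:num_skills]
    # Greedy skill choice, matching the np.argmax in the evaluation loop.
    skill_id = int(np.argmax(skill_scores))
    # Remaining entries: parameters passed to the selected skill's policy.
    skill_params = paction[num_skills:]
    return skill_id, skill_params

# Hypothetical usage: paction, _ = agent.pi(obs, apply_noise=False, compute_Q=True)
# skill_id, params = decode_meta_action(paction, my_skill_set.len)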