Exemplo n.º 1
0
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Set up environments, exploration noise and DDPG parts, then train.

    Args:
        env_id: Gym environment id handed to ``gym.make``.
        seed: Base random seed; every MPI rank offsets it by
            ``1000000 * rank``.
        noise_type: Comma-separated noise specs. Recognised entries are
            ``none``, ``adaptive-param_<stddev>``, ``normal_<stddev>`` and
            ``ou_<stddev>``. Entries are matched by substring, in that order.
        layer_norm: Forwarded to the ``Actor`` and ``Critic`` constructors.
        evaluation: When truthy, rank 0 additionally creates a monitored
            evaluation environment.
        **kwargs: Forwarded unchanged to ``training.train``.

    Raises:
        RuntimeError: If ``noise_type`` contains an unrecognised entry.
    """
    # Configure things: only rank 0 keeps its logger enabled.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    log_dir = logger.get_dir()
    # `log_dir and ...` passes the falsy value straight through when no log
    # directory is configured instead of calling os.path.join on it.
    env = bench.Monitor(env, log_dir and os.path.join(log_dir, str(rank)))
    gym.logger.setLevel(logging.WARN)

    eval_env = None
    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        # Same guard as for the training env: previously a missing log
        # directory made os.path.join raise a TypeError here.
        eval_env = bench.Monitor(
            eval_env, log_dir and os.path.join(log_dir, 'gym_eval'))

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible; the per-rank offset gives
    # each MPI worker a different seed.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, log_dir))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    start_time = time.time()
    try:
        training.train(env=env,
                       eval_env=eval_env,
                       param_noise=param_noise,
                       action_noise=action_noise,
                       actor=actor,
                       critic=critic,
                       memory=memory,
                       **kwargs)
    finally:
        # Close the environments even when training raises.
        env.close()
        if eval_env is not None:
            eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Exemplo n.º 2
0
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Prepare envs, noise, DDPG components and an optional skill set, then train.

    Args:
        env_id: Gym environment id used for the training environment.
        seed: Base seed; each MPI rank adds ``1000000 * rank`` to it.
        noise_type: Comma-separated noise specs: ``none``,
            ``adaptive-param_<stddev>``, ``normal_<stddev>``, ``ou_<stddev>``
            or ``epsnorm_<stddev>_<epsilon>`` (matched by substring, in that
            order).
        layer_norm: Passed to the ``Actor`` and ``Critic`` constructors.
        evaluation: When truthy, rank 0 builds an evaluation environment from
            ``kwargs['eval_env_id']`` (falling back to ``env_id``) and removes
            that key from ``kwargs``.
        **kwargs: Must contain ``look_ahead`` and ``skillset``; the remaining
            entries are forwarded to ``training.train``.

    Raises:
        RuntimeError: If ``noise_type`` contains an unrecognised entry.
    """
    # Silence every worker except rank 0.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Training environment, plus a short debug dump of its docstring.
    env = gym.make(env_id)
    for debug_line in ("Env info", env.__doc__, "-" * 20):
        logger.debug(debug_line)
    gym.logger.setLevel(logging.WARN)

    # Evaluation environment exists on rank 0 only.
    eval_env = None
    if evaluation and rank == 0:
        eval_env_id = kwargs['eval_env_id'] or env_id
        eval_env = gym.make(eval_env_id)
        # The id has been consumed; drop it before kwargs is forwarded.
        del kwargs['eval_env_id']

    # Translate the noise spec string into noise objects.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for spec in (piece.strip() for piece in noise_type.split(',')):
        if spec == 'none':
            continue
        if 'adaptive-param' in spec:
            _, stddev = spec.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in spec:
            _, stddev = spec.split('_')
            zero_mean = np.zeros(nb_actions)
            sigma = float(stddev) * np.ones(nb_actions)
            action_noise = NormalActionNoise(mu=zero_mean, sigma=sigma)
        elif 'ou' in spec:
            _, stddev = spec.split('_')
            zero_mean = np.zeros(nb_actions)
            sigma = float(stddev) * np.ones(nb_actions)
            action_noise = OrnsteinUhlenbeckActionNoise(mu=zero_mean,
                                                        sigma=sigma)
        elif 'epsnorm' in spec:
            _, stddev, epsilon = spec.split('_')
            zero_mean = np.zeros(nb_actions)
            sigma = float(stddev) * np.ones(nb_actions)
            action_noise = EpsilonNormalActionNoise(mu=zero_mean,
                                                    sigma=sigma,
                                                    epsilon=float(epsilon))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(spec))

    # Replay memory and networks.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Per-rank seed so that workers do not share a random stream.
    seed = seed + 1000000 * rank
    tf.reset_default_graph()

    # The skill set is only loaded when look-ahead planning is requested.
    my_skill_set = None
    if kwargs['look_ahead'] and kwargs['skillset']:
        module_name = "HER.skills.%s" % kwargs['skillset']
        skillset_file = __import__(module_name, fromlist=[''])
        my_skill_set = SkillSet(skillset_file.skillset)

    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    is_root = rank == 0
    if is_root:
        logger.info('rank {}: seed={}, logdir={}'.format(
            rank, seed, logger.get_dir()))
        start_time = time.time()

    training.train(env=env,
                   eval_env=eval_env,
                   param_noise=param_noise,
                   action_noise=action_noise,
                   actor=actor,
                   critic=critic,
                   memory=memory,
                   my_skill_set=my_skill_set,
                   **kwargs)

    env.close()
    if eval_env is not None:
        eval_env.close()
    if is_root:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def _ensure_dir(path):
    """Create ``path``; tolerate only the case where it already is a directory."""
    try:
        os.mkdir(path)
    except OSError:
        if not osp.isdir(path):
            raise


def test(env,
         render_eval,
         reward_scale,
         param_noise,
         actor,
         critic,
         normalize_returns,
         normalize_observations,
         critic_l2_reg,
         actor_lr,
         critic_lr,
         action_noise,
         popart,
         gamma,
         clip_norm,
         nb_eval_steps,
         batch_size,
         memory,
         tau=0.01,
         eval_env=None,
         param_noise_adaption_interval=50,
         **kwargs):
    """Restore a DDPG controller and roll it out for 100 evaluation episodes.

    A ``DDPG`` agent is built from the given components, the newest checkpoint
    under ``<restore_dir>/model`` is loaded when one exists, and the noise-free
    policy is then run in ``eval_env``. Per-episode rewards are printed,
    followed by the mean, variance and success rate over all episodes. Video
    writers for ``vis/0.mp4`` and ``cam/0.mp4`` are opened inside the logger
    directory; frames are only written when ``render_eval`` is truthy.

    Args:
        env: Environment supplying the observation/action spaces; its action
            space must be symmetric around zero.
        render_eval: When truthy, render every step into ``vis/0.mp4`` and
            append the camera image with the bounding box drawn on it to
            ``cam/0.mp4``.
        eval_env: Environment the episodes are run in. Required despite the
            ``None`` default.
        nb_eval_steps: Not used by this function.
        param_noise_adaption_interval: Not used by this function.
        **kwargs: ``restore_dir`` (optional) is the directory containing the
            ``model`` checkpoint folder. Other entries are ignored.
        (All remaining parameters are forwarded to the ``DDPG`` constructor.)

    Raises:
        ValueError: If ``eval_env`` is ``None``.
    """
    if eval_env is None:
        # Fail before building the graph rather than with an AttributeError
        # on the first reset().
        raise ValueError("test() needs an eval_env to run the episodes in")

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high

    # NOTE(review): the sixth positional argument (False) is passed through
    # as in the original call; its meaning is defined by DDPG - confirm there.
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 False,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)

    # Created before the graph is finalized below.
    saver = tf.train.Saver()

    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        # Restore the controller. The None check happens before the path is
        # joined; joining first made the check dead and crashed on None.
        restore_root = kwargs.get("restore_dir")
        if restore_root is None:
            print(">>>no restore_dir given, evaluating without a checkpoint")
        else:
            restore_dir = osp.join(restore_root, "model")
            print('Restore path : ', restore_dir)
            checkpoint = tf.train.get_checkpoint_state(restore_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                saver.restore(U.get_session(),
                              checkpoint.model_checkpoint_path)
                print("checkpoint loaded:", checkpoint.model_checkpoint_path)
                # The text after the last "-" of the checkpoint path is
                # parsed as the global step.
                global_t = int(checkpoint.model_checkpoint_path.split("-")[-1])
                print(">>> global step set:", global_t)
            else:
                print(">>>no checkpoint file found")

        eval_episode_rewards = []
        eval_episode_success = []

        logdir = logger.get_dir()
        assert logdir is not None
        _ensure_dir(osp.join(logdir, 'vis'))
        _ensure_dir(osp.join(logdir, 'cam'))

        vid_writer = imageio.get_writer(osp.join(logdir, 'vis/0.mp4'), fps=10)
        cam_writer = None
        try:
            cam_writer = imageio.get_writer(osp.join(logdir, 'cam/0.mp4'),
                                            fps=10)

            for i in range(100):
                print("Evaluating:%d" % (i + 1))
                eval_episode_reward = 0.
                eval_obs = eval_env.reset()
                eval_done = False

                while not eval_done:
                    eval_action, _ = agent.pi(eval_obs,
                                              apply_noise=False,
                                              compute_Q=True)
                    # scale for execution in env (as far as DDPG is
                    # concerned, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        max_action * eval_action)

                    if render_eval:
                        eval_env.render(writer=vid_writer)

                        # Draw the bounding box on the camera image.
                        # NOTE(review): 63 presumably is the last valid pixel
                        # index of a 64x64 image - confirm against the env.
                        box = np.minimum(eval_env.last_box,
                                         63).astype(np.int32)

                        # Work on a copy: eval_obs is fed to agent.pi on the
                        # next step, so drawing in place would change what
                        # the policy sees whenever rendering is enabled.
                        img = eval_obs[1].copy()
                        img[box[2], box[0]:box[1], :] = 0
                        img[box[3], box[0]:box[1], :] = 0
                        img[box[2]:box[3], box[0], :] = 0
                        img[box[2]:box[3], box[1], :] = 0
                        img = (img * 255.0).astype(np.uint8)

                        cam_writer.append_data(img)

                    eval_episode_reward += eval_r

                print("episode reward::%f" % eval_episode_reward)

                eval_episode_rewards.append(eval_episode_reward)
                eval_episode_success.append(
                    eval_info["done"] == "goal reached")

            print("episode reward - mean:%.4f, var:%.4f, success:%.4f" %
                  (np.mean(eval_episode_rewards),
                   np.var(eval_episode_rewards),
                   np.mean(eval_episode_success)))
        finally:
            # Close the video files even if an episode raises.
            if cam_writer is not None:
                cam_writer.close()
            vid_writer.close()
Exemplo n.º 4
0
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Build envs, noise, networks and an optional skill set, then train.

    Args:
        env_id: Gym environment id used for the training environment.
        seed: Base seed; each MPI rank adds ``1000000 * rank`` to it.
        noise_type: Comma-separated noise specs: ``none``,
            ``adaptive-param_<stddev>``, ``normal_<stddev>``, ``ou_<stddev>``,
            ``pepsnorm_<stddev>_<epsilon>`` (requires a skill set) or
            ``epsnorm_<stddev>_<epsilon>``. Entries are matched by substring.
        layer_norm: Passed to the actor and critic constructors.
        evaluation: When truthy, rank 0 builds an evaluation environment from
            ``kwargs['eval_env_id']`` (falling back to ``env_id``) and removes
            that key from ``kwargs``.
        **kwargs: Must contain ``skillset`` and ``newarch``; ``newcritic`` is
            read when ``newarch`` is falsy. Everything left in ``kwargs`` is
            forwarded to ``training.train``.

    Raises:
        RuntimeError: If ``noise_type`` contains an unrecognised entry, or if
            ``pepsnorm`` noise is requested without a skill set.
    """
    # Configure things: only rank 0 keeps its logger enabled.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)

    logger.info("Env info")
    logger.info(env.__doc__)
    logger.info("-" * 20)
    gym.logger.setLevel(logging.WARN)

    if evaluation and rank == 0:
        eval_env_id = kwargs['eval_env_id'] or env_id
        eval_env = gym.make(eval_env_id)
        # The id has been consumed; drop it before kwargs is forwarded.
        del kwargs['eval_env_id']
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None

    tf.reset_default_graph()

    # HACK: with a skill set the action vector is the skill selector plus the
    # skill parameters instead of the env's own action vector.
    # Bound up front so later branches never hit an unbound name.
    my_skill_set = None
    if kwargs['skillset']:
        skillset_file = __import__("HER.skills.%s" % kwargs['skillset'],
                                   fromlist=[''])
        my_skill_set = SkillSet(skillset_file.skillset)
        # NOTE(review): this reads `.params` while the pepsnorm branch below
        # reads `.num_params` - confirm both attributes exist on SkillSet.
        nb_actions = my_skill_set.params + my_skill_set.len
    else:
        nb_actions = env.action_space.shape[-1]

    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'pepsnorm' in current_noise_type:
            # Must be tested before 'epsnorm': 'epsnorm' is a substring of
            # 'pepsnorm', so the other order made this branch unreachable.
            if my_skill_set is None:
                raise RuntimeError(
                    'noise type "{}" requires a skillset'.format(
                        current_noise_type))
            _, stddev, epsilon = current_noise_type.split('_')
            action_noise = EpsilonNormalParameterizedActionNoise(
                mu=np.zeros(my_skill_set.num_params),
                sigma=float(stddev) * np.ones(my_skill_set.num_params),
                epsilon=float(epsilon),
                discrete_actions_dim=my_skill_set.len)
        elif 'epsnorm' in current_noise_type:
            _, stddev, epsilon = current_noise_type.split('_')
            action_noise = EpsilonNormalActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions),
                epsilon=float(epsilon))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6),
                    action_shape=(nb_actions, ),
                    observation_shape=env.observation_space.shape)
    if kwargs['newarch']:
        critic = Critic(layer_norm=layer_norm, hidden_unit_list=[400, 300])
    elif kwargs['newcritic']:
        critic = NewCritic(layer_norm=layer_norm)
    else:
        critic = Critic(layer_norm=layer_norm)

    # The number of discrete outputs comes from the env when no skill set is
    # loaded, otherwise from the skill set; the rest is continuous.
    if my_skill_set is None:
        discrete_action_size = env.env.discrete_action_size
    else:
        discrete_action_size = my_skill_set.len
    actor_kwargs = dict(discrete_action_size=discrete_action_size,
                        cts_action_size=nb_actions - discrete_action_size,
                        layer_norm=layer_norm)
    if kwargs['newarch']:
        actor_kwargs['hidden_unit_list'] = [400, 300]
    actor = Actor(**actor_kwargs)

    # Seed everything to make things reproducible; the per-rank offset gives
    # each MPI worker a different seed.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))

    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    start_time = time.time()
    try:
        training.train(env=env,
                       eval_env=eval_env,
                       param_noise=param_noise,
                       action_noise=action_noise,
                       actor=actor,
                       critic=critic,
                       memory=memory,
                       my_skill_set=my_skill_set,
                       **kwargs)
    finally:
        # Close the environments even when training raises.
        env.close()
        if eval_env is not None:
            eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_episodes,
          batch_size,
          memory,
          tau=0.05,
          eval_env=None,
          param_noise_adaption_interval=50,
          **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high

    if "dologging" in kwargs:
        dologging = kwargs["dologging"]
    else:
        dologging = True

    if "tf_sum_logging" in kwargs:
        tf_sum_logging = kwargs["tf_sum_logging"]
    else:
        tf_sum_logging = False

    if "invert_grad" in kwargs:
        invert_grad = kwargs["invert_grad"]
    else:
        invert_grad = False

    if "actor_reg" in kwargs:
        actor_reg = kwargs["actor_reg"]
    else:
        actor_reg = False

    if dologging:
        logger.debug(
            'scaling actions by {} before executing in env'.format(max_action))

    if kwargs['look_ahead']:
        look_ahead = True
        look_ahead_planner = Planning_with_memories(
            skillset=kwargs['my_skill_set'],
            env=env,
            num_samples=kwargs['num_samples'])
        exploration = LinearSchedule(schedule_timesteps=int(nb_epochs *
                                                            nb_epoch_cycles),
                                     initial_p=1.0,
                                     final_p=kwargs['exploration_final_eps'])
    else:
        look_ahead = False

    if kwargs['skillset']:
        action_shape = (kwargs['my_skill_set'].len +
                        kwargs['my_skill_set'].num_params, )
    else:
        action_shape = env.action_space.shape

    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 action_shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale,
                 inverting_grad=invert_grad,
                 actor_reg=actor_reg)

    if dologging and MPI.COMM_WORLD.Get_rank() == 0:
        logger.debug('Using agent with the following configuration:')
        logger.debug(str(agent.__dict__.items()))

    # should have saver for all thread to restore. But dump only using 1 saver
    saver = tf.train.Saver(keep_checkpoint_every_n_hours=2,
                           max_to_keep=20,
                           save_relative_paths=True)
    save_freq = kwargs["save_freq"]

    # step = 0
    global_t = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)

    ## get the session with the current graph => identical graph is used for each session
    with U.single_threaded_session() as sess:
        # Set summary saver
        if dologging and tf_sum_logging and rank == 0:
            tf.summary.histogram("actor_grads", agent.actor_grads)
            tf.summary.histogram("critic_grads", agent.critic_grads)
            actor_trainable_vars = actor.trainable_vars
            for var in actor_trainable_vars:
                tf.summary.histogram(var.name, var)
            critic_trainable_vars = critic.trainable_vars
            for var in critic_trainable_vars:
                tf.summary.histogram(var.name, var)

            tf.summary.histogram("actions_out", agent.actor_tf)
            tf.summary.histogram("critic_out", agent.critic_tf)
            tf.summary.histogram("target_Q", agent.target_Q)

            summary_var = tf.summary.merge_all()
            writer_t = tf.summary.FileWriter(
                osp.join(logger.get_dir(), 'train'), sess.graph)
        else:
            summary_var = tf.no_op()

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        ## restore
        if kwargs['skillset']:
            ## restore skills
            my_skill_set = kwargs['my_skill_set']
            my_skill_set.restore_skillset(sess=sess)
        ## restore current controller
        if kwargs["restore_dir"] is not None:
            restore_dir = osp.join(kwargs["restore_dir"], "model")
            if (restore_dir is not None) and rank == 0:
                print('Restore path : ', restore_dir)
                model_checkpoint_path = read_checkpoint_local(restore_dir)
                if model_checkpoint_path:
                    saver.restore(U.get_session(), model_checkpoint_path)
                    logger.info("checkpoint loaded:" +
                                str(model_checkpoint_path))
                    tokens = model_checkpoint_path.split("-")[-1]
                    # set global step
                    global_t = int(tokens)
                    print(">>> global step set:", global_t)

        agent.reset()
        obs = env.reset()

        # maintained across epochs
        episodes = 0
        t = 0
        start_time = time.time()

        # creating vars. this is done to keep the syntax for deleting the list simple a[:] = []
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_actions = []
        epoch_actor_losses = []
        epoch_critic_losses = []
        if param_noise is not None:
            epoch_adaptive_distances = []

        eval_episode_rewards = []
        eval_episode_success = []

        # for each episode
        done = False
        episode_reward = 0.
        episode_step = 0

        ## containers for hierarchical hindsight
        if kwargs["her"]:
            logger.debug("-" * 50 + '\nWill create HER\n' + "-" * 50)
            # per episode
            states, pactions, sub_states = [], [], []

        print("Ready to go!")
        for epoch in range(global_t, nb_epochs):

            # stat containers
            epoch_episodes = 0.
            epoch_start_time = time.time()

            epoch_episode_rewards[:] = []
            epoch_episode_steps[:] = []
            epoch_actions[:] = [
            ]  # action mean: don't know if this indicates anything
            epoch_actor_losses[:] = []
            epoch_critic_losses[:] = []

            if param_noise is not None:
                epoch_adaptive_distances[:] = []

            eval_episode_rewards[:] = []
            eval_episode_success[:] = []

            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(
                        int(nb_rollout_steps / MPI.COMM_WORLD.Get_size())):
                    # print(rank, t_rollout)

                    # Predict next action.
                    # exploration check
                    if kwargs['look_ahead'] and (np.random.rand(
                    ) < exploration.value(epoch * nb_epoch_cycles + cycle)):
                        paction, planner_info = look_ahead_planner.create_plan(
                            obs)
                    else:
                        paction, _ = agent.pi(obs,
                                              apply_noise=True,
                                              compute_Q=True)

                    if (my_skill_set):
                        ## break actions into primitives and their params
                        primitives_prob = paction[:kwargs['my_skill_set'].len]
                        primitive_id = np.argmax(primitives_prob)

                        # print("skill chosen", primitive_id)
                        r = 0.
                        skill_obs = obs.copy()

                        if kwargs['her']:
                            curr_sub_states = [skill_obs.copy()]

                        for _ in range(kwargs['commit_for']):
                            action = my_skill_set.pi(
                                primitive_id=primitive_id,
                                obs=skill_obs.copy(),
                                primitive_params=paction[my_skill_set.len:])
                            # Execute next action.
                            if rank == 0 and render:
                                sleep(0.1)
                                env.render()
                            assert max_action.shape == action.shape
                            new_obs, skill_r, done, info = env.step(
                                max_action * action
                            )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                            r += skill_r

                            if kwargs['her']:
                                curr_sub_states.append(new_obs.copy())

                            skill_obs = new_obs
                            if done or my_skill_set.termination(
                                    new_obs,
                                    primitive_id,
                                    primitive_params=paction[my_skill_set.
                                                             len:]):
                                break

                        # assuming the skill is trained from different reward signal
                        r = skill_r

                    else:
                        action = paction
                        # Execute next action.
                        if rank == 0 and render:
                            env.render()
                        assert max_action.shape == action.shape
                        new_obs, r, done, info = env.step(
                            max_action * action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])

                    assert action.shape == env.action_space.shape

                    t += 1

                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(paction)
                    agent.store_transition(obs, paction, r, new_obs, done)

                    # storing info for hindsight
                    if kwargs['her']:
                        states.append(obs.copy())
                        pactions.append(paction.copy())
                        sub_states.append(curr_sub_states)

                    # print(planner_info['next_state'][:6], new_obs[:6])

                    obs = new_obs

                    if done:
                        # Episode done.
                        # update stats
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        epoch_episodes += 1
                        episodes += 1
                        # reinit
                        episode_reward = 0.
                        episode_step = 0
                        agent.reset()
                        obs = env.reset()

                        if kwargs["her"]:
                            # logger.info("-"*50 +'\nCreating HER\n' + "-"*50)

                            # create hindsight experience replay
                            if kwargs['skillset']:
                                her_states, her_rewards = env.apply_hierarchical_hindsight(
                                    states, pactions, new_obs.copy(),
                                    sub_states)
                            else:
                                her_states, her_rewards = env.apply_hindsight(
                                    states, pactions, new_obs.copy())

                            ## store her transitions: her_states: n+1, her_rewards: n
                            for her_i in range(len(her_states) - 2):
                                agent.store_transition(her_states[her_i],
                                                       pactions[her_i],
                                                       her_rewards[her_i],
                                                       her_states[her_i + 1],
                                                       False)
                            #store last transition
                            agent.store_transition(her_states[-2],
                                                   pactions[-1],
                                                   her_rewards[-1],
                                                   her_states[-1], True)

                            ## refresh the storage containers
                            states[:], pactions[:] = [], []
                            if kwargs['skillset']:
                                sub_states[:] = []

                # print(rank, "Training!")
                # Train.
                for t_train in range(nb_train_steps):
                    # print(rank, t_train)
                    # Adapt param noise, if necessary.
                    if (memory.nb_entries >= batch_size) and (
                            t % param_noise_adaption_interval
                            == 0) and (param_noise is not None):
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al, current_summary = agent.train(summary_var)
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                    if dologging and tf_sum_logging and rank == 0:
                        writer_t.add_summary(
                            current_summary,
                            epoch * nb_epoch_cycles * nb_train_steps +
                            cycle * nb_train_steps + t_train)

            # print("Evaluating!")
            # Evaluate after training is done.
            if (eval_env is not None) and rank == 0:
                for _ in range(nb_eval_episodes):
                    eval_episode_reward = 0.
                    eval_obs = eval_env.reset()
                    eval_obs_start = eval_obs.copy()
                    eval_done = False
                    while (not eval_done):
                        eval_paction, _ = agent.pi(eval_obs,
                                                   apply_noise=False,
                                                   compute_Q=False)

                        if (kwargs['skillset']):
                            ## break actions into primitives and their params
                            eval_primitives_prob = eval_paction[:kwargs[
                                'my_skill_set'].len]
                            eval_primitive_id = np.argmax(eval_primitives_prob)

                            eval_r = 0.
                            eval_skill_obs = eval_obs.copy()
                            for _ in range(kwargs['commit_for']):
                                eval_action = my_skill_set.pi(
                                    primitive_id=eval_primitive_id,
                                    obs=eval_skill_obs.copy(),
                                    primitive_params=eval_paction[my_skill_set.
                                                                  len:])

                                eval_new_obs, eval_skill_r, eval_done, eval_info = eval_env.step(
                                    max_action * eval_action
                                )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])

                                if render_eval:
                                    eval_env.render()

                                eval_r += eval_skill_r
                                # check for skill termination or episode termination
                                eval_terminate_skill = my_skill_set.termination(
                                    eval_new_obs,
                                    eval_primitive_id,
                                    primitive_params=eval_paction[my_skill_set.
                                                                  len:])
                                if eval_done or eval_terminate_skill:
                                    break

                                eval_skill_obs = eval_new_obs

                            # hack assuming the skills are trained from diff reward signal
                            eval_r = eval_skill_r

                        else:
                            eval_action, _ = eval_paction, eval_pq
                            eval_new_obs, eval_r, eval_done, eval_info = eval_env.step(
                                max_action * eval_action)

                        eval_episode_reward += eval_r
                        eval_obs = eval_new_obs

                    eval_episode_rewards.append(eval_episode_reward)
                    eval_episode_rewards_history.append(eval_episode_reward)
                    eval_episode_success.append(
                        eval_info["done"] == "goal reached")
                    if (eval_info["done"] == "goal reached"):
                        logger.info(
                            "success, training epoch:%d,starting config:" %
                            epoch, eval_obs_start, 'final state', eval_obs)

            if dologging and rank == 0:
                print("Logging!")
                # Log stats.
                epoch_train_duration = time.time() - epoch_start_time
                duration = time.time() - start_time
                stats = agent.get_stats()
                combined_stats = {}
                for key in sorted(stats.keys()):
                    combined_stats[key] = normal_mean(stats[key])

                # Rollout statistics.
                combined_stats['rollout/return'] = normal_mean(
                    epoch_episode_rewards)
                if len(episode_rewards_history) > 0:
                    combined_stats['rollout/return_history'] = normal_mean(
                        np.mean(episode_rewards_history))
                else:
                    combined_stats['rollout/return_history'] = 0.
                combined_stats['rollout/episode_steps'] = normal_mean(
                    epoch_episode_steps)
                combined_stats['rollout/episodes'] = np.sum(epoch_episodes)
                combined_stats['rollout/actions_mean'] = normal_mean(
                    epoch_actions)
                combined_stats['rollout/actions_std'] = normal_std(
                    epoch_actions)

                # Train statistics.
                combined_stats['train/loss_actor'] = normal_mean(
                    epoch_actor_losses)
                combined_stats['train/loss_critic'] = normal_mean(
                    epoch_critic_losses)
                if param_noise is not None:
                    combined_stats['train/param_noise_distance'] = normal_mean(
                        epoch_adaptive_distances)

                if kwargs['look_ahead']:
                    combined_stats['train/exploration'] = exploration.value(
                        epoch * nb_epoch_cycles + cycle)

                # Evaluation statistics.
                if eval_env is not None:
                    combined_stats['eval/return'] = normal_mean(
                        eval_episode_rewards)
                    combined_stats['eval/success'] = normal_mean(
                        eval_episode_success)
                    if len(eval_episode_rewards_history) > 0:
                        combined_stats['eval/return_history'] = normal_mean(
                            np.mean(eval_episode_rewards_history))
                    else:
                        combined_stats['eval/return_history'] = 0.
                    combined_stats['eval/episodes'] = normal_mean(
                        len(eval_episode_rewards))

                # Total statistics.
                combined_stats['total/duration'] = normal_mean(duration)
                combined_stats['total/rollout_per_second'] = normal_mean(
                    float(t) / float(duration))
                combined_stats['total/episodes'] = normal_mean(episodes)
                combined_stats['total/epochs'] = epoch + 1
                combined_stats['total/steps'] = t

                for key in sorted(combined_stats.keys()):
                    logger.record_tabular(key, combined_stats[key])
                logger.dump_tabular()
                logger.info('')
                logdir = logger.get_dir()

                # if rank == 0 and logdir:
                #     print("Dumping progress!")
                #     if hasattr(env, 'get_state'):
                #         with open(osp.join(logdir, 'env_state.pkl'), 'wb') as f:
                #             pickle.dump(env.get_state(), f)
                #     if eval_env and hasattr(eval_env, 'get_state'):
                #         with open(osp.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                #             pickle.dump(eval_env.get_state(), f)

                ## save tf model
                if rank == 0 and (epoch + 1) % save_freq == 0:
                    print("Saving the model!")
                    os.makedirs(osp.join(logdir, "model"), exist_ok=True)
                    saver.save(U.get_session(),
                               logdir + "/model/ddpg",
                               global_step=epoch)
Exemplo n.º 6
0
def run(env_id, seed, noise_type, layer_norm, evaluation, memory_size, factor,
        **kwargs):
    """Assemble envs, exploration noise, replay memory and networks, then call ``testing.test``.

    Args:
        env_id: Gym environment id used for both the training and (optional)
            evaluation environment.
        seed: Base random seed; applied to the global RNGs and to ``env``.
        noise_type: Comma-separated noise specs. Each entry is ``none`` or
            ``<kind>_<stddev>`` where ``<kind>`` contains ``adaptive-param``,
            ``normal`` or ``ou``.
        layer_norm: Forwarded to the actor and critic constructors.
        evaluation: When truthy, a second environment is created for
            evaluation.
        memory_size: Capacity passed to ``Memory`` as ``limit``.
        factor: For tuple (image) observations, selects
            ``FactoredDepthActor`` over ``DepthActor``.
        **kwargs: Must contain ``"dologging"``; everything is forwarded to
            ``testing.test``.

    Raises:
        RuntimeError: If a noise spec is not recognised.
        KeyError: If ``"dologging"`` is missing from ``kwargs``.
    """
    # This launcher is single-process: the rank is pinned to 0, so every
    # rank check below takes the "rank 0" path.
    rank = 0
    if rank != 0:
        logger.set_level(logger.DISABLED)

    dologging = kwargs["dologging"]

    # Environments.
    env = gym.make(env_id)
    gym.logger.setLevel(logging.WARN)
    eval_env = gym.make(env_id) if (evaluation and rank == 0) else None

    # Exploration noise: later entries of the same family override earlier ones.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for raw_spec in noise_type.split(','):
        spec = raw_spec.strip()
        if spec == 'none':
            continue
        if 'adaptive-param' in spec:
            _, stddev = spec.split('_')
            sigma = float(stddev)
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=sigma,
                                                 desired_action_stddev=sigma)
        elif 'normal' in spec:
            _, stddev = spec.split('_')
            action_noise = NormalActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in spec:
            _, stddev = spec.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(spec))

    # Components.
    single_train = False

    # An observation space with no (or an empty) ``shape`` is treated as a
    # tuple space carrying an image; give it a list of per-component shapes.
    ospace = env.observation_space
    has_image = not getattr(ospace, 'shape', None)
    if has_image:
        assert isinstance(env.observation_space, gym.spaces.Tuple)
        env.observation_space.shape = [
            sub_space.shape for sub_space in env.observation_space.spaces
        ]

    memory = None
    if rank == 0 or not single_train:
        memory = Memory(limit=memory_size,
                        action_shape=env.action_space.shape,
                        observation_shape=env.observation_space.shape)

    if not has_image:
        critic = Critic(layer_norm=layer_norm)
        actor = Actor(nb_actions, layer_norm=layer_norm)
    else:
        ignore = False  # flip to use the depth-ignoring networks
        if ignore:
            critic = IgnoreDepthCritic(layer_norm=layer_norm)
            actor = IgnoreDepthActor(nb_actions, layer_norm=layer_norm)
        else:
            critic = DepthCritic(layer_norm=layer_norm)
            actor_cls = FactoredDepthActor if factor else DepthActor
            actor = actor_cls(nb_actions, layer_norm=layer_norm)

    # Seeding. The evaluation env always gets the fixed seed 6.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(6)

    # Only rank 0 measures and reports the wall-clock runtime.
    if rank == 0:
        start_time = time.time()

    testing.test(env=env,
                 eval_env=eval_env,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 actor=actor,
                 critic=critic,
                 memory=memory,
                 **kwargs)

    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          additional_critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.05,
          eval_env=None,
          param_noise_adaption_interval=50,
          nb_eval_episodes=20,
          **kwargs):
    """Run the CDQ training loop: rollouts, optional HER relabelling, updates, evaluation, logging.

    Every epoch runs ``nb_epoch_cycles`` cycles. A cycle collects
    ``int(nb_rollout_steps / MPI.COMM_WORLD.Get_size())`` environment steps
    and then performs ``nb_train_steps`` agent updates. After the cycles,
    rank 0 optionally evaluates on ``eval_env``, logs tabular statistics and
    saves a checkpoint every ``save_freq`` epochs.

    Args:
        env: Training environment. Its action space must be symmetric
            (``abs(low) == high``); agent actions are multiplied by
            ``action_space.high`` before ``env.step``. When HER is enabled,
            ``env.env.apply_hindsight`` must exist.
        nb_epochs: Exclusive upper bound of the epoch loop. The loop starts
            at the step number parsed from a restored checkpoint (0 if none).
        nb_epoch_cycles: Rollout/train cycles per epoch.
        render_eval: Render ``eval_env`` after each evaluation step.
        reward_scale, param_noise, actor, critic, additional_critic,
        normalize_returns, normalize_observations, critic_l2_reg, actor_lr,
        critic_lr, action_noise, popart, gamma, clip_norm, tau: Forwarded to
            the ``CDQ`` agent constructor.
        render: Render ``env`` during rollouts (rank 0 only).
        nb_train_steps: Agent updates per cycle.
        nb_rollout_steps: Environment steps per cycle, divided across MPI
            workers.
        nb_eval_steps: Not used by this function.
        batch_size: Forwarded to the agent; also the minimum number of memory
            entries before parameter-noise adaption is attempted.
        memory: Replay memory handed to the agent; ``memory.nb_entries`` is
            read here.
        eval_env: Optional evaluation environment. Its step ``info`` dict is
            expected to carry a ``"done"`` entry that equals
            ``"goal reached"`` on success.
        param_noise_adaption_interval: Adaption is attempted on train steps
            where the global step counter is a multiple of this value.
        nb_eval_episodes: Evaluation episodes per epoch.
        **kwargs: Optional ``dologging`` (default True), ``tf_sum_logging``,
            ``invert_grad`` and ``actor_reg`` (default False). Required:
            ``save_freq``, ``restore_dir`` (may be None) and ``her``.

    Raises:
        KeyError: If ``save_freq``, ``restore_dir`` or ``her`` is missing
            from ``kwargs``.
    """
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high

    # Optional switches with their defaults.
    dologging = kwargs.get("dologging", True)
    tf_sum_logging = kwargs.get("tf_sum_logging", False)
    invert_grad = kwargs.get("invert_grad", False)
    actor_reg = kwargs.get("actor_reg", False)

    if dologging:
        logger.info(
            'scaling actions by {} before executing in env'.format(max_action))
    agent = CDQ(actor,
                critic,
                additional_critic,
                memory,
                env.observation_space.shape,
                env.action_space.shape,
                gamma=gamma,
                tau=tau,
                normalize_returns=normalize_returns,
                normalize_observations=normalize_observations,
                batch_size=batch_size,
                action_noise=action_noise,
                param_noise=param_noise,
                critic_l2_reg=critic_l2_reg,
                actor_lr=actor_lr,
                critic_lr=critic_lr,
                enable_popart=popart,
                clip_norm=clip_norm,
                reward_scale=reward_scale,
                inverting_grad=invert_grad,
                actor_reg=actor_reg)
    if dologging:
        logger.debug('Using agent with the following configuration:')
        logger.debug(str(agent.__dict__.items()))

    # Every worker builds a saver (it is also needed for restoring below).
    # NOTE(review): Get_rank() is expected to be non-negative, so the else
    # branch looks unreachable - confirm before relying on it.
    if rank != -1:
        saver = tf.train.Saver(keep_checkpoint_every_n_hours=2,
                               max_to_keep=5,
                               save_relative_paths=True)
        save_freq = kwargs["save_freq"]
    else:
        saver = None

    global_t = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)

    with U.single_threaded_session() as sess:

        # Set summary saver
        if dologging and tf_sum_logging and rank == 0:
            tf.summary.histogram("actor_grads", agent.actor_grads)
            tf.summary.histogram("critic_grads", agent.critic_grads)
            actor_trainable_vars = actor.trainable_vars
            for var in actor_trainable_vars:
                tf.summary.histogram(var.name, var)
            critic_trainable_vars = critic.trainable_vars
            for var in critic_trainable_vars:
                tf.summary.histogram(var.name, var)

            tf.summary.histogram("actions_out", agent.actor_tf)
            tf.summary.histogram("critic_out", agent.critic_tf)
            tf.summary.histogram("target_Q", agent.target_Q)

            summary_var = tf.summary.merge_all()
            writer_t = tf.summary.FileWriter(
                osp.join(logger.get_dir(), 'train'), sess.graph)
        else:
            # Placeholder op so agent.train() can be called uniformly.
            summary_var = tf.no_op()

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        ## restore
        if kwargs["restore_dir"] is not None:
            restore_dir = osp.join(kwargs["restore_dir"], "model")
            print('Restore path : ', restore_dir)
            model_checkpoint_path = read_checkpoint_local(restore_dir)
            if model_checkpoint_path:
                saver.restore(U.get_session(), model_checkpoint_path)
                print("checkpoint loaded:", model_checkpoint_path)
                logger.info("checkpoint loaded:" +
                            str(model_checkpoint_path))
                # The text after the last "-" in the checkpoint path is the
                # global step it was saved with (the epoch, see the save
                # call at the end of this function).
                global_t = int(model_checkpoint_path.split("-")[-1])
                print(">>> global step set:", global_t)

        agent.reset()
        obs = env.reset()

        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        # NOTE: these "epoch" containers are created once and never cleared,
        # so the statistics derived from them accumulate over all epochs.
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0

        ## containers for hindsight
        # Always bound so the rollout loop can never hit an unbound local;
        # they are only filled when HER is enabled.
        use_her = kwargs["her"]
        states, actions = [], []

        print("Ready to go!")
        for epoch in range(global_t, nb_epochs):

            # stat containers
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []

            eval_episode_rewards = []
            eval_qs = []
            eval_episode_success = []

            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(
                        int(nb_rollout_steps / MPI.COMM_WORLD.Get_size())):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                        sleep(0.1)
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)

                    ## storing info for hindsight
                    # Only track the trajectory when HER will consume it;
                    # otherwise the buffers would grow without bound.
                    if use_her:
                        states.append(obs.copy())
                        actions.append(action.copy())

                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        if use_her:
                            ## create hindsight experience replay
                            her_states, her_rewards = env.env.apply_hindsight(
                                states, actions, new_obs.copy())

                            ## store her transitions: her_states: n+1, her_rewards: n
                            for her_i in range(len(her_states) - 2):
                                agent.store_transition(her_states[her_i],
                                                       actions[her_i],
                                                       her_rewards[her_i],
                                                       her_states[her_i + 1],
                                                       False)
                            # The last relabelled transition is terminal.
                            agent.store_transition(her_states[-2], actions[-1],
                                                   her_rewards[-1],
                                                   her_states[-1], True)

                            ## refresh the storage containers
                            states, actions = [], []

                        agent.reset()
                        obs = env.reset()

                # Train.
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al, current_summary = agent.train(summary_var)
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                    if dologging and tf_sum_logging and rank == 0:
                        # Summary step index is unique across epochs and cycles.
                        writer_t.add_summary(
                            current_summary,
                            epoch * nb_epoch_cycles * nb_train_steps +
                            cycle * nb_train_steps + t_train)

            # Evaluate once per epoch, after all cycles.
            if (eval_env is not None) and rank == 0:
                for _ in range(nb_eval_episodes):
                    eval_episode_reward = 0.
                    eval_obs = eval_env.reset()
                    eval_obs_start = eval_obs.copy()
                    eval_done = False
                    while (not eval_done):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            sleep(0.1)
                            print("Render!")

                            eval_env.render()
                            print("rendered!")
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)

                    eval_episode_rewards.append(eval_episode_reward)
                    eval_episode_rewards_history.append(eval_episode_reward)
                    # Success is read from the info dict of the final step.
                    eval_episode_success.append(
                        eval_info["done"] == "goal reached")
                    if (eval_info["done"] == "goal reached"):
                        logger.info(
                            "success, training epoch:%d,starting config:" %
                            epoch, eval_obs_start, 'final state', eval_obs)

            if dologging and rank == 0:
                print("Logging!")
                # Log stats.
                duration = time.time() - start_time
                stats = agent.get_stats()
                combined_stats = {}
                for key in sorted(stats.keys()):
                    combined_stats[key] = normal_mean(stats[key])

                # Rollout statistics.
                combined_stats['rollout/return'] = normal_mean(
                    epoch_episode_rewards)
                if len(episode_rewards_history) > 0:
                    combined_stats['rollout/return_history'] = normal_mean(
                        np.mean(episode_rewards_history))
                else:
                    combined_stats['rollout/return_history'] = 0.
                combined_stats['rollout/episode_steps'] = normal_mean(
                    epoch_episode_steps)
                combined_stats['rollout/episodes'] = np.sum(epoch_episodes)
                combined_stats['rollout/actions_mean'] = normal_mean(
                    epoch_actions)
                combined_stats['rollout/actions_std'] = normal_std(
                    epoch_actions)
                combined_stats['rollout/Q_mean'] = normal_mean(epoch_qs)

                # Train statistics.
                combined_stats['train/loss_actor'] = normal_mean(
                    epoch_actor_losses)
                combined_stats['train/loss_critic'] = normal_mean(
                    epoch_critic_losses)
                combined_stats['train/param_noise_distance'] = normal_mean(
                    epoch_adaptive_distances)

                # Evaluation statistics.
                if eval_env is not None:
                    combined_stats['eval/return'] = normal_mean(
                        eval_episode_rewards)
                    combined_stats['eval/success'] = normal_mean(
                        eval_episode_success)
                    if len(eval_episode_rewards_history) > 0:
                        combined_stats['eval/return_history'] = normal_mean(
                            np.mean(eval_episode_rewards_history))
                    else:
                        combined_stats['eval/return_history'] = 0.
                    combined_stats['eval/Q'] = normal_mean(eval_qs)
                    combined_stats['eval/episodes'] = normal_mean(
                        len(eval_episode_rewards))

                # Total statistics.
                combined_stats['total/duration'] = normal_mean(duration)
                combined_stats['total/steps_per_second'] = normal_mean(
                    float(t) / float(duration))
                combined_stats['total/episodes'] = normal_mean(episodes)
                combined_stats['total/epochs'] = epoch + 1
                combined_stats['total/steps'] = t

                for key in sorted(combined_stats.keys()):
                    logger.record_tabular(key, combined_stats[key])
                logger.dump_tabular()
                logger.info('')
                logdir = logger.get_dir()
                if rank == 0 and logdir:
                    print("Dumping progress!")
                    if hasattr(env, 'get_state'):
                        with open(os.path.join(logdir, 'env_state.pkl'),
                                  'wb') as f:
                            pickle.dump(env.get_state(), f)
                    if eval_env and hasattr(eval_env, 'get_state'):
                        with open(os.path.join(logdir, 'eval_env_state.pkl'),
                                  'wb') as f:
                            pickle.dump(eval_env.get_state(), f)

                ## save tf model
                # The epoch is stored as the checkpoint's global step; the
                # restore path above reads it back from the file name.
                if rank == 0 and (epoch + 1) % save_freq == 0:
                    print("Saving the model!")
                    os.makedirs(osp.join(logdir, "model"), exist_ok=True)
                    saver.save(U.get_session(),
                               logdir + "/model/cdq",
                               global_step=epoch)
Exemplo n.º 8
0
def run(env_id, seed, evaluation, **kwargs):
    """Create the environments and Q-network, then launch ``training.train``.

    Args:
        env_id: Gym id of the training environment.
        seed: Seed applied to the global RNGs, ``env`` and ``eval_env``.
        evaluation: When truthy, an evaluation environment is created from
            ``kwargs['eval_env_id']`` (falling back to ``env_id`` when that
            value is falsy) and the key is removed from ``kwargs``.
        **kwargs: Must provide ``skillset``, ``lr``, ``num_timesteps``,
            ``batch_size``, ``save_freq``, ``prioritized_replay``, ``gamma``,
            ``log_dir``, ``num_eval_episodes``, ``render``, ``render_eval``
            and ``commit_for``; ``eval_env_id`` is additionally required when
            ``evaluation`` is truthy.

    Raises:
        KeyError: If one of the required ``kwargs`` entries is missing.
    """
    # Create envs.
    env = gym.make(env_id)

    logger.info("Env info")
    logger.info(env.__doc__)
    logger.info("-" * 20)
    gym.logger.setLevel(logging.WARN)

    if evaluation:
        # Use the dedicated evaluation id when given, else the training id.
        eval_env_id = kwargs['eval_env_id'] or env_id
        eval_env = gym.make(eval_env_id)
        # The id has been consumed; drop it from kwargs.
        del kwargs['eval_env_id']
    else:
        eval_env = None

    # Default to "no skill set" so the name is always bound when it is
    # handed to training.train below (previously a NameError without one).
    my_skill_set = None
    if kwargs['skillset']:
        skillset_file = __import__("HER.skills.%s" % kwargs['skillset'],
                                   fromlist=[''])
        my_skill_set = SkillSet(skillset_file.skillset)

    model = models.mlp([64])

    # Seed everything to make things reproducible.
    logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))

    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    start_time = time.time()

    training.train(
        env=env,
        eval_env=eval_env,
        q_func=model,
        lr=kwargs['lr'],
        max_timesteps=kwargs['num_timesteps'],
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.002,
        train_freq=1,
        batch_size=kwargs['batch_size'],
        print_freq=100,
        checkpoint_freq=kwargs['save_freq'],
        # Start learning once at least one batch (and no fewer than 50
        # transitions) has been collected.
        learning_starts=max(50, kwargs['batch_size']),
        target_network_update_freq=100,
        prioritized_replay=kwargs['prioritized_replay'],
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta0=0.4,
        prioritized_replay_beta_iters=None,
        prioritized_replay_eps=1e-6,
        param_noise=False,
        gamma=kwargs['gamma'],
        log_dir=kwargs['log_dir'],
        my_skill_set=my_skill_set,
        num_eval_episodes=kwargs['num_eval_episodes'],
        render=kwargs['render'],
        render_eval=kwargs['render_eval'],
        commit_for=kwargs['commit_for'],
    )

    env.close()
    if eval_env is not None:
        eval_env.close()

    logger.info('total runtime: {}s'.format(time.time() - start_time))
Exemplo n.º 9
0
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Build the environments, actor and critic, then run ``testing.test``.

    Args:
        env_id: Environment id passed to ``gym.make``.
        seed: Base seed for ``set_global_seeds`` and the environments.
        noise_type: Unused here; ``testing.test`` is always called with
            ``param_noise=None`` and ``action_noise=None``.
        layer_norm: Forwarded to the ``Actor`` and ``Critic`` constructors.
        evaluation: When truthy, a second environment is created and passed
            to ``testing.test`` as ``eval_env``.
        **kwargs: Must contain ``'skillset'`` (the name of a module under
            ``HER.skills``, or a falsy value for "no skill set"). All entries
            are forwarded unchanged to ``testing.test``.

    Raises:
        KeyError: If ``kwargs`` has no ``'skillset'`` entry.
    """
    # Single-process run: rank is fixed at 0, so the rank guards below never
    # disable logging or skip evaluation.
    rank = 0
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    eval_env = None
    try:
        gym.logger.setLevel(logging.WARN)

        if evaluation and rank == 0:
            eval_env = gym.make(env_id)

        tf.reset_default_graph()

        # Decide once whether a skill set is in use. Previously one check
        # used truthiness and a later one used `is None`, so a falsy but
        # non-None value (e.g. "") reached code that read an unbound
        # `my_skill_set`.
        skillset_name = kwargs['skillset']
        if skillset_name:
            skillset_file = __import__("HER.skills.%s" % skillset_name,
                                       fromlist=[''])
            my_skill_set = SkillSet(skillset_file.skillset)
            nb_actions = my_skill_set.params + my_skill_set.len
            discrete_action_size = my_skill_set.len
        else:
            my_skill_set = None
            nb_actions = env.action_space.shape[-1]
            discrete_action_size = env.env.discrete_action_size

        # Configure components.
        memory = Memory(limit=int(1e6),
                        action_shape=env.action_space.shape,
                        observation_shape=env.observation_space.shape)
        critic = Critic(layer_norm=layer_norm)
        # The actor output is split into a discrete part and a continuous
        # part that together make up nb_actions.
        actor = Actor(discrete_action_size=discrete_action_size,
                      cts_action_size=nb_actions - discrete_action_size,
                      layer_norm=layer_norm)

        # Seed everything to make things reproducible.
        seed = seed + 1000000 * rank
        logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                         logger.get_dir()))

        set_global_seeds(seed)
        env.seed(seed)
        if eval_env is not None:
            eval_env.seed(seed)

        start_time = time.time()

        testing.test(env=env,
                     eval_env=eval_env,
                     param_noise=None,
                     action_noise=None,
                     actor=actor,
                     critic=critic,
                     memory=memory,
                     my_skill_set=my_skill_set,
                     **kwargs)
    finally:
        # Release the environments even when setup or testing raises.
        env.close()
        if eval_env is not None:
            eval_env.close()

    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))