def __init__(self, policy_fn, agents, dims, logger, make_env, T, use_her, rollout_batch_size=1,
                 compute_Q=False, render=False, history_len=100):
        """Rollout worker generates experience by interacting with one or many environments.

        Args:
            make_env (function): a factory function that creates a new instance of the environment
                when called
            policy (object): the policy that is used to act
            dims (dict of ints): the dimensions for observations (o), goals (g), and actions (u)
            logger (object): the logger that is used by the rollout worker
            rollout_batch_size (int): the number of parallel rollouts that should be used
            exploit (boolean): whether or not to exploit, i.e. to act optimally according to the
                current policy without any exploration
            use_target_net (boolean): whether or not to use the target net for rollouts
            compute_Q (boolean): whether or not to compute the Q values alongside the actions
            noise_eps (float): scale of the additive Gaussian noise
            random_eps (float): probability of selecting a completely random action
            history_len (int): length of history for statistics smoothing
            render (boolean): whether or not to render the rollouts
        """
        # Store constructor arguments; self.T, self.dims, self.use_her, and
        # self.rollout_batch_size are all used further below.
        self.policy_fn = policy_fn
        self.agents = agents
        self.dims = dims
        self.logger = logger
        self.T = T
        self.use_her = use_her
        self.rollout_batch_size = rollout_batch_size
        self.compute_Q = compute_Q
        self.render = render
        self.history_len = history_len

        self.envs = [make_env() for _ in range(rollout_batch_size)]

        assert (np.abs(self.envs[0].action_space.low) == self.envs[0].action_space.high).all()  # we assume symmetric actions.
        self.max_action = self.envs[0].action_space.high
        logger.info('Scaling actions by {} before executing in env'.format(self.max_action))

        assert self.T > 0
        self.info_keys = [key.replace('info_', '') for key in dims.keys() if key.startswith('info_')]

        if self.use_her:
            self.success_history = deque(maxlen=history_len)
        self.reward_per_episode_history = deque(maxlen=history_len)

        self.Q_history = deque(maxlen=history_len)

        self.n_episodes = 0
        self.initial_o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)  # observations
        if self.use_her:
            self.g = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # goals
            self.initial_ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # achieved goals
        self.total_reward_this_episode = np.zeros((self.rollout_batch_size,), np.float32)

        self.reset_all(force_env_resets=True)
        self.clear_history()

        self.current_heatmap_prefix = None

        self.recording = False
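# Hedged usage sketch for the constructor above. The names RolloutWorker and
# make_fetch_env are assumptions for illustration; only the argument list itself
# comes from the snippet above.
def example_build_rollout_worker(policy_fn, agents, logger, make_fetch_env):
    dims = {'o': 10, 'g': 3, 'u': 4}        # hypothetical observation/goal/action sizes
    return RolloutWorker(                    # assumed class name for the __init__ above
        policy_fn=policy_fn, agents=agents, dims=dims, logger=logger,
        make_env=make_fetch_env,             # zero-argument factory returning a fresh env
        T=50, use_her=True, rollout_batch_size=2,
        compute_Q=False, render=False, history_len=100)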
def display_var_info(vars):
    from ddpg_curiosity_mc_her import logger
    count_params = 0
    for v in vars:
        name = v.name
        if "/Adam" in name or "beta1_power" in name or "beta2_power" in name:
            continue
        v_params = np.prod(v.shape.as_list())
        count_params += v_params
        if "/b:" in name or "/biases" in name:
            continue  # Wx+b, bias is not interesting to look at => count params, but not print
        logger.info("   %s%s %i params %s" %
                    (name, " " * (55 - len(name)), v_params, str(v.shape)))

    logger.info("Total model parameters: %0.2f million" %
                (count_params * 1e-6))
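# Hypothetical usage sketch for display_var_info above (TF1-style API assumed;
# passing tf.trainable_variables() is a common way to call such a helper and is
# not taken from this snippet; the helper itself still logs through the
# project's logger).
import tensorflow as tf

with tf.variable_scope('demo'):
    w = tf.get_variable('w', shape=[4, 8])   # 32 params, printed
    b = tf.get_variable('b', shape=[8])      # 8 params, counted but not printed ("/b:" filter)

display_var_info(tf.trainable_variables())   # logs per-variable rows and a 0.00-million total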
    def __init__(self, obs0, action, obs1, clip_norm, hidden, layers, comm):

        logger.info("Using Forward Dynamics")
        assert hidden is not None
        assert layers is not None

        with tf.variable_scope('forward_dynamics'):
            self.dynamics_scope = tf.get_variable_scope().name

            # Predict the next observation from the current observation and action.
            inputs = tf.concat(values=[obs0, action], axis=-1)
            next_state_tf = nn(inputs, [hidden] * layers + [obs1.shape[-1]])

        # loss functions: per-sample mean squared prediction error over observation dimensions
        self.per_sample_loss_tf = tf.expand_dims(tf.reduce_mean(tf.square(next_state_tf - obs1), axis=1), axis=1)
        self.mean_loss_tf = tf.reduce_mean(self.per_sample_loss_tf)

        self.dynamics_grads = U.flatgrad(self.mean_loss_tf, _vars(self.dynamics_scope), clip_norm=clip_norm)

        # optimizers
        self.dynamics_adam = MpiAdam(_vars(self.dynamics_scope), scale_grad_by_procs=False, comm=comm)
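# Standalone numerical sketch of the per-sample loss defined above: mean squared
# error between predicted and actual next observations, kept as one value per
# sample. (How exactly the repository turns this error into an intrinsic reward
# is not shown in this snippet; the numbers below are illustrative only.)
import numpy as np

predicted_next_obs = np.array([[0.1, 0.2], [1.0, 1.0]], dtype=np.float32)
actual_next_obs = np.array([[0.0, 0.2], [0.0, 0.0]], dtype=np.float32)

per_sample_loss = np.mean(np.square(predicted_next_obs - actual_next_obs), axis=1, keepdims=True)
print(per_sample_loss)   # [[0.005], [1.0]] -- the more surprising transition gets the larger error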
Example #4
def configure_replay_buffer(params):
    logger.info('Using Replay Buffer')

    sample_transitions = configure_her(params)
    input_dims = configure_dims(params)
    input_shapes = dims_to_shapes(input_dims)

    buffer_shapes = {
        key: (params['T'] if key != 'o' else params['T'] + 1, *input_shapes[key])
        for key in input_shapes.keys()
    }

    if params['use_her']:
        buffer_shapes['g'] = (buffer_shapes['g'][0], input_dims['g'])
        buffer_shapes['ag'] = (params['T'] + 1, input_dims['g'])
    else:
        buffer_shapes['r'] = (params['T'], 1)
        buffer_shapes['t'] = (params['T'], 1)

    buffer_size = (params['buffer_size'] //
                   params['rollout_batch_size']) * params['rollout_batch_size']
    return ReplayBuffer(buffer_shapes, buffer_size, params['T'],
                        sample_transitions, params['use_her'])
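# Worked example of the buffer_shapes logic above, with hypothetical dimensions
# (the T and shape values here are made up for illustration, not taken from any
# actual environment config).
T = 50
input_shapes = {'o': (10,), 'u': (4,), 'g': (3,), 'ag': (3,), 'info_is_success': (1,)}

buffer_shapes = {
    key: (T if key != 'o' else T + 1, *shape)
    for key, shape in input_shapes.items()
}
# HER case: goals are stored once per step, achieved goals for T + 1 steps.
buffer_shapes['g'] = (buffer_shapes['g'][0], 3)
buffer_shapes['ag'] = (T + 1, 3)

print(buffer_shapes['o'])   # (51, 10): one extra slot for the final observation
print(buffer_shapes['ag'])  # (51, 3)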
Example #5
def run(args):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # If we are supposed to divide gpu usage among a specific set of devices,
    # set this processes' device to the correct one.
    gpu_nums = args['split_gpu_usage_among_device_nums']
    if gpu_nums is not None:
        gpu_num_to_use = gpu_nums[rank % len(gpu_nums)]
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_num_to_use)

    # Seed everything to make things reproducible.
    rank_seed = args['seed'] + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, rank_seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(rank_seed)

    input_dims = configure_dims(args)

    # Configure the replay buffer.
    memory = configure_memory(args)

    with U.single_threaded_session() as sess:
        # Setup up DDPG Agents

        agents = create_agents(sess=sess, memory=memory, input_dims=input_dims, params=args)

        saver = tf.train.Saver()
        if args['restore_from_ckpt'] is not None:
            logger.info("Restoring agents from {}".format(args['restore_from_ckpt']))
            saver.restore(sess, args['restore_from_ckpt'])

        sess.graph.finalize()
        logger.log_graph_to_tensorboard(sess.graph)

        # Setup Rollout workers
        train_policy_fn = get_policy_fn(
            name=args['train_policy_fn'], agents=agents
        )
        eval_policy_fn = get_policy_fn(
            name=args['eval_policy_fn'], agents=agents
        )

        train_rollout_worker = configure_rollout_worker(
            role='train', policy_fn=train_policy_fn, agents=agents, dims=input_dims,
            seed=rank_seed, logger=logger, params=args
        )
        eval_rollout_worker = configure_rollout_worker(
            role='eval', policy_fn=eval_policy_fn, agents=agents, dims=input_dims,
            seed=rank_seed, logger=logger, params=args
        )

        # Begin main training loop
        if rank == 0:
            start_time = time.time()

        if args['do_demo_only'] is False:
            training.train(
                memory=memory, agents=agents, saver=saver, sess=sess,
                train_rollout_worker=train_rollout_worker, eval_rollout_worker=eval_rollout_worker,
                param_noise_adaption_interval=50, **args
            )
        else:
            demo.demo(agents=agents, eval_rollout_worker=eval_rollout_worker,
                      demo_video_recording_name=args["demo_video_recording_name"])

        train_rollout_worker.close()
        eval_rollout_worker.close()

        if rank == 0:
            logger.info('total runtime: {}s'.format(time.time() - start_time))
Example #6
def configure_ddpg_agent(sess, role, memory, input_dims, external_critic_fn,
                         params):
    input_shapes = dims_to_shapes(input_dims)
    observation_shape = input_shapes['o']
    goal_shape = input_shapes['g'] if params['use_her'] else None
    action_shape = input_shapes['u']
    action_dim = input_dims['u']

    if role == 'exploit':
        comm = MPI.COMM_WORLD
        use_goals = True if params['use_her'] else False
        use_intrinsic_reward = False
        dynamics_loss_mapper = None
        mix_external_critic_with_internal = None
        external_critic_fn = None

    elif role == 'explore':
        comm = params['explore_comm']
        assert comm != MPI.COMM_WORLD
        use_intrinsic_reward = True
        dynamics_loss_mapper = params['dynamics_loss_mapper']
        mix_external_critic_with_internal = params[
            'mix_extrinsic_intrinsic_objectives_for_explore']
        if mix_external_critic_with_internal is not None:
            assert len(mix_external_critic_with_internal) == 2
            assert external_critic_fn is not None
            use_goals = True if params['use_her'] else False
        else:
            use_goals = False
            external_critic_fn = None

    else:
        raise ValueError('role must either be \'exploit\' or \'explore\'.')

    agent = DDPG(
        sess=sess,
        scope=role + '_ddpg',
        layer_norm=params[role + '_use_layer_norm'],
        nb_actions=action_dim,
        memory=memory,
        observation_shape=observation_shape,
        action_shape=action_shape,
        goal_shape=goal_shape,
        param_noise=params[role + '_param_noise'],
        action_noise=params[role + '_action_noise'],
        gamma=params[role + '_gamma'],
        tau=params[role + '_polyak_tau'],
        normalize_returns=params[role + '_normalize_returns'],
        enable_popart=params[role + '_popart'],
        normalize_observations=params['agents_normalize_observations'],
        normalize_goals=params['agents_normalize_goals'],
        batch_size=params['batch_size'],
        observation_range=(-5., 5.),
        goal_range=(-200, 200),
        action_range=(-1., 1.),
        return_range=(-np.inf, np.inf),
        critic_l2_reg=params[role + '_critic_l2_reg'],
        actor_lr=params[role + '_pi_lr'],
        critic_lr=params[role + '_Q_lr'],
        clip_norm=None,
        reward_scale=1.,
        use_intrinsic_reward=use_intrinsic_reward,
        use_goals=use_goals,
        agent_hidden_layer_sizes=[params[role + '_hidden']] *
        params[role + '_layers'],
        dynamics_hidden=params['dynamics_hidden'],
        dynamics_layers=params['dynamics_layers'],
        dynamics_normalize_observations=params[
            'dynamics_normalize_observations'],
        dynamics_loss_mapper=dynamics_loss_mapper,
        mix_external_critic_with_internal=mix_external_critic_with_internal,
        external_critic_fn=external_critic_fn,
        intrinsic_motivation_method=params['intrinsic_motivation_method'],
        comm=comm)
    logger.info('Using ' + role + ' agent.')
    # logger.info('Using ' + role + ' agent with the following configuration:')
    # logger.info(str(agent.__dict__.items()))

    return agent
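# Hedged sketch: building one exploit and one explore agent with the function
# above, assuming mix_extrinsic_intrinsic_objectives_for_explore is disabled so
# that no external critic needs to be passed in. build_agents is an illustrative
# name, not the repository's actual create_agents implementation.
def build_agents(sess, memory, input_dims, params):
    agents = {}
    for role in ('exploit', 'explore'):
        agents[role] = configure_ddpg_agent(sess, role, memory, input_dims,
                                            external_critic_fn=None, params=params)
    return agents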
Example #7
def log_params(params, logger=logger):
    for key in sorted(params.keys()):
        logger.info('{}: {}'.format(key, params[key]))
def train(memory, agents, saver, sess, train_rollout_worker,
          eval_rollout_worker, n_epochs, n_cycles, n_batches, batch_size,
          rollout_batches_per_cycle, n_test_rollouts, heatmaps,
          dynamics_loss_mapper, do_evaluation, save_at_score, stop_at_score,
          save_checkpoints_at, **kwargs):

    rank = MPI.COMM_WORLD.Get_rank()

    logger.info("Training...")

    batch = 0

    should_quit_early = False

    for epoch in range(1, n_epochs + 1):
        epoch_start_time = datetime.now()

        if dynamics_loss_mapper is not None:
            dynamics_loss_mapper.set_record_write(
                prefix='epoch{}_rank{}'.format(epoch, rank))

        # train
        train_rollout_worker.clear_history()
        for cycle_index in range(n_cycles):
            for _ in range(rollout_batches_per_cycle):

                episode = train_rollout_worker.generate_rollouts(
                    render_override=False,
                    heatmap_prefix='epoch{}_rank{}'.format(epoch, rank)
                    if heatmaps else None)

                memory.store_episode(episode)
                for agent in agents.values():
                    agent.update_normalizers(episode)

            param_noise_distances = {}

            # Adapt param noise.
            if memory.nb_entries >= batch_size:
                for role, agent in agents.items():
                    param_noise_distances[role] = agent.adapt_param_noise()

            for train_step in range(n_batches):
                critic_losses = {}
                actor_losses = {}
                for role, agent in agents.items():
                    critic_losses[role], actor_losses[role] = agent.train()
                for agent in agents.values():
                    agent.update_target_net()

                batch += 1

        if heatmaps:
            train_rollout_worker.flush_env_location_records()
            MPI.COMM_WORLD.Barrier()
            logger.info("Creating heatmap...")
            if rank == 0:
                heatmap_save_path = generate_3d_fetch_stack_heatmap_from_npy_records(
                    working_dir=os.path.join(logger.get_dir(), 'heatmaps'),
                    file_prefix='epoch{}'.format(epoch),
                    delete_records=True)
                logger.info("Heatmap saved to {}".format(heatmap_save_path))

        # test
        if do_evaluation:
            eval_rollout_worker.clear_history()
            for _ in range(n_test_rollouts):
                eval_rollout_worker.generate_rollouts()

            current_score = mpi_average(eval_rollout_worker.current_score())

            if current_score >= save_at_score and rank == 0:
                save_path = os.path.join(logger.get_dir(), 'saved_model',
                                         'model.ckpt')
                logger.info("Saving models to {}".format(save_path))
                saver.save(sess, save_path)

            if save_checkpoints_at is not None:
                for score in save_checkpoints_at.copy():
                    if current_score >= score and rank == 0:
                        logger.info("Reached checkpoint for {}".format(score))
                        save_path = os.path.join(
                            logger.get_dir(), 'saved_model',
                            'model_score_{}.ckpt'.format(
                                str(score).replace(".", "p")))
                        logger.info("Saving models to {}".format(save_path))
                        saver.save(sess, save_path)
                        save_checkpoints_at.remove(score)

            if stop_at_score is not None and current_score >= stop_at_score:
                logger.info("Stopping score of {} reached. Quitting...".format(
                    stop_at_score))
                should_quit_early = True

        # record logs
        logger.record_tabular('epoch', epoch)
        timesteps = (MPI.COMM_WORLD.Get_size() * epoch * n_cycles * rollout_batches_per_cycle *
                     train_rollout_worker.rollout_batch_size * train_rollout_worker.T)
        logger.record_tabular('timesteps', timesteps)
        if do_evaluation:
            for key, val in eval_rollout_worker.logs('test'):
                logger.record_tabular(key, mpi_average(val))
        for key, val in train_rollout_worker.logs('train'):
            logger.record_tabular(key, mpi_average(val))
        for role, agent in agents.items():
            for key, val in agent.get_stats().items():
                logger.record_tabular("{}_agent_{}".format(role, key),
                                      mpi_average(val))

        if rank == 0:
            logger.dump_tabular()

        # sanity check: make sure that different MPI ranks draw different random numbers
        local_uniform = np.random.uniform(size=(1, ))
        root_uniform = local_uniform.copy()
        MPI.COMM_WORLD.Bcast(root_uniform, root=0)
        if rank != 0:
            assert local_uniform[0] != root_uniform[0]

        epoch_end_time = datetime.now()
        if rank == 0:
            logger.info("(epoch took {} seconds)".format(
                (epoch_end_time - epoch_start_time).total_seconds()))
            logger.info("(completed at {})".format(epoch_end_time))

        if should_quit_early:
            break

    if rank == 0:
        save_path = os.path.join(logger.get_dir(), 'saved_model', 'model.ckpt')
        logger.info("Saving models to {}".format(save_path))
        saver.save(sess, save_path)
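# mpi_average is used throughout train() but not defined in this snippet. A
# minimal sketch consistent with how it is called here (a scalar or small list
# in, a single value averaged over MPI.COMM_WORLD out) might look like the
# following; the repository's actual implementation may differ.
import numpy as np
from mpi4py import MPI

def mpi_average_sketch(value, comm=MPI.COMM_WORLD):
    value = np.asarray(value, dtype=np.float64).ravel()
    if value.size == 0:
        value = np.zeros(1, dtype=np.float64)
    local_mean = np.array([np.mean(value)], dtype=np.float64)
    global_sum = np.zeros(1, dtype=np.float64)
    comm.Allreduce(local_mean, global_sum, op=MPI.SUM)  # sum of per-rank means
    return float(global_sum[0] / comm.Get_size())       # average over ranks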
    def __init__(self, obs0, action, obs1, clip_norm, hidden, layers):

        logger.info("Using Random Network Distillation")

        rep_size = hidden

        # RND bonus: a fixed, randomly initialized target network embeds the next
        # observation, and a trainable predictor network is regressed onto that
        # embedding; the prediction error is used as the intrinsic reward. (The
        # original RND implementation also has a convolutional variant for image
        # observations; only MLPs are used here.)
        with tf.variable_scope('random_network_distillation'):
            self.rnd_scope = tf.get_variable_scope().name
            # Random Target Network

            with tf.variable_scope('target_network'):
                # Fixed random embedding of the next observation; these variables are
                # never trained (only the predictor's variables receive gradients).
                xr = nn(obs1, [hidden] * layers + [rep_size])

            with tf.variable_scope('predictor_network'):
                self.predictor_scope = tf.get_variable_scope().name

                # Predictor network: same MLP architecture as the target, trained to
                # match the (stop-gradient) target embedding of the next observation.
                xr_hat = nn(obs1, [hidden] * layers + [rep_size])

        # Count and log the number of parameters in the RND predictor network
        # (the fixed target network has the same architecture and size).
        total_parameters = 0
        for variable in _vars(self.predictor_scope):
            variable_parameters = 1
            for dim in variable.get_shape():  # shape is a list of tf.Dimension
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        logger.info("params in RND predictor network: {}".format(total_parameters))

        self.feat_var = tf.reduce_mean(tf.nn.moments(xr, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(xr))
        # loss functions
        self.per_sample_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(xr) - xr_hat), axis=-1, keepdims=True)
        self.mean_loss_tf = tf.reduce_mean(self.per_sample_loss_tf)

        self.dynamics_grads = U.flatgrad(self.mean_loss_tf, _vars(self.predictor_scope), clip_norm=clip_norm)

        # optimizers
        self.dynamics_adam = MpiAdam(_vars(self.predictor_scope), scale_grad_by_procs=False)
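# Hedged sketch of how the pieces defined above would typically be used to train
# the RND predictor (assumes an MpiAdam with the usual baselines-style
# update(grads, stepsize) method and a TF1 session that can evaluate the graph;
# the learning rate and feed_dict contents are illustrative, not taken from this
# snippet).
def train_rnd_predictor_step(sess, rnd, feed_dict, learning_rate=1e-3):
    # Evaluate the flattened gradient of the predictor loss, then apply it with
    # the MPI-synchronized Adam optimizer. The per-sample loss can be read out
    # separately and used as the intrinsic reward for the corresponding samples.
    grads, per_sample_loss = sess.run(
        [rnd.dynamics_grads, rnd.per_sample_loss_tf], feed_dict=feed_dict)
    rnd.dynamics_adam.update(grads, stepsize=learning_rate)
    return per_sample_loss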