Example #1
def experiment(variant):
    env = Point2DEnv(**variant['env_kwargs'])
    env = FlatGoalEnv(env)
    env = NormalizedBoxEnv(env)

    action_dim = int(np.prod(env.action_space.shape))
    obs_dim = int(np.prod(env.observation_space.shape))

    qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant['qf_kwargs'])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant['qf_kwargs'])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant['qf_kwargs'])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant['qf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                action_dim=action_dim,
                                **variant['policy_kwargs'])
    eval_env = expl_env = env

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = TwinSACTrainer(env=eval_env,
                             policy=policy,
                             qf1=qf1,
                             qf2=qf2,
                             target_qf1=target_qf1,
                             target_qf2=target_qf2,
                             **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
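
# For reference, a hypothetical variant for the launcher above. The keys mirror
# the variant[...] lookups in the code; all values are illustrative assumptions.
example_variant = dict(
    env_kwargs=dict(),  # forwarded to Point2DEnv
    qf_kwargs=dict(hidden_sizes=[256, 256]),
    policy_kwargs=dict(hidden_sizes=[256, 256]),
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(discount=0.99),
    algo_kwargs=dict(
        max_path_length=100,
        batch_size=256,
        num_epochs=100,
        num_eval_steps_per_epoch=1000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
)
# experiment(example_variant)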
Example #2
def _pointmass_fixed_goal_experiment(vae_latent_size,
                                     replay_buffer_size,
                                     cnn_kwargs,
                                     vae_kwargs,
                                     policy_kwargs,
                                     qf_kwargs,
                                     e2e_trainer_kwargs,
                                     sac_trainer_kwargs,
                                     algorithm_kwargs,
                                     eval_path_collector_kwargs=None,
                                     expl_path_collector_kwargs=None,
                                     **kwargs):
    if expl_path_collector_kwargs is None:
        expl_path_collector_kwargs = {}
    if eval_path_collector_kwargs is None:
        eval_path_collector_kwargs = {}
    from multiworld.core.image_env import ImageEnv
    from multiworld.envs.pygame.point2d import Point2DEnv
    from multiworld.core.flat_goal_env import FlatGoalEnv
    env = Point2DEnv(
        images_are_rgb=True,
        render_onscreen=False,
        show_goal=False,
        ball_radius=2,
        render_size=48,
        fixed_goal=(0, 0),
    )
    env = ImageEnv(env, imsize=env.render_size, transpose=True, normalize=True)
    env = FlatGoalEnv(env)  # optionally: FlatGoalEnv(env, append_goal_to_obs=True)
    input_width, input_height = env.image_shape

    action_dim = int(np.prod(env.action_space.shape))
    vae = ConvVAE(
        representation_size=vae_latent_size,
        input_channels=3,
        imsize=input_width,
        decoder_output_activation=nn.Sigmoid(),
        # decoder_distribution='gaussian_identity_variance',
        **vae_kwargs)
    encoder = Vae2Encoder(vae)

    def make_cnn():
        return networks.CNN(input_width=input_width,
                            input_height=input_height,
                            input_channels=3,
                            output_conv_channels=True,
                            output_size=None,
                            **cnn_kwargs)

    def make_qf():
        return networks.MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(encoder, networks.Flatten()),
            output_size=1,
            input_size=action_dim + vae_latent_size,
            **qf_kwargs)

    qf1 = make_qf()
    qf2 = make_qf()
    target_qf1 = make_qf()
    target_qf2 = make_qf()
    policy_cnn = make_cnn()
    policy = TanhGaussianPolicyAdapter(
        nn.Sequential(policy_cnn, networks.Flatten()),
        policy_cnn.conv_output_flat_size, action_dim, **policy_kwargs)
    eval_env = expl_env = env

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy,
                                           **eval_path_collector_kwargs)
    replay_buffer = EnvReplayBuffer(
        replay_buffer_size,
        expl_env,
    )
    vae_trainer = VAETrainer(vae)
    sac_trainer = SACTrainer(env=eval_env,
                             policy=policy,
                             qf1=qf1,
                             qf2=qf2,
                             target_qf1=target_qf1,
                             target_qf2=target_qf2,
                             **sac_trainer_kwargs)
    trainer = End2EndSACTrainer(
        sac_trainer=sac_trainer,
        vae_trainer=vae_trainer,
        **e2e_trainer_kwargs,
    )
    expl_path_collector = MdpPathCollector(expl_env, policy,
                                           **expl_path_collector_kwargs)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **algorithm_kwargs)
    algorithm.to(ptu.device)
    algorithm.train()
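
# A hypothetical invocation of the launcher above. The keyword names come from
# its signature; the nested values (CNN layer spec, sizes, step counts) are
# assumptions.
example_kwargs = dict(
    vae_latent_size=8,
    replay_buffer_size=int(1e6),
    cnn_kwargs=dict(kernel_sizes=[3, 3], n_channels=[16, 16],
                    strides=[1, 1], paddings=[0, 0]),
    vae_kwargs=dict(),
    policy_kwargs=dict(hidden_sizes=[256, 256]),
    qf_kwargs=dict(hidden_sizes=[256, 256]),
    e2e_trainer_kwargs=dict(),
    sac_trainer_kwargs=dict(discount=0.99),
    algorithm_kwargs=dict(
        max_path_length=50,
        batch_size=256,
        num_epochs=100,
        num_eval_steps_per_epoch=500,
        num_expl_steps_per_train_loop=500,
        num_trains_per_train_loop=500,
        min_num_steps_before_training=500,
    ),
)
# _pointmass_fixed_goal_experiment(**example_kwargs)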
Example #3
def experiment(variant):
    env = RobosuiteStateWrapperEnv(wrapped_env_id=variant['env_id'],
                                   **variant['env_kwargs'])

    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    hidden_sizes = variant['hidden_sizes']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes,
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes,
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes,
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes,
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=hidden_sizes,
    )

    es = OUStrategy(action_space=env.action_space,
                    max_sigma=variant['exploration_noise'],
                    min_sigma=variant['exploration_noise'])

    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        env,
        exploration_policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        env,
    )
    trainer = SACTrainer(env=env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=env,
        evaluation_env=env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        num_epochs=variant['num_epochs'],
        num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
        num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
        num_trains_per_train_loop=variant['num_trains_per_train_loop'],
        min_num_steps_before_training=variant['min_num_steps_before_training'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
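
# A hypothetical variant for the Robosuite launcher above; the keys match its
# variant[...] lookups, while the env id, sizes, and noise level are assumptions.
example_variant = dict(
    env_id='Lift',  # hypothetical wrapped Robosuite env id
    env_kwargs=dict(),
    hidden_sizes=[256, 256],
    exploration_noise=0.1,
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(discount=0.99),
    max_path_length=200,
    batch_size=256,
    num_epochs=300,
    num_eval_steps_per_epoch=1000,
    num_expl_steps_per_train_loop=1000,
    num_trains_per_train_loop=1000,
    min_num_steps_before_training=1000,
)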
def experiment(variant):
    expl_env = roboverse.make(variant['env'],
                              gui=False,
                              randomize=True,
                              observation_mode='state',
                              reward_type='shaped',
                              transpose_image=True)
    eval_env = expl_env
    action_dim = int(np.prod(eval_env.action_space.shape))
    obs_dim = 11  # hardcoded dimension of this env's state observation

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy_class = variant.get("policy_class", TanhGaussianPolicy)
    policy = policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs'],
    )

    buffer_policy = policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs'],
    )

    trainer = AWRSACTrainer(env=eval_env,
                            policy=policy,
                            qf1=qf1,
                            qf2=qf2,
                            target_qf1=target_qf1,
                            target_qf2=target_qf2,
                            buffer_policy=buffer_policy,
                            **variant['trainer_kwargs'])

    expl_policy = policy
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )

    replay_buffer_kwargs = dict(
        max_replay_buffer_size=variant['replay_buffer_size'],
        env=expl_env,
    )

    replay_buffer = EnvReplayBuffer(**replay_buffer_kwargs)

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        num_epochs=variant['num_epochs'],
        num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
        num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
        num_trains_per_train_loop=variant['num_trains_per_train_loop'],
        min_num_steps_before_training=variant['min_num_steps_before_training'],
    )

    algorithm.to(ptu.device)

    demo_train_buffer = EnvReplayBuffer(**replay_buffer_kwargs)
    demo_test_buffer = EnvReplayBuffer(**replay_buffer_kwargs)

    path_loader_kwargs = variant.get("path_loader_kwargs", {})

    save_paths = None  # FIXME(avi): hook not implemented; appending None will fail if 'save_paths' is enabled
    if variant.get('save_paths', False):
        algorithm.post_train_funcs.append(save_paths)
    if variant.get('load_demos', False):
        path_loader_class = variant.get('path_loader_class', MDPPathLoader)
        path_loader = path_loader_class(trainer,
                                        replay_buffer=replay_buffer,
                                        demo_train_buffer=demo_train_buffer,
                                        demo_test_buffer=demo_test_buffer,
                                        **path_loader_kwargs)
        path_loader.load_demos()
    if variant.get('pretrain_policy', False):
        trainer.pretrain_policy_with_bc()
    if variant.get('pretrain_rl', False):
        trainer.pretrain_q_with_bc_data()
    if variant.get('save_pretrained_algorithm', False):
        p_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.p')
        pt_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.pt')
        data = algorithm._get_snapshot()
        data['algorithm'] = algorithm
        torch.save(data, pt_path)
        torch.save(data, p_path)
    if variant.get('train_rl', True):
        algorithm.train()
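
# The optional demo-loading / pretraining switches consumed above, sketched with
# assumed values; the demo path is a placeholder, not a real file.
example_demo_flags = dict(
    load_demos=True,
    path_loader_kwargs=dict(demo_paths=['path/to/demos.pkl']),  # placeholder path
    pretrain_policy=True,   # calls trainer.pretrain_policy_with_bc()
    pretrain_rl=False,      # calls trainer.pretrain_q_with_bc_data()
    save_pretrained_algorithm=False,
    train_rl=True,
)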
def experiment(variant):
    env_params = ENV_PARAMS[variant['env']]
    variant.update(env_params)
    variant['path_loader_kwargs']['demo_path'] = env_params['demo_path']
    variant['trainer_kwargs']['bc_num_pretrain_steps'] = env_params['bc_num_pretrain_steps']

    if 'env_id' in env_params:
        expl_env = gym.make(env_params['env_id'])
        eval_env = gym.make(env_params['env_id'])
    else:
        raise ValueError("env_params must specify an 'env_id'")
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    N = variant['num_layers']
    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M] * N,
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M] * N,
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M] * N,
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M] * N,
    )
    if variant.get('policy_class') == TanhGaussianPolicy:
        policy = TanhGaussianPolicy(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[M] * N,
        )
    else:
        policy = GaussianPolicy(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[M] * N,
            max_log_std=0,
            min_log_std=-6,
        )

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    replay_buffer = AWREnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        use_weights=variant['use_weights'],
        policy=policy,
        qf1=qf1,
        weight_update_period=variant['weight_update_period'],
        beta=variant['trainer_kwargs']['beta'],
    )
    trainer = AWRSACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )
    if variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant['min_num_steps_before_training'],
        )
    else:
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
        )
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant['min_num_steps_before_training'],
        )
    algorithm.to(ptu.device)

    demo_train_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    demo_test_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    path_loader_class = variant.get('path_loader_class', MDPPathLoader)
    path_loader = path_loader_class(trainer,
                                    replay_buffer=replay_buffer,
                                    demo_train_buffer=demo_train_buffer,
                                    demo_test_buffer=demo_test_buffer,
                                    **variant['path_loader_kwargs']
                                    )
    if variant.get('load_demos', False):
        path_loader.load_demos()
    if variant.get('pretrain_policy', False):
        trainer.pretrain_policy_with_bc()
    if variant.get('pretrain_rl', False):
        trainer.pretrain_q_with_bc_data()
    if variant.get('train_rl', True):
        algorithm.train()
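
# A hypothetical ENV_PARAMS entry of the shape the launcher above expects: it
# must supply 'env_id', 'demo_path', and 'bc_num_pretrain_steps'; the concrete
# values here are illustrative.
example_env_params = {
    'half-cheetah': dict(
        env_id='HalfCheetah-v2',
        demo_path='path/to/demos.pkl',  # placeholder
        bc_num_pretrain_steps=10000,
    ),
}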
Example #6
def experiment(variant):
    import gym
    from multiworld.envs.mujoco import register_custom_envs
    from multiworld.core.flat_goal_env import FlatGoalEnv

    register_custom_envs()
    expl_env = FlatGoalEnv(
        gym.make(variant['env_id']),
        obs_keys=['state_observation'],
        goal_keys=['xy_desired_goal'],
        append_goal_to_obs=False,
    )
    eval_env = FlatGoalEnv(
        gym.make(variant['env_id']),
        obs_keys=['state_observation'],
        goal_keys=['xy_desired_goal'],
        append_goal_to_obs=False,
    )

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )
    if variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant['min_num_steps_before_training'],
        )
    else:
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
        )
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant['min_num_steps_before_training'],
        )
    algorithm.to(ptu.device)
    algorithm.train()
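
# A hypothetical variant for the launcher above. 'collection_mode' selects the
# step-collector online algorithm versus the batch algorithm; all values here
# are illustrative assumptions.
example_variant = dict(
    env_id='SawyerPushXY-v0',  # hypothetical multiworld env id
    layer_size=256,
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(discount=0.99),
    collection_mode='batch',  # or 'online'
    max_path_length=50,
    batch_size=256,
    num_epochs=100,
    num_eval_steps_per_epoch=1000,
    num_expl_steps_per_train_loop=1000,
    num_trains_per_train_loop=1000,
    min_num_steps_before_training=1000,
)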
def experiment(variant):

    expl_env = roboverse.make(variant['env'],
                              gui=False,
                              randomize=variant['randomize_env'],
                              observation_mode=variant['obs'],
                              reward_type='shaped',
                              transpose_image=True)

    if variant['obs'] == 'pixels_debug':
        robot_state_dims = 11
    elif variant['obs'] == 'pixels':
        robot_state_dims = 4
    else:
        raise NotImplementedError

    expl_env = FlatEnv(expl_env,
                       use_robot_state=variant['use_robot_state'],
                       robot_state_dims=robot_state_dims)
    eval_env = expl_env

    img_width, img_height = eval_env.image_shape
    num_channels = 3

    action_dim = int(np.prod(eval_env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
    )
    if variant['use_robot_state']:
        cnn_params.update(
            added_fc_input_size=robot_state_dims,
            output_conv_channels=False,
            hidden_sizes=[400, 400],
            output_size=200,
        )
    else:
        cnn_params.update(
            added_fc_input_size=0,
            output_conv_channels=True,
            output_size=None,
        )
    qf_cnn = CNN(**cnn_params)

    if variant['use_robot_state']:
        qf_obs_processor = qf_cnn
    else:
        qf_obs_processor = nn.Sequential(
            qf_cnn,
            Flatten(),
        )

    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1

    if variant['use_robot_state']:
        qf_kwargs['input_size'] = (action_dim + qf_cnn.output_size)
    else:
        qf_kwargs['input_size'] = (action_dim + qf_cnn.conv_output_flat_size)

    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    if variant['use_robot_state']:
        target_qf_obs_processor = target_qf_cnn
    else:
        target_qf_obs_processor = nn.Sequential(
            target_qf_cnn,
            Flatten(),
        )

    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1

    if variant['use_robot_state']:
        target_qf_kwargs['input_size'] = (action_dim +
                                          target_qf_cnn.output_size)
    else:
        target_qf_kwargs['input_size'] = (action_dim +
                                          target_qf_cnn.conv_output_flat_size)

    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    policy_cnn = CNN(**cnn_params)
    if variant['use_robot_state']:
        policy_obs_processor = policy_cnn
    else:
        policy_obs_processor = nn.Sequential(
            policy_cnn,
            Flatten(),
        )

    if variant['use_robot_state']:
        policy = TanhGaussianPolicyAdapter(policy_obs_processor,
                                           policy_cnn.output_size, action_dim,
                                           **variant['policy_kwargs'])
    else:
        policy = TanhGaussianPolicyAdapter(policy_obs_processor,
                                           policy_cnn.conv_output_flat_size,
                                           action_dim,
                                           **variant['policy_kwargs'])

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env, eval_policy, **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(
            expl_env, policy, **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env, policy, **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    else:
        raise NotImplementedError

    video_func = VideoSaveFunctionBullet(variant)
    algorithm.post_train_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
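
# A hypothetical variant for the pixel-based roboverse launcher above. The
# launcher fills in input_width/height/channels and the robot-state settings,
# so cnn_params only needs the conv architecture; all values are assumptions.
example_variant = dict(
    env='SawyerGraspOne-v0',  # hypothetical roboverse env name
    randomize_env=True,
    obs='pixels_debug',  # or 'pixels'
    use_robot_state=True,
    cnn_params=dict(kernel_sizes=[3, 3, 3], n_channels=[16, 16, 16],
                    strides=[1, 1, 1], paddings=[1, 1, 1]),
    qf_kwargs=dict(hidden_sizes=[256, 256]),
    policy_kwargs=dict(),
    trainer_kwargs=dict(discount=0.99),
    eval_path_collector_kwargs=dict(),
    expl_path_collector_kwargs=dict(),
    replay_buffer_size=int(1e5),
    collection_mode='batch',
    algo_kwargs=dict(
        max_path_length=30,
        batch_size=256,
        num_epochs=100,
        num_eval_steps_per_epoch=300,
        num_expl_steps_per_train_loop=300,
        num_trains_per_train_loop=300,
        min_num_steps_before_training=300,
    ),
)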
def experiment(variant):
    env_params = ENV_PARAMS[variant['env']]
    variant.update(env_params)

    if 'env_id' in env_params:

        expl_env = gym.make(env_params['env_id'])
        eval_env = gym.make(env_params['env_id'])
    else:
        expl_env = NormalizedBoxEnv(variant['env_class']())
        eval_env = NormalizedBoxEnv(variant['env_class']())

    path_loader_kwargs = variant.get("path_loader_kwargs", {})
    stack_obs = path_loader_kwargs.get("stack_obs", 1)
    expl_env = StackObservationEnv(expl_env, stack_obs=stack_obs)
    eval_env = StackObservationEnv(eval_env, stack_obs=stack_obs)

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    if hasattr(expl_env, 'info_sizes'):
        env_info_sizes = expl_env.info_sizes
    else:
        env_info_sizes = dict()

    replay_buffer_kwargs = dict(
        max_replay_buffer_size=variant['replay_buffer_size'],
        env=expl_env,
    )

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs'],
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    replay_buffer = EnvReplayBuffer(
        **replay_buffer_kwargs,
    )
    trainer = AWRSACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )
    if variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant['min_num_steps_before_training'],
        )
    else:
        if variant.get("deterministic_exploration", False):
            expl_policy = eval_policy
        else:
            expl_policy = policy
        expl_path_collector = MdpPathCollector(
            expl_env,
            expl_policy,
        )
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant['min_num_steps_before_training'],
        )
    algorithm.to(ptu.device)

    demo_train_buffer = EnvReplayBuffer(
        **replay_buffer_kwargs,
    )
    demo_test_buffer = EnvReplayBuffer(
        **replay_buffer_kwargs,
    )

    if variant.get('save_paths', False):
        algorithm.post_train_funcs.append(save_paths)

    if variant.get('load_demos', False):
        path_loader_class = variant.get('path_loader_class', MDPPathLoader)
        path_loader = path_loader_class(trainer,
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
            **path_loader_kwargs
        )
        path_loader.load_demos()
    if variant.get('pretrain_policy', False):
        trainer.pretrain_policy_with_bc()
    if variant.get('pretrain_rl', False):
        trainer.pretrain_q_with_bc_data()

    algorithm.train()
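
# path_loader_kwargs as consumed above, sketched with assumed values. Note that
# 'stack_obs' is read both by the path loader and, earlier, to wrap both envs
# in StackObservationEnv; the demo path is a placeholder.
example_path_loader_kwargs = dict(
    stack_obs=1,
    demo_paths=['path/to/demos.pkl'],  # placeholder
)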
def experiment(variant):

    expl_env = PointmassBaseEnv()
    eval_env = expl_env

    action_dim = int(np.prod(eval_env.action_space.shape))
    state_dim = 3

    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = action_dim + state_dim
    qf1 = MlpQf(**qf_kwargs)
    qf2 = MlpQf(**qf_kwargs)

    target_qf_kwargs = copy.deepcopy(qf_kwargs)
    target_qf1 = MlpQf(**target_qf_kwargs)
    target_qf2 = MlpQf(**target_qf_kwargs)

    policy_kwargs = copy.deepcopy(variant['policy_kwargs'])
    policy_kwargs['action_dim'] = action_dim
    policy_kwargs['obs_dim'] = state_dim
    policy = TanhGaussianPolicy(**policy_kwargs)

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env, eval_policy, **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(
            expl_env, policy, **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env, policy, **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
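
# A hypothetical variant for the state-based pointmass launcher above; all
# values are illustrative.
example_variant = dict(
    qf_kwargs=dict(hidden_sizes=[256, 256]),
    policy_kwargs=dict(hidden_sizes=[256, 256]),
    eval_path_collector_kwargs=dict(),
    expl_path_collector_kwargs=dict(),
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(discount=0.99),
    collection_mode='batch',  # or 'online'
    algo_kwargs=dict(
        max_path_length=50,
        batch_size=256,
        num_epochs=100,
        num_eval_steps_per_epoch=500,
        num_expl_steps_per_train_loop=500,
        num_trains_per_train_loop=500,
        min_num_steps_before_training=500,
    ),
)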
def experiment(variant):

    expl_env = FlatEnv(PointmassBaseEnv(observation_mode='pixels',
                                        transpose_image=True),
                       use_robot_state=False)

    eval_env = expl_env

    img_width, img_height = (48, 48)
    num_channels = 3

    action_dim = int(np.prod(eval_env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
        added_fc_input_size=4,
        output_conv_channels=True,
        output_size=None,
    )

    qf_cnn = CNN(**cnn_params)
    qf_obs_processor = nn.Sequential(
        qf_cnn,
        Flatten(),
    )

    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = (action_dim + qf_cnn.conv_output_flat_size)
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    target_qf_obs_processor = nn.Sequential(
        target_qf_cnn,
        Flatten(),
    )
    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    target_qf_kwargs['input_size'] = (action_dim +
                                      target_qf_cnn.conv_output_flat_size)
    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(
        policy_cnn,
        Flatten(),
    )
    policy = TanhGaussianPolicyAdapter(policy_obs_processor,
                                       policy_cnn.conv_output_flat_size,
                                       action_dim, **variant['policy_kwargs'])

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env, eval_policy, **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(
            expl_env, policy, **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env, policy, **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
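
# This pixel-based launcher hard-codes a 48x48x3 input, so cnn_params only
# needs to supply the conv architecture. The layer spec below is an
# illustrative assumption, not taken from the source.
example_cnn_params = dict(
    kernel_sizes=[3, 3],
    n_channels=[16, 16],
    strides=[1, 1],
    paddings=[1, 1],
)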
def experiment(variant):
    # from softlearning.environments.gym import register_image_reach
    # register_image_reach()
    # env = gym.envs.make(
    #     'Pusher2d-ImageReach-v0',
    # )
    from softlearning.environments.gym.mujoco.image_pusher_2d import (
        ImageForkReacher2dEnv)

    env_kwargs = {
        'image_shape': (32, 32, 3),
        'arm_goal_distance_cost_coeff': 1.0,
        'arm_object_distance_cost_coeff': 0.0,
    }

    eval_env = ImageForkReacher2dEnv(**env_kwargs)
    expl_env = ImageForkReacher2dEnv(**env_kwargs)

    input_width, input_height, input_channels = eval_env.image_shape
    image_dim = input_width * input_height * input_channels

    action_dim = int(np.prod(eval_env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=input_width,
        input_height=input_height,
        input_channels=input_channels,
        added_fc_input_size=4,
        output_conv_channels=True,
        output_size=None,
    )
    non_image_dim = int(np.prod(eval_env.observation_space.shape)) - image_dim
    if variant['shared_qf_conv']:
        qf_cnn = CNN(**cnn_params)
        qf_obs_processor = nn.Sequential(
            Split(qf_cnn, identity, image_dim),
            FlattenEach(),
            Concat(),
        )

        qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
        qf_kwargs['obs_processor'] = qf_obs_processor
        qf_kwargs['output_size'] = 1
        qf_kwargs['input_size'] = (action_dim + qf_cnn.conv_output_flat_size +
                                   non_image_dim)
        qf1 = MlpQfWithObsProcessor(**qf_kwargs)
        qf2 = MlpQfWithObsProcessor(**qf_kwargs)

        target_qf_cnn = CNN(**cnn_params)
        target_qf_obs_processor = nn.Sequential(
            Split(target_qf_cnn, identity, image_dim),
            FlattenEach(),
            Concat(),
        )
        target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
        target_qf_kwargs['obs_processor'] = target_qf_obs_processor
        target_qf_kwargs['output_size'] = 1
        target_qf_kwargs['input_size'] = (action_dim +
                                          target_qf_cnn.conv_output_flat_size +
                                          non_image_dim)
        target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
        target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)
    else:
        qf1_cnn = CNN(**cnn_params)
        cnn_output_dim = qf1_cnn.conv_output_flat_size
        qf1 = MlpQfWithObsProcessor(obs_processor=qf1_cnn,
                                    output_size=1,
                                    input_size=action_dim + cnn_output_dim,
                                    **variant['qf_kwargs'])
        qf2 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params),
                                    output_size=1,
                                    input_size=action_dim + cnn_output_dim,
                                    **variant['qf_kwargs'])
        target_qf1 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params),
                                           output_size=1,
                                           input_size=action_dim +
                                           cnn_output_dim,
                                           **variant['qf_kwargs'])
        target_qf2 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params),
                                           output_size=1,
                                           input_size=action_dim +
                                           cnn_output_dim,
                                           **variant['qf_kwargs'])
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(
        Split(policy_cnn, identity, image_dim),
        FlattenEach(),
        Concat(),
    )
    policy = TanhGaussianPolicyAdapter(
        policy_obs_processor, policy_cnn.conv_output_flat_size + non_image_dim,
        action_dim, **variant['policy_kwargs'])

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env, eval_policy, **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(
            expl_env, policy, **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env, policy, **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
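
# A hypothetical variant for the launcher above. 'shared_qf_conv' switches
# between one CNN shared by qf1/qf2 and a separate CNN per Q-network; all other
# values are illustrative.
example_variant = dict(
    shared_qf_conv=True,
    cnn_params=dict(kernel_sizes=[3, 3], n_channels=[16, 16],
                    strides=[1, 1], paddings=[1, 1]),
    qf_kwargs=dict(hidden_sizes=[256, 256]),
    policy_kwargs=dict(),
    trainer_kwargs=dict(discount=0.99),
    eval_path_collector_kwargs=dict(),
    expl_path_collector_kwargs=dict(),
    replay_buffer_size=int(1e5),
    collection_mode='batch',  # or 'online'
    algo_kwargs=dict(
        max_path_length=100,
        batch_size=256,
        num_epochs=100,
        num_eval_steps_per_epoch=1000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
)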
Example #12
def experiment(variant):
    if variant.get("pretrained_algorithm_path", False):
        resume(variant)
        return

    if 'env' in variant:
        env_params = ENV_PARAMS[variant['env']]
        variant.update(env_params)

        if 'env_id' in env_params:
            if env_params['env_id'] in [
                    'pen-v0', 'pen-sparse-v0', 'door-v0', 'relocate-v0',
                    'hammer-v0', 'door-sparse-v0',
                    'relocate-sparse-v0', 'hammer-sparse-v0'
            ]:
                import mj_envs  # noqa: F401 -- imported for its gym env registration side effect
            expl_env = gym.make(env_params['env_id'])
            eval_env = gym.make(env_params['env_id'])
        else:
            expl_env = NormalizedBoxEnv(variant['env_class']())
            eval_env = NormalizedBoxEnv(variant['env_class']())

        if variant.get('sparse_reward', False):
            expl_env = RewardWrapperEnv(expl_env, compute_hand_sparse_reward)
            eval_env = RewardWrapperEnv(eval_env, compute_hand_sparse_reward)

        if variant.get('add_env_demos', False):
            variant["path_loader_kwargs"]["demo_paths"].append(
                variant["env_demo_path"])

        if variant.get('add_env_offpolicy_data', False):
            variant["path_loader_kwargs"]["demo_paths"].append(
                variant["env_offpolicy_data_path"])
    else:
        expl_env = encoder_wrapped_env(variant)
        eval_env = encoder_wrapped_env(variant)

    path_loader_kwargs = variant.get("path_loader_kwargs", {})
    stack_obs = path_loader_kwargs.get("stack_obs", 1)
    if stack_obs > 1:
        expl_env = StackObservationEnv(expl_env, stack_obs=stack_obs)
        eval_env = StackObservationEnv(eval_env, stack_obs=stack_obs)

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    if hasattr(expl_env, 'info_sizes'):
        env_info_sizes = expl_env.info_sizes
    else:
        env_info_sizes = dict()

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy_class = variant.get("policy_class", TanhGaussianPolicy)
    policy = policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs'],
    )

    buffer_policy = policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs'],
    )

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )

    expl_policy = policy
    exploration_kwargs = variant.get('exploration_kwargs', {})
    if exploration_kwargs:
        if exploration_kwargs.get("deterministic_exploration", False):
            expl_policy = MakeDeterministic(policy)

        exploration_strategy = exploration_kwargs.get("strategy", None)
        if exploration_strategy is None:
            pass
        elif exploration_strategy == 'ou':
            es = OUStrategy(
                action_space=expl_env.action_space,
                max_sigma=exploration_kwargs['noise'],
                min_sigma=exploration_kwargs['noise'],
            )
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=es,
                policy=expl_policy,
            )
        elif exploration_strategy == 'gauss_eps':
            es = GaussianAndEpislonStrategy(
                action_space=expl_env.action_space,
                max_sigma=exploration_kwargs['noise'],
                min_sigma=exploration_kwargs['noise'],  # constant sigma
                epsilon=0,
            )
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=es,
                policy=expl_policy,
            )
        else:
            raise ValueError(
                "Unknown exploration strategy: {}".format(exploration_strategy))

    if variant.get('replay_buffer_class',
                   EnvReplayBuffer) == AWREnvReplayBuffer:
        main_replay_buffer_kwargs = variant['replay_buffer_kwargs']
        main_replay_buffer_kwargs['env'] = expl_env
        main_replay_buffer_kwargs['qf1'] = qf1
        main_replay_buffer_kwargs['qf2'] = qf2
        main_replay_buffer_kwargs['policy'] = policy
    else:
        main_replay_buffer_kwargs = dict(
            max_replay_buffer_size=variant['replay_buffer_size'],
            env=expl_env,
        )
    replay_buffer_kwargs = dict(
        max_replay_buffer_size=variant['replay_buffer_size'],
        env=expl_env,
    )

    replay_buffer = variant.get('replay_buffer_class',
                                EnvReplayBuffer)(**main_replay_buffer_kwargs)
    trainer = AWRSACTrainer(env=eval_env,
                            policy=policy,
                            qf1=qf1,
                            qf2=qf2,
                            target_qf1=target_qf1,
                            target_qf2=target_qf2,
                            buffer_policy=buffer_policy,
                            **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant[
                'num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant[
                'min_num_steps_before_training'],
        )
    else:
        expl_path_collector = MdpPathCollector(
            expl_env,
            expl_policy,
        )
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant[
                'num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant[
                'min_num_steps_before_training'],
        )
    algorithm.to(ptu.device)

    demo_train_buffer = EnvReplayBuffer(**replay_buffer_kwargs)
    demo_test_buffer = EnvReplayBuffer(**replay_buffer_kwargs)

    if variant.get('save_paths', False):
        algorithm.post_train_funcs.append(save_paths)
    if variant.get('load_demos', False):
        path_loader_class = variant.get('path_loader_class', MDPPathLoader)
        path_loader = path_loader_class(trainer,
                                        replay_buffer=replay_buffer,
                                        demo_train_buffer=demo_train_buffer,
                                        demo_test_buffer=demo_test_buffer,
                                        **path_loader_kwargs)
        path_loader.load_demos()
    if variant.get('pretrain_policy', False):
        trainer.pretrain_policy_with_bc()
    if variant.get('pretrain_rl', False):
        trainer.pretrain_q_with_bc_data()
    if variant.get('save_pretrained_algorithm', False):
        p_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.p')
        pt_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.pt')
        data = algorithm._get_snapshot()
        data['algorithm'] = algorithm
        torch.save(data, pt_path)
        torch.save(data, p_path)
    if variant.get('train_rl', True):
        algorithm.train()
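
# The exploration_kwargs block consumed above, sketched with assumed values.
# 'strategy' may be omitted, 'ou', or 'gauss_eps'; 'noise' sets a constant
# sigma for either strategy.
example_exploration_kwargs = dict(
    deterministic_exploration=False,
    strategy='gauss_eps',
    noise=0.1,
)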
def experiment(variant):

    ptu.set_gpu_mode(True, 0)

    from softlearning.environments.gym import register_image_reach
    register_image_reach()
    env = gym.make('Pusher2d-ImageReach-v0',
                   arm_goal_distance_cost_coeff=1.0,
                   arm_object_distance_cost_coeff=0.0)

    input_width, input_height = env.image_shape

    action_dim = int(np.prod(env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=input_width,
        input_height=input_height,
        input_channels=3,
        added_fc_input_size=4,
        output_conv_channels=True,
        output_size=None,
    )
    if variant['shared_qf_conv']:
        qf_cnn = CNN(**cnn_params)
        qf1 = MlpQfWithObsProcessor(
            obs_processor=qf_cnn,
            output_size=1,
            input_size=action_dim+qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs']
        )
        qf2 = MlpQfWithObsProcessor(
            obs_processor=qf_cnn,
            output_size=1,
            input_size=action_dim+qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs']
        )
        target_qf_cnn = CNN(**cnn_params)
        target_qf1 = MlpQfWithObsProcessor(
            obs_processor=target_qf_cnn,
            output_size=1,
            input_size=action_dim+qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs']
        )
        target_qf2 = MlpQfWithObsProcessor(
            obs_processor=target_qf_cnn,
            output_size=1,
            input_size=action_dim+qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs']
        )
    else:
        qf1_cnn = CNN(**cnn_params)
        cnn_output_dim = qf1_cnn.conv_output_flat_size
        qf1 = MlpQfWithObsProcessor(
            obs_processor=qf1_cnn,
            output_size=1,
            input_size=action_dim+cnn_output_dim,
            **variant['qf_kwargs']
        )
        qf2 = MlpQfWithObsProcessor(
            obs_processor=CNN(**cnn_params),
            output_size=1,
            input_size=action_dim+cnn_output_dim,
            **variant['qf_kwargs']
        )
        target_qf1 = MlpQfWithObsProcessor(
            obs_processor=CNN(**cnn_params),
            output_size=1,
            input_size=action_dim+cnn_output_dim,
            **variant['qf_kwargs']
        )
        target_qf2 = MlpQfWithObsProcessor(
            obs_processor=CNN(**cnn_params),
            output_size=1,
            input_size=action_dim+cnn_output_dim,
            **variant['qf_kwargs']
        )
    policy_cnn = CNN(**cnn_params)
    policy = TanhGaussianPolicyAdapter(
        policy_cnn,
        policy_cnn.conv_output_flat_size,
        action_dim,
    )
    eval_env = expl_env = env

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        **variant['eval_path_collector_kwargs']
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
            **variant['expl_path_collector_kwargs']
        )
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs']
        )
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
            **variant['expl_path_collector_kwargs']
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs']
        )
    elif variant['collection_mode'] == 'parallel':
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
            **variant['expl_path_collector_kwargs']
        )
        algorithm = TorchParallelRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs']
        )
    else:
        raise NotImplementedError(
            'Unknown collection_mode: {}'.format(variant['collection_mode'])
        )
    algorithm.to(ptu.device)
    algorithm.train()
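
For reference, a minimal variant for the launcher above might look like the sketch below. The shared_qf_conv flag name is assumed by analogy with Example #15; the other keys are all read by the code, but every value is illustrative, not a setting from the original source.

# Hypothetical configuration sketch; all values are assumptions.
variant = dict(
    shared_qf_conv=True,          # assumed branch flag, as in Example #15
    collection_mode='batch',      # 'batch', 'online', or 'parallel'
    replay_buffer_size=int(1e6),
    cnn_params=dict(),            # conv architecture; fill per the CNN class
    qf_kwargs=dict(),             # MLP-head kwargs for MlpQfWithObsProcessor
    trainer_kwargs=dict(),        # SACTrainer kwargs
    algo_kwargs=dict(),           # algorithm kwargs
    eval_path_collector_kwargs=dict(),
    expl_path_collector_kwargs=dict(),
)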
Example #14
def experiment(variant):
    from multiworld.envs.mujoco import register_goal_example_envs
    register_goal_example_envs()

    eval_env = gym.make('Image48SawyerPushForwardEnv-v0')
    expl_env = gym.make('Image48SawyerPushForwardEnv-v0')
    # Hack for now
    eval_env.wrapped_env.transpose = True
    expl_env.wrapped_env.transpose = True
    # More hacks, use a dense reward instead
    eval_env.wrapped_env.wrapped_env.reward_type = 'puck_distance'
    expl_env.wrapped_env.wrapped_env.reward_type = 'puck_distance'

    img_width, img_height = eval_env.image_shape
    num_channels = 3

    action_dim = int(np.prod(eval_env.action_space.shape))
    cnn_params = variant['cnn_params']
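    # The launcher owns the conv input geometry: the update below overwrites
    # any of these keys the caller may have set in cnn_params.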
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
        added_fc_input_size=4,
        output_conv_channels=True,
        output_size=None,
    )

    qf_cnn = CNN(**cnn_params)
    qf_obs_processor = nn.Sequential(
        qf_cnn,
        Flatten(),
    )
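    # NOTE: qf1 and qf2 below are both built from the same qf_kwargs dict,
    # so they reuse this single obs_processor and share conv weights; the
    # same holds for target_qf1/target_qf2 and target_qf_cnn further down.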

    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = (action_dim + qf_cnn.conv_output_flat_size)
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    target_qf_obs_processor = nn.Sequential(
        target_qf_cnn,
        Flatten(),
    )
    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    target_qf_kwargs['input_size'] = (action_dim +
                                      target_qf_cnn.conv_output_flat_size)
    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(
        policy_cnn,
        Flatten(),
    )
    policy = TanhGaussianPolicyAdapter(policy_obs_processor,
                                       policy_cnn.conv_output_flat_size,
                                       action_dim, **variant['policy_kwargs'])

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env, eval_policy, **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(
            expl_env, policy, **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env, policy, **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    else:
        raise NotImplementedError(
            'Unknown collection_mode: {}'.format(variant['collection_mode']))
    algorithm.to(ptu.device)
    algorithm.train()
Example #15
def experiment(variant):
    import multiworld.envs.pygame
    env = gym.make('Point2DEnv-ImageFixedGoal-v0')
    input_width, input_height = env.image_shape

    action_dim = int(np.prod(env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=input_width,
        input_height=input_height,
        input_channels=3,
        output_conv_channels=True,
        output_size=None,
    )
    if variant['shared_qf_conv']:
        qf_cnn = PretrainedCNN(**cnn_params)
        qf1 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(qf_cnn, Flatten()),
            output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs'])
        qf2 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(qf_cnn, Flatten()),
            output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs'])
        target_qf_cnn = PretrainedCNN(**cnn_params)
        target_qf1 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(target_qf_cnn, Flatten()),
            output_size=1,
            input_size=action_dim + target_qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs'])
        target_qf2 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(target_qf_cnn, Flatten()),
            output_size=1,
            input_size=action_dim + target_qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs'])
    else:
        qf1_cnn = PretrainedCNN(**cnn_params)
        cnn_output_dim = qf1_cnn.conv_output_flat_size
        qf1 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(qf1_cnn, Flatten()),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs'])
        qf2 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(PretrainedCNN(**cnn_params), Flatten()),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs'])
        target_qf1 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(PretrainedCNN(**cnn_params), Flatten()),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs'])
        target_qf2 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(PretrainedCNN(**cnn_params), Flatten()),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs'])
    policy_cnn = PretrainedCNN(**cnn_params)
    policy = TanhGaussianPolicyAdapter(nn.Sequential(policy_cnn, Flatten()),
                                       policy_cnn.conv_output_flat_size,
                                       action_dim, **variant['policy_kwargs'])
    eval_env = expl_env = env

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env, eval_policy, **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(
            expl_env, policy, **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env, policy, **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    else:
        raise NotImplementedError(
            'Unknown collection_mode: {}'.format(variant['collection_mode']))
    algorithm.to(ptu.device)
    algorithm.train()
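
The shared_qf_conv branch above reuses a single conv trunk for both Q-functions (plus a second trunk shared by both targets), while the else branch trains four independent trunks. A toy restatement of the two regimes using the same classes; the variable names here are illustrative only:

# shared_qf_conv=True: one trunk object feeds both Q-heads (shared weights)
trunk = PretrainedCNN(**cnn_params)
q1_obs_processor = nn.Sequential(trunk, Flatten())
q2_obs_processor = nn.Sequential(trunk, Flatten())  # same trunk instance

# shared_qf_conv=False: each Q-head owns and trains its own trunk
q1_obs_processor = nn.Sequential(PretrainedCNN(**cnn_params), Flatten())
q2_obs_processor = nn.Sequential(PretrainedCNN(**cnn_params), Flatten())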
Example #16
    def __init__(
            self,
            env,
            exploration_policy: ExplorationPolicy,
            training_env=None,
            eval_sampler=None,
            eval_policy=None,
            collection_mode='online',

            num_epochs=100,
            epoch_freq=1,
            num_steps_per_epoch=10000,
            num_steps_per_eval=1000,
            num_updates_per_env_step=1,
            num_updates_per_epoch=None,
            batch_size=1024,
            max_path_length=1000,
            discount=0.99,
            reward_scale=1,
            min_num_steps_before_training=None,
            replay_buffer_size=1000000,
            replay_buffer=None,
            train_on_eval_paths=False,
            do_training=True,
            oracle_transition_data=None,

            # I/O parameters
            render=False,
            render_during_eval=False,
            save_replay_buffer=False,
            save_algorithm=False,
            save_environment=False,
            normalize_env=True,

            # Remote env parameters
            sim_throttle=True,
            parallel_step_ratio=1,
            parallel_env_params=None,

            env_info_sizes=None,

            num_rollouts_per_eval=None,

            epoch_list=None,

            **kwargs
    ):
        """
        Base class for RL Algorithms
        :param env: Environment used to evaluate.
        :param exploration_policy: Policy used to explore
        :param training_env: Environment used by the algorithm. By default, a
        copy of `env` will be made.
        :param num_epochs:
        :param num_steps_per_epoch:
        :param num_steps_per_eval:
        :param num_updates_per_env_step: Used by online training mode.
        :param num_updates_per_epoch: Used by batch training mode.
        :param batch_size:
        :param max_path_length:
        :param discount:
        :param replay_buffer_size:
        :param reward_scale:
        :param render:
        :param save_replay_buffer:
        :param save_algorithm:
        :param save_environment:
        :param eval_sampler:
        :param eval_policy: Policy to evaluate with.
        :param collection_mode:
        :param sim_throttle:
        :param normalize_env:
        :param parallel_step_ratio:
        :param replay_buffer:
        """
        assert collection_mode in ['online', 'online-parallel', 'offline',
                                   'batch']
        if collection_mode == 'batch':
            assert num_updates_per_epoch is not None
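        # The pickle round-trip below is just a deep copy of env, used when
        # no separate training_env is provided.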
        self.training_env = training_env or pickle.loads(pickle.dumps(env))
        self.normalize_env = normalize_env
        self.exploration_policy = exploration_policy
        self.num_epochs = num_epochs
        self.epoch_freq = epoch_freq
        self.epoch_list = epoch_list
        self.num_env_steps_per_epoch = num_steps_per_epoch
        if collection_mode in ('online', 'online-parallel'):
            self.num_updates_per_train_call = num_updates_per_env_step
        else:
            self.num_updates_per_train_call = num_updates_per_epoch
        self.batch_size = batch_size
        self.max_path_length = max_path_length
        self.num_rollouts_per_eval = num_rollouts_per_eval
        if self.num_rollouts_per_eval is not None:
            self.num_steps_per_eval = (
                self.num_rollouts_per_eval * self.max_path_length
            )
        else:
            self.num_steps_per_eval = num_steps_per_eval
        self.discount = discount
        self.replay_buffer_size = replay_buffer_size
        self.reward_scale = reward_scale
        self.render = render
        self.render_during_eval = render_during_eval
        self.collection_mode = collection_mode
        self.save_replay_buffer = save_replay_buffer
        self.save_algorithm = save_algorithm
        self.save_environment = save_environment
        if min_num_steps_before_training is None:
            min_num_steps_before_training = self.num_env_steps_per_epoch
        self.min_num_steps_before_training = min_num_steps_before_training
        if eval_sampler is None:
            if eval_policy is None:
                eval_policy = exploration_policy
            eval_sampler = InPlacePathSampler(
                env=env,
                policy=eval_policy,
                max_samples=self.num_steps_per_eval + self.max_path_length,
                max_path_length=self.max_path_length,
                render=render_during_eval,
            )
        self.eval_policy = eval_policy
        self.eval_sampler = eval_sampler
        self.eval_statistics = OrderedDict()
        self.need_to_update_eval_statistics = True

        self.action_space = env.action_space
        self.obs_space = env.observation_space
        self.env = env
        if replay_buffer is None:
            self.replay_buffer = EnvReplayBuffer(
                self.replay_buffer_size,
                self.env,
                env_info_sizes=env_info_sizes,
            )
        else:
            self.replay_buffer = replay_buffer

        self._n_env_steps_total = 0
        self._n_train_steps_total = 0
        self._n_rollouts_total = 0
        self._do_train_time = 0
        self._epoch_start_time = None
        self._algo_start_time = None
        self._old_table_keys = None
        self._current_path_builder = PathBuilder()
        self._exploration_paths = []
        self.train_on_eval_paths = train_on_eval_paths
        self.do_training = do_training
        self.oracle_transition_data = oracle_transition_data

        self.parallel_step_ratio = parallel_step_ratio
        self.sim_throttle = sim_throttle
        self.parallel_env_params = parallel_env_params or {}
        self.init_rollout_function()
        self.post_epoch_funcs = []
        self._exploration_policy_noise = 0
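
For orientation, a hypothetical instantiation of a concrete subclass of this base class; the subclass name and all values below are assumptions, except that batch mode requiring num_updates_per_epoch follows from the assert above:

algo = MyBatchRLAlgorithm(       # hypothetical subclass
    env=env,
    exploration_policy=exploration_policy,
    collection_mode='batch',
    num_updates_per_epoch=1000,  # mandatory when collection_mode='batch'
    num_epochs=100,
    batch_size=1024,
    max_path_length=1000,
)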
Example #17
def experiment(variant):
    env_params = ENV_PARAMS[variant['env']]
    variant.update(env_params)

    expl_env = NormalizedBoxEnv(variant['env_class']())
    eval_env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )
    if variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant['min_num_steps_before_training'],
        )
    else:
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
        )
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant['min_num_steps_before_training'],
        )
    algorithm.to(ptu.device)
    algorithm.train()
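
This launcher also expects a module-level ENV_PARAMS table keyed by the env name; each entry is merged into the variant before anything else runs. A hypothetical entry (the key, class, and numbers are illustrative assumptions):

ENV_PARAMS = {
    'half-cheetah': dict(
        env_class=HalfCheetahEnv,   # assumed env class import
        max_path_length=1000,
        num_epochs=300,
        num_eval_steps_per_epoch=5000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
}

The variant itself must still supply env, layer_size, batch_size, replay_buffer_size, collection_mode, and trainer_kwargs.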
Example #18
def experiment(variant):

    expl_env = roboverse.make(variant['env'],
                              gui=False,
                              randomize=True,
                              observation_mode='state',
                              reward_type='shaped',
                              transpose_image=True)
    eval_env = expl_env

    action_dim = int(np.prod(eval_env.action_space.shape))
    state_dim = eval_env.observation_space.shape[0]

    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = action_dim + state_dim
    qf1 = MlpQf(**qf_kwargs)
    qf2 = MlpQf(**qf_kwargs)

    target_qf_kwargs = copy.deepcopy(qf_kwargs)
    target_qf1 = MlpQf(**target_qf_kwargs)
    target_qf2 = MlpQf(**target_qf_kwargs)

    policy_kwargs = copy.deepcopy(variant['policy_kwargs'])
    policy_kwargs['action_dim'] = action_dim
    policy_kwargs['obs_dim'] = state_dim
    policy = TanhGaussianPolicy(**policy_kwargs)

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env, eval_policy, **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])

    if variant['scripted_policy']:
        if 'V3-v0' in variant['env']:
            scripted_policy = GraspV3ScriptedPolicy(
                expl_env, noise_std=variant['scripted_noise_std'])
        elif 'V4-v0' in variant['env']:
            scripted_policy = GraspV4ScriptedPolicy(
                expl_env, noise_std=variant['scripted_noise_std'])
        elif 'V5-v0' in variant['env']:
            scripted_policy = GraspV5ScriptedPolicy(
                expl_env, noise_std=variant['scripted_noise_std'])
        else:
            raise NotImplementedError
    else:
        scripted_policy = None

    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
            optional_expl_policy=scripted_policy,
            optional_expl_probability_init=0.5,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
            optional_expl_policy=scripted_policy,
            optional_expl_probability=0.9,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    else:
        raise NotImplementedError(
            'Unknown collection_mode: {}'.format(variant['collection_mode']))

    # Optional hook to dump the replay buffer after training; currently
    # disabled because the registration below is commented out.
    dump_buffer_func = BufferSaveFunction(variant)
    # algorithm.post_train_funcs.append(dump_buffer_func)

    algorithm.to(ptu.device)
    algorithm.train()
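
Note the asymmetric mixing above: batch mode starts taking scripted actions with probability 0.5 (the optional_expl_probability_init name suggests a schedule), while online mode uses a fixed 0.9. A hypothetical minimal variant for this launcher; all names and values are assumptions:

variant = dict(
    env='YourEnv-V4-v0',        # placeholder id; must contain 'V3-v0', 'V4-v0', or 'V5-v0'
    scripted_policy=True,
    scripted_noise_std=0.1,
    collection_mode='batch',    # or 'online'
    replay_buffer_size=int(1e6),
    qf_kwargs=dict(hidden_sizes=[256, 256]),      # assumed MLP sizes
    policy_kwargs=dict(hidden_sizes=[256, 256]),  # assumed MLP sizes
    trainer_kwargs=dict(),
    algo_kwargs=dict(),
    eval_path_collector_kwargs=dict(),
    expl_path_collector_kwargs=dict(),
)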