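# These example launchers assume module-level imports for the names they
# reference (gym, numpy as np, torch, railrl.torch.pytorch_util as ptu, and
# the railrl / multiworld env, network, and algorithm classes); only some of
# the later examples import their dependencies locally.
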
def experiment(variant):
    imsize = variant['imsize']
    history = variant['history']
    env = gym.make(variant['env_id'])
    env = NormalizedBoxEnv(
        ImageEnv(env,
                 imsize=imsize,
                 keep_prev=history - 1,
                 init_viewer=variant['init_viewer']))
    es = GaussianStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = MergedCNN(input_width=imsize,
                    input_height=imsize,
                    output_size=1,
                    input_channels=history,
                    added_fc_input_size=action_dim,
                    **variant['cnn_params'])

    qf2 = MergedCNN(input_width=imsize,
                    input_height=imsize,
                    output_size=1,
                    input_channels=history,
                    added_fc_input_size=action_dim,
                    **variant['cnn_params'])

    policy = CNNPolicy(
        input_width=imsize,
        input_height=imsize,
        output_size=action_dim,
        input_channels=history,
        **variant['cnn_params'],
        output_activation=torch.tanh,
    )

    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(env,
                    qf1=qf1,
                    qf2=qf2,
                    policy=policy,
                    exploration_policy=exploration_policy,
                    policy_and_target_update_period=15,
                    policy_learning_rate=1e-5,
                    **variant['algo_kwargs'])
    """    algorithm = DDPG(
        env,
        qf=qf1,
        policy=policy,
#        qf_weight_decay=.01,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )"""

    algorithm.to(ptu.device)
    algorithm.train()
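
# A minimal, hypothetical variant for the TD3 image experiment above. The keys
# mirror the lookups inside the function; the values are illustrative
# placeholders, and cnn_params / algo_kwargs must match the kwargs accepted by
# MergedCNN / CNNPolicy and TD3 in the installed railrl version.
td3_image_variant = dict(
    imsize=84,
    history=3,
    env_id='Reacher-v2',   # any gym env id with a Box action space
    init_viewer=None,      # viewer-initialization hook forwarded to ImageEnv
    cnn_params=dict(),     # conv/fc architecture kwargs shared by the Q-nets and policy
    algo_kwargs=dict(),    # TD3 hyperparameters (batch size, num epochs, ...)
)
# experiment(td3_image_variant)
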
def experiment(variant):

    ptu.set_gpu_mode(True, 0)

    imsize = variant['imsize']

    env = ImageForkReacher2dEnv(variant["arm_goal_distance_cost_coeff"],
                                variant["arm_object_distance_cost_coeff"],
                                [imsize, imsize, 3],
                                goal_object_distance_cost_coeff=variant[
                                    "goal_object_distance_cost_coeff"],
                                ctrl_cost_coeff=variant["ctrl_cost_coeff"])

    partial_obs_size = env.obs_dim - imsize * imsize * 3
    print("partial dim was " + str(partial_obs_size))
    env = NormalizedBoxEnv(env)

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    qf1 = MergedCNN(input_width=imsize,
                    input_height=imsize,
                    output_size=1,
                    input_channels=3,
                    added_fc_input_size=action_dim,
                    **variant['cnn_params'])

    qf2 = MergedCNN(input_width=imsize,
                    input_height=imsize,
                    output_size=1,
                    input_channels=3,
                    added_fc_input_size=action_dim,
                    **variant['cnn_params'])

    vf = CNN(input_width=imsize,
             input_height=imsize,
             output_size=1,
             input_channels=3,
             **variant['cnn_params'])

    policy = TanhCNNGaussianPolicy(input_width=imsize,
                                   input_height=imsize,
                                   output_size=action_dim,
                                   input_channels=3,
                                   **variant['cnn_params'])

    algorithm = TwinSAC(env=env,
                        policy=policy,
                        qf1=qf1,
                        qf2=qf2,
                        vf=vf,
                        **variant['algo_params'])

    algorithm.to(ptu.device)
    algorithm.train()
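
# A hypothetical variant for the TwinSAC image-reacher experiment above; the
# cost coefficients and nested dicts are placeholders, not tuned values, and
# cnn_params / algo_params must match the installed railrl constructors.
twin_sac_reacher_variant = dict(
    imsize=64,
    arm_goal_distance_cost_coeff=1.0,
    arm_object_distance_cost_coeff=1.0,
    goal_object_distance_cost_coeff=1.0,
    ctrl_cost_coeff=0.1,
    cnn_params=dict(),   # kwargs for MergedCNN / CNN / TanhCNNGaussianPolicy
    algo_params=dict(),  # TwinSAC hyperparameters
)
# experiment(twin_sac_reacher_variant)
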
def experiment(variant):
    imsize = variant['imsize']
    history = variant['history']

    # Alternative envs:
    # env = InvertedDoublePendulumEnv()  # or gym.make(variant['env_id'])
    # env = SawyerXYZEnv()
    env = RandomGoalPusher2DEnv()
    partial_obs_size = env.obs_dim
    env = NormalizedBoxEnv(
        ImageMujocoWithObsEnv(env,
                              imsize=imsize,
                              keep_prev=history - 1,
                              init_camera=variant['init_camera']))
    # Alternative exploration strategy:
    # es = GaussianStrategy(action_space=env.action_space)
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    qf = MergedCNN(input_width=imsize,
                   input_height=imsize,
                   output_size=1,
                   input_channels=history,
                   added_fc_input_size=action_dim + partial_obs_size,
                   **variant['cnn_params'])

    policy = CNNPolicy(
        input_width=imsize,
        input_height=imsize,
        added_fc_input_size=partial_obs_size,
        output_size=action_dim,
        input_channels=history,
        **variant['cnn_params'],
        output_activation=torch.tanh,
    )

    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        #        qf_weight_decay=.01,
        exploration_policy=exploration_policy,
        **variant['algo_params'])

    algorithm.to(ptu.device)
    algorithm.train()
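
# A hypothetical variant for the DDPG pusher experiment above (placeholder
# values only; init_camera is whatever camera callback ImageMujocoWithObsEnv
# expects in the installed railrl version).
ddpg_pusher_variant = dict(
    imsize=64,
    history=1,
    init_camera=None,
    cnn_params=dict(),   # kwargs for MergedCNN / CNNPolicy
    algo_params=dict(),  # DDPG hyperparameters
)
# experiment(ddpg_pusher_variant)
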
def experiment(variant):
    imsize = variant['imsize']
    history = variant['history']

    env = Pusher2DEnv()  # or: gym.make(variant['env_id']).env
    env = NormalizedBoxEnv(
        ImageMujocoEnv(env,
                       imsize=imsize,
                       keep_prev=history - 1,
                       init_camera=variant['init_camera']))
    # Alternative exploration strategy:
    # es = GaussianStrategy(action_space=env.action_space)
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    qf = MergedCNN(input_width=imsize,
                   input_height=imsize,
                   output_size=1,
                   input_channels=history,
                   added_fc_input_size=action_dim,
                   **variant['cnn_params'])

    vf = CNN(input_width=imsize,
             input_height=imsize,
             output_size=1,
             input_channels=history,
             **variant['cnn_params'])

    policy = TanhCNNGaussianPolicy(input_width=imsize,
                                   input_height=imsize,
                                   output_size=action_dim,
                                   input_channels=history,
                                   **variant['cnn_params'])

    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )

    algorithm.to(ptu.device)
    algorithm.train()
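
# The SAC pusher experiment above reads the same keys as the DDPG example;
# a hypothetical variant with placeholder values:
sac_pusher_variant = dict(
    imsize=64,
    history=1,
    init_camera=None,
    cnn_params=dict(),   # kwargs for MergedCNN / CNN / TanhCNNGaussianPolicy
    algo_params=dict(),  # SoftActorCritic hyperparameters
)
# experiment(sac_pusher_variant)
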
def her_td3_experiment(variant):
    import multiworld.envs.mujoco
    import multiworld.envs.pygame
    import railrl.samplers.rollout_functions as rf
    import railrl.torch.pytorch_util as ptu
    from railrl.exploration_strategies.base import (
        PolicyWrappedWithExplorationStrategy)
    from railrl.exploration_strategies.epsilon_greedy import EpsilonGreedy
    from railrl.exploration_strategies.gaussian_strategy import GaussianStrategy
    from railrl.exploration_strategies.ou_strategy import OUStrategy
    from railrl.torch.grill.launcher import get_video_save_func
    from railrl.torch.her.her_td3 import HerTd3
    from railrl.data_management.obs_dict_replay_buffer import (
        ObsDictRelabelingBuffer)

    if 'env_id' in variant:
        env = gym.make(variant['env_id'])
    else:
        env = variant['env_class'](**variant['env_kwargs'])

    imsize = 84
    env = MujocoGymToMultiEnv(env.env)  # unwrap TimeLimit
    env = ImageEnv(env,
                   non_presampled_goal_img_is_garbage=True,
                   recompute_reward=False)

    observation_key = variant['observation_key']
    desired_goal_key = variant['desired_goal_key']
    variant['algo_kwargs']['her_kwargs']['observation_key'] = observation_key
    variant['algo_kwargs']['her_kwargs']['desired_goal_key'] = desired_goal_key
    if variant.get('normalize', False):
        raise NotImplementedError()

    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs'])
    obs_dim = env.observation_space.spaces[observation_key].low.size
    action_dim = env.action_space.low.size
    goal_dim = env.observation_space.spaces[desired_goal_key].low.size
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    else:
        raise Exception("Invalid type: " + exploration_type)

    use_images_for_q = variant["use_images_for_q"]
    use_images_for_pi = variant["use_images_for_pi"]

    qs = []
    for i in range(2):
        if use_images_for_q:
            image_q = MergedCNN(input_width=imsize,
                                input_height=imsize,
                                output_size=1,
                                input_channels=3,
                                added_fc_input_size=action_dim,
                                **variant['cnn_params'])
            q = ImageStateQ(image_q, None)
        else:
            state_q = FlattenMlp(input_size=action_dim + goal_dim,
                                 output_size=1,
                                 **variant['qf_kwargs'])
            q = ImageStateQ(None, state_q)
        qs.append(q)
    qf1, qf2 = qs

    if use_images_for_pi:
        image_policy = CNNPolicy(
            input_width=imsize,
            input_height=imsize,
            output_size=action_dim,
            input_channels=3,
            **variant['cnn_params'],
            output_activation=torch.tanh,
        )
        policy = ImageStatePolicy(image_policy, None)
    else:
        state_policy = TanhMlpPolicy(input_size=goal_dim,
                                     output_size=action_dim,
                                     **variant['policy_kwargs'])
        policy = ImageStatePolicy(None, state_policy)

    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = HerTd3(env,
                       qf1=qf1,
                       qf2=qf2,
                       policy=policy,
                       exploration_policy=exploration_policy,
                       replay_buffer=replay_buffer,
                       **variant['algo_kwargs'])
    if variant.get("save_video", False):
        rollout_function = rf.create_rollout_function(
            rf.multitask_rollout,
            max_path_length=algorithm.max_path_length,
            observation_key=algorithm.observation_key,
            desired_goal_key=algorithm.desired_goal_key,
        )
        video_func = get_video_save_func(
            rollout_function,
            env,
            policy,
            variant,
        )
        algorithm.post_epoch_funcs.append(video_func)
    algorithm.to(ptu.device)
    algorithm.train()
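
# A hypothetical variant for her_td3_experiment above. The keys follow the
# lookups in the function; every value is a placeholder. Note that
# desired_goal_key must contain the substring "desired" (the achieved-goal key
# is derived from it), observation_key / desired_goal_key must name entries of
# the wrapped env's observation dict, and the nested her_kwargs dict must
# exist because the function writes into it.
her_td3_variant = dict(
    env_id='Reacher-v2',        # or env_class=... with env_kwargs=dict(...)
    observation_key='state_observation',      # placeholder key name
    desired_goal_key='state_desired_goal',    # placeholder key name
    exploration_type='ou',      # 'ou', 'gaussian', or 'epsilon'
    es_kwargs=dict(),
    use_images_for_q=False,
    use_images_for_pi=False,
    qf_kwargs=dict(),           # FlattenMlp kwargs (e.g. hidden_sizes)
    policy_kwargs=dict(),       # TanhMlpPolicy kwargs
    cnn_params=dict(),          # only used when the image networks are enabled
    replay_buffer_kwargs=dict(),
    algo_kwargs=dict(her_kwargs=dict()),
    save_video=False,
)
# her_td3_experiment(her_td3_variant)
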
def HER_baseline_td3_experiment(variant):
    import railrl.torch.pytorch_util as ptu
    from railrl.data_management.obs_dict_replay_buffer import \
        ObsDictRelabelingBuffer
    from railrl.exploration_strategies.base import (
        PolicyWrappedWithExplorationStrategy)
    from railrl.torch.her.her_td3 import HerTd3
    from railrl.torch.networks import MergedCNN, CNNPolicy
    import torch
    from multiworld.core.image_env import ImageEnv
    from railrl.misc.asset_loader import load_local_or_remote_file

    init_camera = variant.get("init_camera", None)
    presample_goals = variant.get('presample_goals', False)
    presampled_goals_path = get_presampled_goals_path(
        variant.get('presampled_goals_path', None))

    if 'env_id' in variant:
        import gym
        import multiworld
        multiworld.register_all_envs()
        env = gym.make(variant['env_id'])
    else:
        env = variant["env_class"](**variant['env_kwargs'])
    image_env = ImageEnv(
        env,
        variant.get('imsize'),
        reward_type='image_sparse',
        init_camera=init_camera,
        transpose=True,
        normalize=True,
    )
    if presample_goals:
        if presampled_goals_path is None:
            image_env.non_presampled_goal_img_is_garbage = True
            presampled_goals = variant['generate_goal_dataset_fctn'](
                env=image_env, **variant['goal_generation_kwargs'])
        else:
            presampled_goals = load_local_or_remote_file(
                presampled_goals_path).item()
        del image_env
        env = ImageEnv(
            env,
            variant.get('imsize'),
            reward_type='image_distance',
            init_camera=init_camera,
            transpose=True,
            normalize=True,
            presampled_goals=presampled_goals,
        )
    else:
        env = image_env

    es = get_exploration_strategy(variant, env)

    observation_key = variant.get('observation_key', 'image_observation')
    desired_goal_key = variant.get('desired_goal_key', 'image_desired_goal')
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    imsize = variant['imsize']
    action_dim = env.action_space.low.size
    qf1 = MergedCNN(input_width=imsize,
                    input_height=imsize,
                    output_size=1,
                    input_channels=3 * 2,
                    added_fc_input_size=action_dim,
                    **variant['cnn_params'])
    qf2 = MergedCNN(input_width=imsize,
                    input_height=imsize,
                    output_size=1,
                    input_channels=3 * 2,
                    added_fc_input_size=action_dim,
                    **variant['cnn_params'])

    policy = CNNPolicy(
        input_width=imsize,
        input_height=imsize,
        added_fc_input_size=0,
        output_size=action_dim,
        input_channels=3 * 2,
        output_activation=torch.tanh,
        **variant['cnn_params'],
    )
    target_qf1 = MergedCNN(input_width=imsize,
                           input_height=imsize,
                           output_size=1,
                           input_channels=3 * 2,
                           added_fc_input_size=action_dim,
                           **variant['cnn_params'])
    target_qf2 = MergedCNN(input_width=imsize,
                           input_height=imsize,
                           output_size=1,
                           input_channels=3 * 2,
                           added_fc_input_size=action_dim,
                           **variant['cnn_params'])

    target_policy = CNNPolicy(
        input_width=imsize,
        input_height=imsize,
        added_fc_input_size=0,
        output_size=action_dim,
        input_channels=3 * 2,
        output_activation=torch.tanh,
        **variant['cnn_params'],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs'])
    algo_kwargs = variant['algo_kwargs']
    algo_kwargs['replay_buffer'] = replay_buffer
    base_kwargs = algo_kwargs['base_kwargs']
    base_kwargs['training_env'] = env
    base_kwargs['render'] = variant["render"]
    base_kwargs['render_during_eval'] = variant["render"]
    her_kwargs = algo_kwargs['her_kwargs']
    her_kwargs['observation_key'] = observation_key
    her_kwargs['desired_goal_key'] = desired_goal_key
    algorithm = HerTd3(env,
                       qf1=qf1,
                       qf2=qf2,
                       policy=policy,
                       target_qf1=target_qf1,
                       target_qf2=target_qf2,
                       target_policy=target_policy,
                       exploration_policy=exploration_policy,
                       **variant['algo_kwargs'])

    algorithm.to(ptu.device)
    algorithm.train()
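
# A hypothetical variant for HER_baseline_td3_experiment above; values are
# placeholders, and the nested base_kwargs / her_kwargs dicts must exist
# because the function writes into them before unpacking algo_kwargs.
# get_exploration_strategy may read additional keys (e.g. the exploration
# type and noise) that are not shown here.
her_baseline_variant = dict(
    env_id='<registered multiworld env id>',  # or env_class=... / env_kwargs=dict(...)
    imsize=48,
    init_camera=None,
    presample_goals=False,
    presampled_goals_path=None,
    observation_key='image_observation',
    desired_goal_key='image_desired_goal',
    cnn_params=dict(),            # MergedCNN / CNNPolicy kwargs
    replay_buffer_kwargs=dict(),
    algo_kwargs=dict(base_kwargs=dict(), her_kwargs=dict()),
    render=False,
)
# HER_baseline_td3_experiment(her_baseline_variant)
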