Example #1
def __init__(
        self,
        env,
        qf,
        exploration_policy,
        ddpg_kwargs,
        tdm_kwargs,
        base_kwargs,
        policy=None,
        replay_buffer=None,
):
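    # Initialize the DDPG parent directly with its own kwargs, then pass the
    # TDM-specific kwargs up the MRO via super().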
    DDPG.__init__(
        self,
        env=env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **ddpg_kwargs,
        **base_kwargs
    )
    super().__init__(**tdm_kwargs)
    # Not supporting these in this implementation
    assert self.qf_weight_decay == 0
    assert self.residual_gradient_weight == 0
Example #2
def experiment(variant):
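    # Build the configured env (optionally flattening a multitask env); the
    # Q-function and DDPG share fixed observation/action normalizers, the
    # policy shares the observation normalizer, and exploration uses OU noise.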
    env = variant['env_class'](**variant['env_kwargs'])
    env = NormalizedBoxEnv(env, **variant['normalize_kwargs'])
    if variant['multitask']:
        env = MultitaskToFlatEnv(env)
    es = OUStrategy(action_space=env.action_space, **variant['ou_kwargs'])
    obs_dim = int(env.observation_space.flat_dim)
    action_dim = int(env.action_space.flat_dim)
    obs_normalizer = TorchFixedNormalizer(obs_dim)
    action_normalizer = TorchFixedNormalizer(action_dim)
    qf = MlpQf(input_size=obs_dim + action_dim,
               output_size=1,
               obs_normalizer=obs_normalizer,
               action_normalizer=action_normalizer,
               **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           obs_normalizer=obs_normalizer,
                           **variant['policy_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env,
                     qf,
                     policy,
                     exploration_policy,
                     obs_normalizer=obs_normalizer,
                     action_normalizer=action_normalizer,
                     **variant['algo_kwargs'])
    algorithm.train()
Example #3
def experiment(variant):
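    # Minimal gym-based setup: a normalized Box env, Gaussian exploration
    # noise with default parameters, and 128x128 MLPs for the Q-function and
    # tanh policy.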
    env = gym.make(variant['env_id'])
    env = NormalizedBoxEnv(env)
    es = GaussianStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    hidden_sizes=[128, 128])
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[128, 128],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env,
                     qf=qf,
                     policy=policy,
                     exploration_policy=exploration_policy,
                     **variant['algo_params'])

    algorithm.to(ptu.device)
    algorithm.train()
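Each experiment function on this page is driven by a plain variant dictionary. A minimal sketch for the gym-based example above might look like the following; the environment id and the algo_params values are illustrative assumptions, with the key names taken from the parameters referenced in the other examples on this page.

# Hypothetical variant for the gym-based experiment above.
variant = dict(
    env_id='HalfCheetah-v2',  # any continuous-control gym id (illustrative)
    algo_params=dict(
        max_path_length=100,
        num_steps_per_epoch=1000,
        num_steps_per_eval=1000,
    ),
)
experiment(variant)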
Example #4
def example(variant):
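    # HalfCheetah with optional normalization, OU exploration, and small
    # (32x32) feed-forward Q-function and policy networks.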
    env = HalfCheetahEnv()
    if variant['normalize']:
        env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env,
                     qf=qf,
                     policy=policy,
                     exploration_policy=exploration_policy,
                     **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #5
def experiment(variant):
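    # Sawyer XYZ reaching task with OU exploration and 400x300 MLPs for the
    # Q-function and tanh policy; moves the algorithm to the GPU if available.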
    env_params = variant['env_params']
    env = SawyerXYZReachingEnv(**env_params)
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env,
                     qf=qf,
                     policy=policy,
                     exploration_policy=exploration_policy,
                     **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example #6
def experiment(variant):
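    # Image-based variant: the pusher env is wrapped to return stacked camera
    # frames plus the low-dimensional observation, so the Q-function is a CNN
    # that also consumes the action and partial state as extra FC input, and
    # the policy is a CNN with a tanh output activation.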
    imsize = variant['imsize']
    history = variant['history']

    #env = InvertedDoublePendulumEnv()#gym.make(variant['env_id'])
    #    env = SawyerXYZEnv()
    env = RandomGoalPusher2DEnv()
    partial_obs_size = env.obs_dim
    env = NormalizedBoxEnv(
        ImageMujocoWithObsEnv(env,
                              imsize=imsize,
                              keep_prev=history - 1,
                              init_camera=variant['init_camera']))
    #    es = GaussianStrategy(
    #        action_space=env.action_space,
    #    )
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    qf = MergedCNN(input_width=imsize,
                   input_height=imsize,
                   output_size=1,
                   input_channels=history,
                   added_fc_input_size=action_dim + partial_obs_size,
                   **variant['cnn_params'])

    policy = CNNPolicy(
        input_width=imsize,
        input_height=imsize,
        added_fc_input_size=partial_obs_size,
        output_size=action_dim,
        input_channels=history,
        **variant['cnn_params'],
        output_activation=torch.tanh,
    )

    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        #        qf_weight_decay=.01,
        exploration_policy=exploration_policy,
        **variant['algo_params'])

    algorithm.to(ptu.device)
    algorithm.train()
Example #7
def experiment(variant):
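    # Select the exploration strategy (OU, constant-sigma Gaussian, or
    # epsilon-greedy) from the variant, then build 400x300 MLPs for the
    # Q-function and tanh policy.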
    env = variant['env_class'](**variant['env_kwargs'])
    env = MultitaskToFlatEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #8
def experiment(variant):
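    # MultiGoal toy env with render_eval_paths=True and a QFPolicyPlotter that
    # visualizes Q-values of the wrapped exploration policy at a few fixed
    # observations during training.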
    env = NormalizedBoxEnv(
        MultiGoalEnv(
            actuation_cost_coeff=10,
            distance_cost_coeff=1,
            goal_reward=10,
        ))

    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    plotter = QFPolicyPlotter(
        qf=qf,
        # policy=policy,
        policy=exploration_policy,
        obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]]),
        default_action=[np.nan, np.nan],
        n_samples=100)
    algorithm = DDPG(env,
                     qf=qf,
                     policy=policy,
                     exploration_policy=exploration_policy,
                     render_eval_paths=True,
                     plotter=plotter,
                     **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #9
def example(variant):
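    # Fully class-configurable setup: the env, exploration strategy, and
    # policy classes all come from the variant; the Q-function is a fixed
    # 100x100 FeedForwardQFunction.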
    env_class = variant['env_class']
    env_params = variant['env_params']
    env = env_class(**env_params)
    env = normalize(env)  # normalize returns a wrapped env; keep the wrapper
    es_class = variant['es_class']
    es_params = dict(action_space=env.action_space, **variant['es_params'])
    use_gpu = variant['use_gpu']
    es = es_class(**es_params)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    policy_class = variant['policy_class']
    policy_params = dict(
        obs_dim=get_dim(env.observation_space),
        action_dim=get_dim(env.action_space),
        fc1_size=100,
        fc2_size=100,
    )
    policy = policy_class(**policy_params)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params'],
    )
    if use_gpu and ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example #10
def experiment(variant):
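    # Exploration rollouts use a pre-trained policy loaded from
    # GOOD_DDPG_POLICY_PATH (plus OU noise), while a fresh TanhMlpPolicy and
    # Q-function are trained by DDPG.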
    data = joblib.load(GOOD_DDPG_POLICY_PATH)
    expert_policy = data['policy']

    env = NormalizedBoxEnv(variant['env_class']())
    es = OUStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=expert_policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        expert_policy.to(ptu.device)
        algorithm.to(ptu.device)
    algorithm.train()
Example #11
def experiment(variant):
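    # Sawyer XYZ reaching with OU exploration; variant['hidden_sizes'] is a
    # single layer width reused for both hidden layers, and shorter epochs are
    # used when the env runs in relative position-control mode.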
    env_params = variant['env_params']
    es_params = variant['es_params']
    env = SawyerXYZReachingEnv(**env_params)
    es = OUStrategy(action_space=env.action_space, **es_params)
    hidden_sizes = variant['hidden_sizes']

    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[hidden_sizes, hidden_sizes],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[hidden_sizes, hidden_sizes],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    if variant['env_params']['relative_pos_control']:
        variant['algo_params']['max_path_length'] = 3
        variant['algo_params']['num_steps_per_epoch'] = 15
        variant['algo_params']['num_steps_per_eval'] = 15
    algorithm = DDPG(env,
                     qf=qf,
                     policy=policy,
                     exploration_policy=exploration_policy,
                     **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example #12
def evaluate(self, epoch):
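    # Delegate evaluation to the DDPG parent implementation.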
    DDPG.evaluate(self, epoch)