Example #1
def experiment(variant, data):
    # make a new env; reloading with data['evaluation/env'] seems to cause a bug
    eval_env = gym.make("panda-v0", headless=variant["headless"])
    eval_env.seed(variant['seed'])
    expl_env = eval_env

    qf1 = data['trainer/qf1']
    qf2 = data['trainer/qf2']
    target_qf1 = data['trainer/target_qf1']
    target_qf2 = data['trainer/target_qf2']
    policy = data['trainer/policy']
    eval_policy = data["evaluation/policy"]
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMDPPathCollector(eval_env)
    buffer_filename = None
    if variant['buffer_filename'] is not None:
        buffer_filename = variant['buffer_filename']

    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    if variant['load_buffer'] and buffer_filename is not None:
        replay_buffer.load_buffer(buffer_filename)
    else:
        dataset = get_dataset(variant["h5path"], eval_env)
        load_hdf5(d4rl.qlearning_dataset(eval_env, dataset), replay_buffer)

    trainer = CQLTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=True,
        batch_rl=variant['load_buffer'],
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train(start_epoch=variant["start_epoch"])
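This example resumes offline CQL training from a saved rlkit snapshot: data is a dict of pre-trained networks keyed like 'trainer/qf1'. A minimal hypothetical driver might look as follows; the snapshot path, the panda-v0 registration, and every value below are assumptions, not part of the original example.

import torch

# Hypothetical driver; snapshot path and hyperparameter values are assumptions.
data = torch.load('itr_500.pkl', map_location='cpu')
variant = dict(
    headless=True,
    seed=0,
    buffer_filename=None,
    load_buffer=False,
    h5path='offline_data.hdf5',
    replay_buffer_size=int(1e6),
    start_epoch=501,
    trainer_kwargs=dict(discount=0.99),
    algorithm_kwargs=dict(
        max_path_length=1000,
        batch_size=256,
        num_epochs=1000,
        num_eval_steps_per_epoch=1000,
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop=1000,
        min_num_steps_before_training=0,
    ),
)
experiment(variant, data)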
Example #2
def experiment(variant):
    expl_env = gym.make("CartPole-v0")
    eval_env = gym.make("CartPole-v0")
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    qf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=action_dim)
    target_qf = Mlp(hidden_sizes=[32, 32],
                    input_size=obs_dim,
                    output_size=action_dim)
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space), eval_policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant["trainer_kwargs"])
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
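A minimal driver for this DQN example, modeled on the variant used in rlkit's bundled dqn_and_double_dqn example; the exact values here are illustrative assumptions.

import rlkit.torch.pytorch_util as ptu
from rlkit.launchers.launcher_util import setup_logger

if __name__ == "__main__":
    # Illustrative hyperparameters, not taken from the example above.
    variant = dict(
        algorithm_kwargs=dict(
            num_epochs=100,
            num_eval_steps_per_epoch=1000,
            num_trains_per_train_loop=1000,
            num_expl_steps_per_train_loop=1000,
            min_num_steps_before_training=1000,
            max_path_length=200,  # CartPole-v0 caps episodes at 200 steps
            batch_size=128,
        ),
        trainer_kwargs=dict(
            discount=0.99,
            learning_rate=3e-4,
        ),
        replay_buffer_size=int(1e6),
    )
    setup_logger('dqn-cartpole', variant=variant)
    # ptu.set_gpu_mode(True)  # optionally train on GPU
    experiment(variant)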
Example #3
def experiment(variant):
    from rlkit.envs.gym_minigrid.gym_minigrid import envs  # noqa: F401 -- presumably imported for its env-registration side effect

    expl_env = ToolsEnv(**variant['env_kwargs'])
    eval_env = ToolsEnv(**variant['env_kwargs'])

    rollout_env = ToolsEnv(**variant['env_kwargs'])

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    layer_size = variant['algo_kwargs']['layer_size']
    lifetime = variant['env_kwargs'].get('time_horizon', 0) == 0
    if lifetime:
        assert eval_env.time_horizon == 0, 'cannot have time horizon for lifetime env'

    qf = gen_network(variant['algo_kwargs'], action_dim, layer_size)
    target_qf = gen_network(variant['algo_kwargs'], action_dim, layer_size)

    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    # eval_policy = SoftmaxQPolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedyDecay(expl_env.action_space, 1e-5, 1, 0.1),
        eval_policy,
    )
    if lifetime:
        eval_policy = expl_policy
    # expl_policy = PolicyWrappedWithExplorationStrategy(
    #     EpsilonGreedy(expl_env.action_space, 0.5),
    #     eval_policy,
    # )
    if eval_env.time_horizon == 0:
        collector_class = LifetimeMdpPathCollector if lifetime else MdpPathCollector
    else:
        collector_class = MdpPathCollectorConfig
    eval_path_collector = collector_class(
        eval_env,
        eval_policy,
        # render=True
    )
    expl_path_collector = collector_class(expl_env, expl_policy)
    trainer = DoubleDQNTrainer(qf=qf,
                               target_qf=target_qf,
                               qf_criterion=qf_criterion,
                               **variant['algo_kwargs']['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['algo_kwargs']['replay_buffer_size'], expl_env)
    #algo_class = TorchLifetimeRLAlgorithm if lifetime else TorchBatchRLAlgorithm
    algo_class = TorchHumanInputLifetimeRLAlgorithm
    algorithm = algo_class(trainer=trainer,
                           exploration_env=expl_env,
                           evaluation_env=eval_env,
                           rollout_env=rollout_env,
                           exploration_data_collector=expl_path_collector,
                           evaluation_data_collector=eval_path_collector,
                           replay_buffer=replay_buffer,
                           **variant['algo_kwargs']['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #4
def experiment(variant):
    from traffic.make_env import make_env
    # `args` is assumed to be defined at module scope (e.g. parsed in __main__)
    expl_env = make_env(args.exp_name)
    eval_env = make_env(args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    module = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    policy = SoftmaxPolicy(module, **variant['policy_kwargs'])
    qf1 = Mlp(input_size=obs_dim,
              output_size=action_dim,
              **variant['qf_kwargs'])
    target_qf1 = copy.deepcopy(qf1)
    qf2 = Mlp(input_size=obs_dim,
              output_size=action_dim,
              **variant['qf_kwargs'])
    target_qf2 = copy.deepcopy(qf2)

    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    qf_criterion = nn.MSELoss()
    trainer = SACDiscreteTrainer(env=eval_env,
                                 policy=policy,
                                 qf1=qf1,
                                 qf2=qf2,
                                 target_qf1=target_qf1,
                                 target_qf2=target_qf2,
                                 qf_criterion=qf_criterion,
                                 **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #5
def experiment(variant):
    '''
    1. Build the experiment environments (eval, expl)
    2. Determine the input/output dimensions and build the qf and policy networks
    3. Deep-copy them to create the target qf and target policy
    4. Build a path collector for evaluation
    5. For exploration, build the exploration strategy, path collector, and replay buffer
    6. Build the DDPGTrainer (qf, policy)
    7. Build the algorithm (trainer, env, replay buffer, path collectors, and the evaluation parts)
    8. Start training
    :param variant: config parameters
    :return:
    '''
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    # create the target networks via deepcopy
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    # evaluation
    eval_path_collector = MdpPathCollector(eval_env, policy)
    # exploration (exploration strategy, path collector, replay buffer)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)

    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    # move the networks to the configured device
    algorithm.to(ptu.device)

    algorithm.train()
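For completeness, a sketch of the variant this DDPG experiment expects, modeled on rlkit's ddpg example; the values are assumptions.

from rlkit.launchers.launcher_util import setup_logger

if __name__ == "__main__":
    # Illustrative config; every value is an assumption.
    variant = dict(
        algorithm_kwargs=dict(
            num_epochs=1000,
            num_eval_steps_per_epoch=1000,
            num_trains_per_train_loop=1000,
            num_expl_steps_per_train_loop=1000,
            min_num_steps_before_training=1000,
            max_path_length=1000,
            batch_size=128,
        ),
        trainer_kwargs=dict(
            use_soft_update=True,
            tau=1e-2,
            discount=0.99,
            qf_learning_rate=1e-3,
            policy_learning_rate=1e-4,
        ),
        qf_kwargs=dict(hidden_sizes=[400, 300]),
        policy_kwargs=dict(hidden_sizes=[400, 300]),
        replay_buffer_size=int(1e6),
    )
    setup_logger('ddpg-half-cheetah', variant=variant)
    experiment(variant)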
Example #6
def experiment(variant):
    env = Point2DEnv(**variant['env_kwargs'])
    env = FlatGoalEnv(env)
    env = NormalizedBoxEnv(env)

    action_dim = int(np.prod(env.action_space.shape))
    obs_dim = int(np.prod(env.observation_space.shape))

    qf1 = ConcatMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    qf2 = ConcatMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    target_qf1 = ConcatMlp(input_size=obs_dim + action_dim,
                           output_size=1,
                           **variant['qf_kwargs'])
    target_qf2 = ConcatMlp(input_size=obs_dim + action_dim,
                           output_size=1,
                           **variant['qf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                action_dim=action_dim,
                                **variant['policy_kwargs'])
    eval_env = expl_env = env

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = TwinSACTrainer(env=eval_env,
                             policy=policy,
                             qf1=qf1,
                             qf2=qf2,
                             target_qf1=target_qf1,
                             target_qf2=target_qf2,
                             **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #7
def experiment(variant):
    # Note: obs_dim, channels, action_dim, symbolic_action_space, symb_env,
    # expl_env and eval_env are assumed to be defined at module scope.

    qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=action_dim,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    target_qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=action_dim,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    qf_criterion = nn.MSELoss()
    eval_learner_policy = ArgmaxDiscretePolicy(qf)
    expl_learner_policy = PolicyWrappedWithExplorationStrategy(
        AnnealedEpsilonGreedy(symbolic_action_space,
                              anneal_rate=variant["anneal_rate"]),
        eval_learner_policy,
    )
    eval_policy = LearnPlanPolicy(eval_learner_policy)
    expl_policy = LearnPlanPolicy(expl_learner_policy)
    eval_path_collector = MdpPathCollector(eval_env,
                                           eval_policy,
                                           rollout=hierarchical_rollout)
    expl_path_collector = MdpPathCollector(expl_env,
                                           expl_policy,
                                           rollout=hierarchical_rollout)
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant["trainer_kwargs"])
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], symb_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
Example #8
def experiment(variant):
    # Select a different success_function for different tasks.
    expl_env = GymCraftingEnv(state_obs=True,
                              few_obj=True,
                              success_function=eval_eatbread)
    eval_env = GymCraftingEnv(state_obs=True,
                              few_obj=True,
                              success_function=eval_eatbread)
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    target_qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #9
def experiment(variant):

    expl_env = get_env()
    eval_env = get_env()

    M = variant['layer_size']
    trainer = get_sac_model(env=eval_env, hidden_sizes=[M, M])
    policy = trainer.policy
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    columns = ['Epoch', 'mean', 'std']
    eval_result = pd.DataFrame(columns=columns)
    eval_output_csv = os.path.join(variant['log_dir'], 'eval_result.csv')

    def post_epoch_func(self, epoch):
        nonlocal eval_result
        print('-------------post_epoch_func start-------------')
        eval_result = my_eval_policy(
            env=get_env(),
            algorithm=self,
            epoch=epoch,
            eval_result=eval_result,
            output_csv=eval_output_csv,
        )
        print('-------------post_epoch_func done-------------')

    algorithm.post_epoch_funcs = [
        post_epoch_func,
    ]
    algorithm.to(ptu.device)
    algorithm.train()
Example #10
def run_sac(base_expl_env, base_eval_env, variant):
    expl_env = FlatGoalEnv(base_expl_env, append_goal_to_obs=True)
    eval_env = FlatGoalEnv(base_eval_env, append_goal_to_obs=True)
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant["layer_size"]
    num_hidden = variant["num_hidden_layers"]
    qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     hidden_sizes=[M] * num_hidden)
    qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     hidden_sizes=[M] * num_hidden)
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            hidden_sizes=[M] * num_hidden)
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            hidden_sizes=[M] * num_hidden)
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                action_dim=action_dim,
                                hidden_sizes=[M] * num_hidden)
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant["replay_buffer_size"],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant["trainer_kwargs"])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
Example #11
def experiment(variant):
    from traffic.make_env import make_env
    # `args` is assumed to be defined at module scope (e.g. parsed in __main__)
    expl_env = make_env(args.exp_name)
    eval_env = make_env(args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    gb = TrafficGraphBuilder(input_dim=4,
                             ego_init=torch.tensor([0., 1.]),
                             other_init=torch.tensor([1., 0.]),
                             edge_index=torch.tensor([[0, 0, 1, 2],
                                                      [1, 2, 0, 0]]))
    qf = GNNNet(pre_graph_builder=gb,
                node_dim=16,
                output_dim=action_dim,
                post_mlp_kwargs=variant['qf_kwargs'],
                num_conv_layers=3)

    target_qf = copy.deepcopy(qf)
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space, variant['epsilon']),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    qf_criterion = nn.MSELoss()
    trainer = DoubleDQNTrainer(qf=qf,
                               target_qf=target_qf,
                               qf_criterion=qf_criterion,
                               **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #12
def experiment(variant):
    args = getArgs()
    # expl_env = NormalizedBoxEnv(environment(args))

    expl_env = environment(args, 'dqn')
    eval_env = environment(args, 'dqn')
    # expl_env.render()
    obs_dim = expl_env.get_obsdim()
    action_dim = expl_env.action_space.n

    qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    target_qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #13
def experiment(variant):
    from traffic.make_env import make_env
    # `args` is assumed to be defined at module scope (e.g. parsed in __main__)
    expl_env = make_env(args.exp_name)
    eval_env = make_env(args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    qf = Mlp(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['qf_kwargs']
    )
    target_qf = copy.deepcopy(qf)
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space, variant['epsilon']),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    qf_criterion = nn.MSELoss()
    trainer = DoubleDQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant['trainer_kwargs']
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #14
def experiment(variant):
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant["qf_kwargs"])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant["qf_kwargs"])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant["qf_kwargs"])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant["qf_kwargs"])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant["policy_kwargs"])
    target_policy = TanhMlpPolicy(input_size=obs_dim,
                                  output_size=action_dim,
                                  **variant["policy_kwargs"])
    es = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es, policy=policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    trainer = TD3Trainer(policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         target_policy=target_policy,
                         **variant["trainer_kwargs"])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
Example #15
def get_replay_buffer(variant, expl_env):
    """
    Define replay buffer specific to the mode
    """
    mode = variant["mode"]
    if mode in ["vanilla", "icm"]:
        replay_buffer = EnvReplayBuffer(
            env=expl_env, **variant["replay_buffer_kwargs"],
        )
    elif mode in ["her", "her+icm"]:
        replay_buffer = ObsDictRelabelingBuffer(
            env=expl_env, **variant["her"], **variant["replay_buffer_kwargs"]
        )
    else:
        raise ValueError(f"Unknown mode: {mode}")

    return replay_buffer
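A hypothetical call site for get_replay_buffer; expl_env is assumed to be a gym env created elsewhere, and the vanilla-mode kwargs shown are illustrative. (The her modes additionally require the HER-specific keys under variant['her'], such as the observation/goal keys and relabeling fractions.)

# Hypothetical usage; buffer size is an illustrative assumption.
variant = dict(
    mode="vanilla",
    replay_buffer_kwargs=dict(max_replay_buffer_size=int(1e6)),
)
replay_buffer = get_replay_buffer(variant, expl_env)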
Example #16
def experiment(variant):
    from cartpole import CartPoleEnv
    from rlkit.envs.wrappers import ProbDiscreteEnv
    expl_env = ProbDiscreteEnv(CartPoleEnv(mode=2))
    eval_env = ProbDiscreteEnv(CartPoleEnv(mode=2))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    # import gym
    # from rlkit.envs.wrappers import ProbDiscreteEnv
    # expl_env = ProbDiscreteEnv(gym.make('CartPole-v0'))
    # eval_env = ProbDiscreteEnv(gym.make('CartPole-v0'))
    # obs_dim = eval_env.observation_space.low.size
    # action_dim = eval_env.action_space.low.size

    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = SoftmaxMlpPolicy(input_size=obs_dim,
                              output_size=action_dim,
                              **variant['policy_kwargs'])
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    # no exploration-strategy wrapper: the action here must be a probability vector
    # exploration_policy = PolicyWrappedWithExplorationStrategy(
    #     exploration_strategy=OUStrategy(action_space=expl_env.action_space),
    #     policy=policy,
    # )
    exploration_policy = policy
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #17
def experiment(variant):
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #18
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = NormalizedBoxEnv(CartPoleEnv(mode=0))
    eval_env = NormalizedBoxEnv(CartPoleEnv(mode=0))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #19
def experiment(variant):
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant["layer_size"]
    qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     hidden_sizes=[M, M])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     hidden_sizes=[M, M])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            hidden_sizes=[M, M])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            hidden_sizes=[M, M])
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                action_dim=action_dim,
                                hidden_sizes=[M, M])
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, policy)
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant["trainer_kwargs"])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
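A minimal driver for this SAC example, modeled on rlkit's sac example; the values are assumptions.

from rlkit.launchers.launcher_util import setup_logger
import rlkit.torch.pytorch_util as ptu

if __name__ == "__main__":
    # Illustrative config; every value is an assumption.
    variant = dict(
        layer_size=256,
        replay_buffer_size=int(1e6),
        algorithm_kwargs=dict(
            num_epochs=3000,
            num_eval_steps_per_epoch=5000,
            num_trains_per_train_loop=1000,
            num_expl_steps_per_train_loop=1000,
            min_num_steps_before_training=1000,
            max_path_length=1000,
            batch_size=256,
        ),
        trainer_kwargs=dict(
            discount=0.99,
            soft_target_tau=5e-3,
            target_update_period=1,
            policy_lr=3e-4,
            qf_lr=3e-4,
            reward_scale=1,
            use_automatic_entropy_tuning=True,
        ),
    )
    setup_logger('sac-half-cheetah', variant=variant)
    # ptu.set_gpu_mode(True)  # optionally train on GPU
    experiment(variant)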
Example #20
def experiment(variant):
    # start the ROS services once before creating the train/eval envs
    gym.make('RLkitUR-v0')._start_ros_services()
    eval_env = gym.make('RLkitUR-v0')
    expl_env = gym.make('RLkitUR-v0')
    eval_env = NormalizedBoxEnv(eval_env)
    expl_env = NormalizedBoxEnv(expl_env)

    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    print("obs_dim: ", obs_dim)
    print("action_dim: ", action_dim)
    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #21
def experiment(variant):
    base_expl_env = PointMassEnv(n=variant["num_tasks"],
                                 reward_type=variant["reward_type"])
    expl_env = FlatGoalEnv(base_expl_env, append_goal_to_obs=True)

    base_eval_env = PointMassEnv(n=variant["num_tasks"],
                                 reward_type=variant["reward_type"])
    eval_env = FlatGoalEnv(base_eval_env, append_goal_to_obs=True)
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    print(expl_env.observation_space, expl_env.action_space)
    qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant['qf_kwargs'])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant['qf_kwargs'])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant['qf_kwargs'])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    target_policy = TanhMlpPolicy(input_size=obs_dim,
                                  output_size=action_dim,
                                  **variant['policy_kwargs'])
    es = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        exploration_policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = TD3Trainer(policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         target_policy=target_policy,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
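A sketch of the variant this TD3 point-mass experiment expects; num_tasks and reward_type feed PointMassEnv, and all values are assumptions modeled on rlkit's td3 example.

from rlkit.launchers.launcher_util import setup_logger

if __name__ == "__main__":
    variant = dict(
        num_tasks=4,          # illustrative
        reward_type='dense',  # illustrative
        qf_kwargs=dict(hidden_sizes=[400, 300]),
        policy_kwargs=dict(hidden_sizes=[400, 300]),
        trainer_kwargs=dict(discount=0.99),
        replay_buffer_size=int(1e6),
        algorithm_kwargs=dict(
            num_epochs=1000,
            num_eval_steps_per_epoch=1000,
            num_trains_per_train_loop=1000,
            num_expl_steps_per_train_loop=1000,
            min_num_steps_before_training=1000,
            max_path_length=100,
            batch_size=256,
        ),
    )
    setup_logger('td3-pointmass', variant=variant)
    experiment(variant)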
Example #22
def experiment(variant):
    env_params = ENV_PARAMS[variant['env']]
    env_mod_params = variant['env_mod']
    variant.update(env_params)

    expl_env = NormalizedBoxEnv(variant['env_class'](env_mod_params))
    eval_env = NormalizedBoxEnv(variant['env_class']({}))
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant[
                'num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant[
                'min_num_steps_before_training'],
        )
    else:
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
        )
        algorithm = TorchBatchRLAlgorithmModEnv(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant[
                'num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant[
                'min_num_steps_before_training'],
            mod_env_epoch_schedule=variant['mod_env_epoch_schedule'],
            env_mod_dist=variant['mod_env_dist'],
            env_class=variant['env_class'],
            env_mod_params=variant['env_mod'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #23
def experiment(variant):
    expl_env = make_env()
    eval_env = make_env()
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #24
def experiment(variant):
    # from softlearning.environments.gym import register_image_reach
    # register_image_reach()
    # env = gym.envs.make(
    #     'Pusher2d-ImageReach-v0',
    # )
    from softlearning.environments.gym.mujoco.image_pusher_2d import (
        ImageForkReacher2dEnv)

    env_kwargs = {
        'image_shape': (32, 32, 3),
        'arm_goal_distance_cost_coeff': 0.0,
        'arm_object_distance_cost_coeff': 1.0,
        'goal': (0, -1),
    }

    eval_env = ImageForkReacher2dEnv(**env_kwargs)
    expl_env = ImageForkReacher2dEnv(**env_kwargs)

    input_width, input_height, input_channels = eval_env.image_shape
    image_dim = input_width * input_height * input_channels

    action_dim = int(np.prod(eval_env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=input_width,
        input_height=input_height,
        input_channels=input_channels,
        added_fc_input_size=4,
        output_conv_channels=True,
        output_size=None,
    )
    non_image_dim = int(np.prod(eval_env.observation_space.shape)) - image_dim
    if variant['shared_qf_conv']:
        qf_cnn = CNN(**cnn_params)
        qf_obs_processor = nn.Sequential(
            Split(qf_cnn, identity, image_dim),
            FlattenEach(),
            ConcatTuple(),
        )

        qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
        qf_kwargs['obs_processor'] = qf_obs_processor
        qf_kwargs['output_size'] = 1
        qf_kwargs['input_size'] = (action_dim + qf_cnn.conv_output_flat_size +
                                   non_image_dim)
        qf1 = MlpQfWithObsProcessor(**qf_kwargs)
        qf2 = MlpQfWithObsProcessor(**qf_kwargs)

        target_qf_cnn = CNN(**cnn_params)
        target_qf_obs_processor = nn.Sequential(
            Split(target_qf_cnn, identity, image_dim),
            FlattenEach(),
            ConcatTuple(),
        )
        target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
        target_qf_kwargs['obs_processor'] = target_qf_obs_processor
        target_qf_kwargs['output_size'] = 1
        target_qf_kwargs['input_size'] = (action_dim +
                                          target_qf_cnn.conv_output_flat_size +
                                          non_image_dim)
        target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
        target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)
    else:
        qf1_cnn = CNN(**cnn_params)
        cnn_output_dim = qf1_cnn.conv_output_flat_size
        qf1 = MlpQfWithObsProcessor(obs_processor=qf1_cnn,
                                    output_size=1,
                                    input_size=action_dim + cnn_output_dim,
                                    **variant['qf_kwargs'])
        qf2 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params),
                                    output_size=1,
                                    input_size=action_dim + cnn_output_dim,
                                    **variant['qf_kwargs'])
        target_qf1 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params),
                                           output_size=1,
                                           input_size=action_dim +
                                           cnn_output_dim,
                                           **variant['qf_kwargs'])
        target_qf2 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params),
                                           output_size=1,
                                           input_size=action_dim +
                                           cnn_output_dim,
                                           **variant['qf_kwargs'])
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(
        Split(policy_cnn, identity, image_dim),
        FlattenEach(),
        ConcatTuple(),
    )
    policy = TanhGaussianPolicyAdapter(
        policy_obs_processor, policy_cnn.conv_output_flat_size + non_image_dim,
        action_dim, **variant['policy_kwargs'])

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env, eval_policy, **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(
            expl_env, policy, **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env, policy, **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #25
def experiment(variant):
    eval_env = gym.make(variant['env_name'])
    expl_env = eval_env

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = FlattenMlp_Dropout(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[
            M,
            M,
        ],
    )
    qf2 = FlattenMlp_Dropout(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[
            M,
            M,
        ],
    )
    target_qf1 = FlattenMlp_Dropout(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[
            M,
            M,
        ],
    )
    target_qf2 = FlattenMlp_Dropout(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[
            M,
            M,
        ],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[
            M,
            M,
        ],
    )
    vae_policy = VAEPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[750, 750],
        latent_dim=action_dim * 2,
    )
    eval_path_collector = CustomMDPPathCollector(eval_env)
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    buffer_filename = None
    if variant['buffer_filename'] is not None:
        buffer_filename = variant['buffer_filename']

    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    load_hdf5(eval_env.unwrapped.get_dataset(),
              replay_buffer,
              max_size=variant['replay_buffer_size'])

    trainer = UWACTrainer(env=eval_env,
                          policy=policy,
                          qf1=qf1,
                          qf2=qf2,
                          target_qf1=target_qf1,
                          target_qf2=target_qf2,
                          vae=vae_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        batch_rl=True,
        q_learning_alg=True,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
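This offline example relies on the d4rl API: eval_env.unwrapped.get_dataset() only exists on d4rl-registered tasks. A hypothetical driver, with illustrative values throughout:

import gym
import d4rl  # noqa: F401 -- importing d4rl registers the offline envs with gym

# Hypothetical driver; hyperparameter values are assumptions.
variant = dict(
    env_name='halfcheetah-medium-v0',  # any d4rl task id
    layer_size=256,
    buffer_filename=None,
    replay_buffer_size=int(2e6),
    trainer_kwargs=dict(discount=0.99),
    algorithm_kwargs=dict(
        max_path_length=1000,
        batch_size=256,
        num_epochs=1000,
        num_eval_steps_per_epoch=5000,
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
)
experiment(variant)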
Example #26
def experiment(variant):
    if variant.get("pretrained_algorithm_path", False):
        resume(variant)
        return

    if 'env' in variant:
        env_params = ENV_PARAMS[variant['env']]
        variant.update(env_params)

        if 'env_id' in env_params:
            if env_params['env_id'] in ['pen-v0', 'pen-sparse-v0', 'door-v0', 'relocate-v0', 'hammer-v0',
                                        'door-sparse-v0', 'relocate-sparse-v0', 'hammer-sparse-v0']:
                import mj_envs
            expl_env = gym.make(env_params['env_id'])
            eval_env = gym.make(env_params['env_id'])
        else:
            expl_env = NormalizedBoxEnv(variant['env_class']())
            eval_env = NormalizedBoxEnv(variant['env_class']())

        if variant.get('sparse_reward', False):
            expl_env = RewardWrapperEnv(expl_env, compute_hand_sparse_reward)
            eval_env = RewardWrapperEnv(eval_env, compute_hand_sparse_reward)

        if variant.get('add_env_demos', False):
            variant["path_loader_kwargs"]["demo_paths"].append(variant["env_demo_path"])

        if variant.get('add_env_offpolicy_data', False):
            variant["path_loader_kwargs"]["demo_paths"].append(variant["env_offpolicy_data_path"])
    else:
        expl_env = encoder_wrapped_env(variant)
        eval_env = encoder_wrapped_env(variant)

    path_loader_kwargs = variant.get("path_loader_kwargs", {})
    stack_obs = path_loader_kwargs.get("stack_obs", 1)
    if stack_obs > 1:
        expl_env = StackObservationEnv(expl_env, stack_obs=stack_obs)
        eval_env = StackObservationEnv(eval_env, stack_obs=stack_obs)

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    if hasattr(expl_env, 'info_sizes'):
        env_info_sizes = expl_env.info_sizes
    else:
        env_info_sizes = dict()

    M = variant['layer_size']
    vf_kwargs = variant.get("vf_kwargs", {})
    vf1 = ConcatMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
        **vf_kwargs
    )
    target_vf1 = ConcatMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
        **vf_kwargs
    )
    policy_class = variant.get("policy_class", TanhGaussianPolicy)
    policy_kwargs = variant['policy_kwargs']
    policy = policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **policy_kwargs,
    )
    target_policy = policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **policy_kwargs,
    )

    buffer_policy_class = variant.get("buffer_policy_class", policy_class)
    buffer_policy = buffer_policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant.get("buffer_policy_kwargs", policy_kwargs),
    )

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )

    expl_policy = policy
    exploration_kwargs = variant.get('exploration_kwargs', {})
    if exploration_kwargs:
        if exploration_kwargs.get("deterministic_exploration", False):
            expl_policy = MakeDeterministic(policy)

        exploration_strategy = exploration_kwargs.get("strategy", None)
        if exploration_strategy is None:
            pass
        elif exploration_strategy == 'ou':
            es = OUStrategy(
                action_space=expl_env.action_space,
                max_sigma=exploration_kwargs['noise'],
                min_sigma=exploration_kwargs['noise'],
            )
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=es,
                policy=expl_policy,
            )
        elif exploration_strategy == 'gauss_eps':
            es = GaussianAndEpislonStrategy(
                action_space=expl_env.action_space,
                max_sigma=exploration_kwargs['noise'],
                min_sigma=exploration_kwargs['noise'],  # constant sigma
                epsilon=0,
            )
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=es,
                policy=expl_policy,
            )
        else:
            raise ValueError(f"Unknown exploration strategy: {exploration_strategy}")

    if variant.get('replay_buffer_class', EnvReplayBuffer) == AWREnvReplayBuffer:
        main_replay_buffer_kwargs = variant['replay_buffer_kwargs']
        main_replay_buffer_kwargs['env'] = expl_env
        # note: qf1/qf2 are assumed to exist here; this function never constructs them
        main_replay_buffer_kwargs['qf1'] = qf1
        main_replay_buffer_kwargs['qf2'] = qf2
        main_replay_buffer_kwargs['policy'] = policy
    else:
        main_replay_buffer_kwargs=dict(
            max_replay_buffer_size=variant['replay_buffer_size'],
            env=expl_env,
        )
    replay_buffer_kwargs = dict(
        max_replay_buffer_size=variant['replay_buffer_size'],
        env=expl_env,
    )

    replay_buffer = variant.get('replay_buffer_class', EnvReplayBuffer)(
        **main_replay_buffer_kwargs,
    )
    if variant.get('use_validation_buffer', False):
        train_replay_buffer = replay_buffer
        validation_replay_buffer = variant.get('replay_buffer_class', EnvReplayBuffer)(
            **main_replay_buffer_kwargs,
        )
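        # SplitReplayBuffer routes ~90% of added data to the train buffer and
        # the rest to validation (0.9 is the train fraction)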
        replay_buffer = SplitReplayBuffer(train_replay_buffer, validation_replay_buffer, 0.9)

    trainer_class = variant.get("trainer_class", QuinoaTrainer)
    trainer = trainer_class(
        env=eval_env,
        policy=policy,
        vf1=vf1,
        target_policy=target_policy,
        target_vf1=target_vf1,
        buffer_policy=buffer_policy,
        **variant['trainer_kwargs']
    )
    if variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant['min_num_steps_before_training'],
        )
    else:
        expl_path_collector = MdpPathCollector(
            expl_env,
            expl_policy,
        )
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant['min_num_steps_before_training'],
        )
    algorithm.to(ptu.device)

    demo_train_buffer = EnvReplayBuffer(
        **replay_buffer_kwargs,
    )
    demo_test_buffer = EnvReplayBuffer(
        **replay_buffer_kwargs,
    )

    if variant.get("save_video", False):
        if variant.get("presampled_goals", None):
            variant['image_env_kwargs']['presampled_goals'] = load_local_or_remote_file(variant['presampled_goals']).item()
        image_eval_env = ImageEnv(GymToMultiEnv(eval_env), **variant["image_env_kwargs"])
        image_eval_path_collector = ObsDictPathCollector(
            image_eval_env,
            eval_policy,
            observation_key="state_observation",
        )
        image_expl_env = ImageEnv(GymToMultiEnv(expl_env), **variant["image_env_kwargs"])
        image_expl_path_collector = ObsDictPathCollector(
            image_expl_env,
            expl_policy,
            observation_key="state_observation",
        )
        video_func = VideoSaveFunction(
            image_eval_env,
            variant,
            image_expl_path_collector,
            image_eval_path_collector,
        )
        algorithm.post_train_funcs.append(video_func)
    if variant.get('save_paths', False):
        algorithm.post_train_funcs.append(save_paths)
    if variant.get('load_demos', False):
        path_loader_class = variant.get('path_loader_class', MDPPathLoader)
        path_loader = path_loader_class(
            trainer,
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
            **path_loader_kwargs
        )
        path_loader.load_demos()
    if variant.get('save_initial_buffers', False):
        buffers = dict(
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
        )
        buffer_path = osp.join(logger.get_snapshot_dir(), 'buffers.p')
        pickle.dump(buffers, open(buffer_path, "wb"))
    if variant.get('pretrain_policy', False):
        trainer.pretrain_policy_with_bc()
    if variant.get('pretrain_rl', False):
        trainer.pretrain_q_with_bc_data()
    if variant.get('save_pretrained_algorithm', False):
        p_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.p')
        pt_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.pt')
        data = algorithm._get_snapshot()
        data['algorithm'] = algorithm
        torch.save(data, pt_path)
        torch.save(data, p_path)
    if variant.get('train_rl', True):
        algorithm.train()
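
A minimal sketch of the variant dict this entry point expects, assuming rlkit-style defaults. Every key below is read somewhere in the function, but the values (and the variant_sketch name) are illustrative rather than the author's settings, and the env/demo keys consumed earlier in the function are omitted.

variant_sketch = dict(
    layer_size=256,                       # M, width of the value networks
    policy_kwargs=dict(hidden_sizes=[256, 256]),
    replay_buffer_size=int(1E6),
    trainer_kwargs=dict(),                # forwarded verbatim to the trainer
    collection_mode='batch',              # anything but 'online' selects TorchBatchRLAlgorithm
    max_path_length=1000,
    batch_size=256,
    num_epochs=100,
    num_eval_steps_per_epoch=1000,
    num_expl_steps_per_train_loop=1000,
    num_trains_per_train_loop=1000,
    min_num_steps_before_training=1000,
)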
Example #27
0
def offpolicy_main(variant):
    print("offpolicy main")

    if args.algo == 'sac':
        algo = "SAC"
    elif args.algo == 'td3':
        algo = "TD3"
    else:
        raise ValueError("unsupported algo: {}".format(args.algo))

    setup_logger('{0}_{1}'.format(args.env_name, args.save_name),
                 variant=variant)
    ptu.set_gpu_mode(True)  # optionally set the GPU (default=True)

    expl_env, eval_env, env_obj = prepare_env(args.env_name,
                                              args.visionmodel_path,
                                              **env_kwargs)
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    expl_policy, eval_policy, trainer = prepare_trainer(
        algo, expl_env, obs_dim, action_dim, args.pretrained_policy_load,
        variant)

    if args.env_name.find('doorenv') > -1:
        expl_policy.knob_noisy = eval_policy.knob_noisy = args.knob_noisy
        expl_policy.nn = eval_policy.nn = env_obj.nn
        expl_policy.visionnet_input = eval_policy.visionnet_input = env_obj.visionnet_input

    if args.visionnet_input:
        visionmodel = load_visionmodel(expl_env._wrapped_env.xml_path,
                                       args.visionmodel_path, VisionModelXYZ())
        visionmodel.to(ptu.device)
        expl_policy.visionmodel = visionmodel.eval()
    else:
        expl_policy.visionmodel = None

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        doorenv=args.env_name.find('doorenv') > -1,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
        doorenv=args.env_name.find('doorenv') > -1,
    )

    if not args.replaybuffer_load:
        replay_buffer = EnvReplayBuffer(
            variant['replay_buffer_size'],
            expl_env,
        )
    else:
        replay_buffer = pickle.load(open(args.replaybuffer_load, "rb"))
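        # rebuild the buffer's private env-info key list after unpickling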
        replay_buffer._env_info_keys = replay_buffer.env_info_sizes.keys()
        print("Loaded the replay buffer that has length of {}".format(
            replay_buffer.get_diagnostics()))

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])

    algorithm.save_interval = args.save_interval
    algorithm.save_dir = args.save_dir
    algorithm.algo = args.algo
    algorithm.env_name = args.env_name
    algorithm.save_name = args.save_name
    algorithm.env_kwargs = env_kwargs
    summary_name = args.log_dir + '{0}_{1}'
    writer = SummaryWriter(summary_name.format(args.env_name, args.save_name))
    algorithm.writer = writer

    algorithm.to(ptu.device)
    algorithm.train()
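
This entry point leans on module-level globals (args and env_kwargs) rather than function parameters. A hedged sketch of what those globals would need to contain, with illustrative values only (the real script presumably builds args via argparse):

import argparse

env_kwargs = dict()  # forwarded to prepare_env
args = argparse.Namespace(
    algo='sac',                   # or 'td3'
    env_name='doorenv-v0',        # hypothetical id; the code only checks find('doorenv')
    save_name='run0',
    visionmodel_path='...',       # path left unspecified here
    pretrained_policy_load=None,
    knob_noisy=False,
    visionnet_input=False,
    replaybuffer_load=None,       # pickled-buffer path, or falsy to start fresh
    save_interval=10,
    save_dir='./trained_models/',
    log_dir='./logs/',
)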
Example #28
0
def experiment(variant):
    if variant.get("pretrained_algorithm_path", False):
        resume(variant)
        return

    normalize_env = variant.get("normalize_env", True)
    env_id = variant.get("env_id", None)
    env_params = ENV_PARAMS.get(env_id, {})
    variant.update(env_params)
    env_class = variant.get("env_class", None)
    env_kwargs = variant.get("env_kwargs", {})

    expl_env = make(env_id, env_class, env_kwargs, normalize_env)
    eval_env = make(env_id, env_class, env_kwargs, normalize_env)

    if variant.get("add_env_demos", False):
        variant["path_loader_kwargs"]["demo_paths"].append(
            variant["env_demo_path"])
    if variant.get("add_env_offpolicy_data", False):
        variant["path_loader_kwargs"]["demo_paths"].append(
            variant["env_offpolicy_data_path"])

    path_loader_kwargs = variant.get("path_loader_kwargs", {})
    stack_obs = path_loader_kwargs.get("stack_obs", 1)
    if stack_obs > 1:
        expl_env = StackObservationEnv(expl_env, stack_obs=stack_obs)
        eval_env = StackObservationEnv(eval_env, stack_obs=stack_obs)

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    if hasattr(expl_env, "info_sizes"):
        env_info_sizes = expl_env.info_sizes
    else:
        env_info_sizes = dict()

    qf_kwargs = variant.get("qf_kwargs", {})
    qf1 = ConcatMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **qf_kwargs)
    qf2 = ConcatMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **qf_kwargs)
    target_qf1 = ConcatMlp(input_size=obs_dim + action_dim,
                           output_size=1,
                           **qf_kwargs)
    target_qf2 = ConcatMlp(input_size=obs_dim + action_dim,
                           output_size=1,
                           **qf_kwargs)

    policy_class = variant.get("policy_class", TanhGaussianPolicy)
    policy_kwargs = variant["policy_kwargs"]
    policy_path = variant.get("policy_path", False)
    if policy_path:
        policy = load_local_or_remote_file(policy_path)
    else:
        policy = policy_class(
            obs_dim=obs_dim,
            action_dim=action_dim,
            **policy_kwargs,
        )
    buffer_policy_path = variant.get("buffer_policy_path", False)
    if buffer_policy_path:
        buffer_policy = load_local_or_remote_file(buffer_policy_path)
    else:
        buffer_policy_class = variant.get("buffer_policy_class", policy_class)
        buffer_policy = buffer_policy_class(
            obs_dim=obs_dim,
            action_dim=action_dim,
            **variant.get("buffer_policy_kwargs", policy_kwargs),
        )

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )

    expl_policy = policy
    exploration_kwargs = variant.get("exploration_kwargs", {})
    if exploration_kwargs:
        if exploration_kwargs.get("deterministic_exploration", False):
            expl_policy = MakeDeterministic(policy)

        exploration_strategy = exploration_kwargs.get("strategy", None)
        if exploration_strategy is None:
            pass
        elif exploration_strategy == "ou":
            es = OUStrategy(
                action_space=expl_env.action_space,
                max_sigma=exploration_kwargs["noise"],
                min_sigma=exploration_kwargs["noise"],
            )
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=es,
                policy=expl_policy,
            )
        elif exploration_strategy == "gauss_eps":
            es = GaussianAndEpsilonStrategy(
                action_space=expl_env.action_space,
                max_sigma=exploration_kwargs["noise"],
                min_sigma=exploration_kwargs["noise"],  # constant sigma
                epsilon=0,
            )
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=es,
                policy=expl_policy,
            )
        else:
            raise ValueError(
                "unknown exploration strategy: {}".format(exploration_strategy))

    main_replay_buffer_kwargs = dict(
        max_replay_buffer_size=variant["replay_buffer_size"],
        env=expl_env,
    )
    replay_buffer_kwargs = dict(
        max_replay_buffer_size=variant["replay_buffer_size"],
        env=expl_env,
    )

    replay_buffer = variant.get("replay_buffer_class",
                                EnvReplayBuffer)(**main_replay_buffer_kwargs, )
    if variant.get("use_validation_buffer", False):
        train_replay_buffer = replay_buffer
        validation_replay_buffer = variant.get(
            "replay_buffer_class",
            EnvReplayBuffer)(**main_replay_buffer_kwargs, )
        replay_buffer = SplitReplayBuffer(train_replay_buffer,
                                          validation_replay_buffer, 0.9)

    trainer_class = variant.get("trainer_class", AWACTrainer)
    trainer = trainer_class(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        buffer_policy=buffer_policy,
        **variant["trainer_kwargs"],
    )
    if variant["collection_mode"] == "online":
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant["max_path_length"],
            batch_size=variant["batch_size"],
            num_epochs=variant["num_epochs"],
            num_eval_steps_per_epoch=variant["num_eval_steps_per_epoch"],
            num_expl_steps_per_train_loop=variant[
                "num_expl_steps_per_train_loop"],
            num_trains_per_train_loop=variant["num_trains_per_train_loop"],
            min_num_steps_before_training=variant[
                "min_num_steps_before_training"],
        )
    else:
        expl_path_collector = MdpPathCollector(
            expl_env,
            expl_policy,
        )
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant["max_path_length"],
            batch_size=variant["batch_size"],
            num_epochs=variant["num_epochs"],
            num_eval_steps_per_epoch=variant["num_eval_steps_per_epoch"],
            num_expl_steps_per_train_loop=variant[
                "num_expl_steps_per_train_loop"],
            num_trains_per_train_loop=variant["num_trains_per_train_loop"],
            min_num_steps_before_training=variant[
                "min_num_steps_before_training"],
        )
    algorithm.to(ptu.device)

    demo_train_buffer = EnvReplayBuffer(**replay_buffer_kwargs, )
    demo_test_buffer = EnvReplayBuffer(**replay_buffer_kwargs, )

    if variant.get("save_video", False):
        if variant.get("presampled_goals", None):
            variant["image_env_kwargs"][
                "presampled_goals"] = load_local_or_remote_file(
                    variant["presampled_goals"]).item()

        def get_img_env(env):
            renderer = EnvRenderer(**variant["renderer_kwargs"])
            return InsertImageEnv(GymToMultiEnv(env), renderer=renderer)

        image_eval_env = ImageEnv(GymToMultiEnv(eval_env),
                                  **variant["image_env_kwargs"])
        # image_eval_env = get_img_env(eval_env)
        image_eval_path_collector = ObsDictPathCollector(
            image_eval_env,
            eval_policy,
            observation_key="state_observation",
        )
        image_expl_env = ImageEnv(GymToMultiEnv(expl_env),
                                  **variant["image_env_kwargs"])
        # image_expl_env = get_img_env(expl_env)
        image_expl_path_collector = ObsDictPathCollector(
            image_expl_env,
            expl_policy,
            observation_key="state_observation",
        )
        video_func = VideoSaveFunction(
            image_eval_env,
            variant,
            image_expl_path_collector,
            image_eval_path_collector,
        )
        algorithm.post_train_funcs.append(video_func)
    if variant.get("save_paths", False):
        algorithm.post_train_funcs.append(save_paths)
    if variant.get("load_demos", False):
        path_loader_class = variant.get("path_loader_class", MDPPathLoader)
        path_loader = path_loader_class(
            trainer,
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
            **path_loader_kwargs,
        )
        path_loader.load_demos()
    if variant.get("load_env_dataset_demos", False):
        path_loader_class = variant.get("path_loader_class", HDF5PathLoader)
        path_loader = path_loader_class(
            trainer,
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
            **path_loader_kwargs,
        )
        path_loader.load_demos(expl_env.get_dataset())
    if variant.get("save_initial_buffers", False):
        buffers = dict(
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
        )
        buffer_path = osp.join(logger.get_snapshot_dir(), "buffers.p")
        pickle.dump(buffers, open(buffer_path, "wb"))
    if variant.get("pretrain_buffer_policy", False):
        trainer.pretrain_policy_with_bc(
            buffer_policy,
            replay_buffer.train_replay_buffer,
            replay_buffer.validation_replay_buffer,
            10000,
            label="buffer",
        )
    if variant.get("pretrain_policy", False):
        trainer.pretrain_policy_with_bc(
            policy,
            demo_train_buffer,
            demo_test_buffer,
            trainer.bc_num_pretrain_steps,
        )
    if variant.get("pretrain_rl", False):
        trainer.pretrain_q_with_bc_data()
    if variant.get("save_pretrained_algorithm", False):
        p_path = osp.join(logger.get_snapshot_dir(), "pretrain_algorithm.p")
        pt_path = osp.join(logger.get_snapshot_dir(), "pretrain_algorithm.pt")
        data = algorithm._get_snapshot()
        data["algorithm"] = algorithm
        torch.save(data, pt_path)
        torch.save(data, p_path)
    if variant.get("train_rl", True):
        algorithm.train()
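
One coupling worth noting: the pretrain_buffer_policy branch dereferences replay_buffer.train_replay_buffer and replay_buffer.validation_replay_buffer, which only exist once the buffer has been wrapped in SplitReplayBuffer, so a variant enabling it should presumably also set use_validation_buffer. A minimal fragment under that assumption:

variant_fragment = dict(
    use_validation_buffer=True,   # wraps the buffer in SplitReplayBuffer (90/10 split)
    pretrain_buffer_policy=True,  # BC-pretrains buffer_policy for 10000 steps
)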
Example #29
0
def experiment(variant):
    setup_logger("name-of-experiment", variant=variant)
    ptu.set_gpu_mode(True)

    expl_env = gym.make(variant["env_name"])
    eval_env = gym.make(variant["env_name"])
    obs_dim = expl_env.observation_space.image.shape[1]
    channels = expl_env.observation_space.image.shape[0]
    action_dim = SYMBOLIC_ACTION_COUNT
    symbolic_action_space = gym.spaces.Discrete(SYMBOLIC_ACTION_COUNT)
    symb_env = gym.make(variant["env_name"])
    symb_env.action_space = symbolic_action_space

    qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=action_dim,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    target_qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=action_dim,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    qf_criterion = nn.MSELoss()

    eval_policy = LearnPlanPolicy(None)
    expl_policy = LearnPlanPolicy(None)
    eval_path_collector = MdpPathCollector(eval_env,
                                           eval_policy,
                                           rollout=hierarchical_rollout,
                                           render=variant["render"])
    expl_path_collector = MdpPathCollector(expl_env,
                                           expl_policy,
                                           rollout=hierarchical_rollout,
                                           render=variant["render"])
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant["trainer_kwargs"])
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], symb_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
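
The variant this example consumes is small. A sketch with illustrative values, assuming standard rlkit DQNTrainer arguments; env_name is deliberately left unspecified, but whatever it names must expose an observation_space with an image member, as the shape lookups above assume:

variant_sketch = dict(
    env_name='...',    # env whose observation_space carries an image field
    render=False,
    trainer_kwargs=dict(discount=0.99, learning_rate=3e-4),
    replay_buffer_size=int(1E6),
    algorithm_kwargs=dict(
        batch_size=128,
        max_path_length=100,
        num_epochs=100,
        num_eval_steps_per_epoch=1000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
)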
Example #30
0
def run_experiment_func(variant):
    env_params = ENV_PARAMS[variant['env']]
    variant.update(env_params)

    expl_env = NormalizedBoxEnv(variant['env_class']())
    eval_env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )

    vf = ConcatMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        discount=variant['discount'],
        soft_target_tau=variant['soft_target_tau'],
        target_update_period=variant['target_update_period'],
        policy_lr=variant['policy_lr'],
        qf_lr=variant['qf_lr'],
        reward_scale=1,
        use_automatic_entropy_tuning=True,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        num_epochs=variant['num_epochs'],
        num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
        num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
        num_trains_per_train_loop=variant['num_trains_per_train_loop'],
        min_num_steps_before_training=variant['min_num_steps_before_training'],
    )
    return algorithm
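
Unlike the other entry points, this one returns the algorithm without training it, so the caller finishes the job. Following the pattern of the surrounding examples:

algorithm = run_experiment_func(variant)
algorithm.to(ptu.device)
algorithm.train()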