Example No. 1
def experiment(variant):
    cuda = True
    from gym.envs.mujoco import HalfCheetahEnv
    from mujoco_torch.core.bridge import MjCudaRender
    R = 84
    env = HalfCheetahEnv()
    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    if cuda:
        c.cuda()

    gt.stamp("start")
    for i in range(100):
        img = env.sim.render(R, R, device_id=1)

    gt.stamp("warmstart")
    for i in gt.timed_for(range(1000)):
        env.step(np.random.rand(6))
        gt.stamp('step')

        img = env.sim.render(R, R, device_id=1)
        gt.stamp('render')

        x = np_to_var(img)
        if cuda:
            x = x.cuda()
            torch.cuda.synchronize()
        gt.stamp('transfer')
        # cv2.imshow("img", img)
        # cv2.waitKey(1)
    gt.stamp("end")

    print(img)

    print(gt.report(include_itrs=False))
Example No. 2
    def __init__(self, noise_type='uniform', noise_scale=0.0, init_scale=0.0):
        self.noise_type = noise_type
        assert self.noise_type in ['normal', 'uniform']
        self.noise_scale = noise_scale
        self.init_scale = init_scale

        HalfCheetahEnv.__init__(self)
Example No. 3
def experiment(variant):
    '''
    1. Build the evaluation and exploration environments (eval, expl).
    2. Determine the input/output dimensions and build the qf and policy networks.
    3. Deep-copy them to create the target qf and target policy networks.
    4. Build a path collector for evaluation.
    5. For exploration, build the exploration strategy, path collector, and replay buffer.
    6. Build the DDPGTrainer (qf, policy).
    7. Build the algorithm (trainer, env, replay buffer, and the path collectors used for
       exploration and evaluation).
    8. Start training.
    :param variant: config parameters (a sketch of such a dict follows this example)
    :return:
    '''
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    # Create the target networks as deep copies
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    # Evaluation
    eval_path_collector = MdpPathCollector(eval_env, policy)
    # Exploration (exploration strategy, path collector, replay buffer)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)

    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    # Move the networks to the configured device
    algorithm.to(ptu.device)

    algorithm.train()
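
For reference, the following is a minimal sketch of the kind of variant dictionary these experiment(variant) functions expect. Only the top-level keys (qf_kwargs, policy_kwargs, replay_buffer_size, trainer_kwargs, algorithm_kwargs) are taken from the example above; the inner kwarg names and values are assumptions modelled on common rlkit launch scripts and should be checked against the installed rlkit version.

# Hypothetical launcher; inner kwarg names are illustrative assumptions.
variant = dict(
    qf_kwargs=dict(hidden_sizes=[400, 300]),
    policy_kwargs=dict(hidden_sizes=[400, 300]),
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(discount=0.99),      # assumed DDPGTrainer kwarg
    algorithm_kwargs=dict(                   # assumed TorchBatchRLAlgorithm kwargs
        num_epochs=100,
        batch_size=128,
        max_path_length=1000,
        num_eval_steps_per_epoch=1000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
)

if __name__ == "__main__":
    experiment(variant)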
Example No. 4
def experiment(variant):
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant["qf_kwargs"])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant["qf_kwargs"])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant["qf_kwargs"])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant["qf_kwargs"])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant["policy_kwargs"])
    target_policy = TanhMlpPolicy(input_size=obs_dim,
                                  output_size=action_dim,
                                  **variant["policy_kwargs"])
    es = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es, policy=policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    trainer = TD3Trainer(policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         target_policy=target_policy,
                         **variant["trainer_kwargs"])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
Example No. 5
def experiment(variant):
    from gym.envs.mujoco import HalfCheetahEnv
    from mujoco_torch.core.bridge import MjCudaRender
    renderer = MjCudaRender(32, 32)
    env = HalfCheetahEnv()

    renderer.get_cuda_tensor(env.sim)
Example No. 6
def experiment(variant):
    env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
Example No. 7
def simulate_policy(args):
    data = torch.load(str(args.file))
    #data = joblib.load(str(args.file))
    policy = data['evaluation/policy']
    env = NormalizedBoxEnv(HalfCheetahEnv())
    #env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    if args.collect:
        data = []
    for trial in tqdm(range(100)):
        path = rollout(
            env,
            policy,
            max_path_length=args.H + 1,
            render=not args.collect,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
        if args.collect:
            data.append([path['actions'], path['next_observations']])

    if args.collect:
        import pickle
        with open("data/expert.pkl", mode='wb') as f:
            pickle.dump(data, f)
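
The args object above is read only for its file, H, gpu, and collect attributes. A hypothetical argparse wrapper with those attribute names might look like the sketch below; the defaults are illustrative, not taken from the original script.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str,
                        help='path to the snapshot file loaded with torch.load')
    parser.add_argument('--H', type=int, default=1000,
                        help='max path length per rollout')
    parser.add_argument('--gpu', action='store_true',
                        help='run the policy on the GPU')
    parser.add_argument('--collect', action='store_true',
                        help='save (actions, next_observations) pairs to data/expert.pkl')
    simulate_policy(parser.parse_args())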
Example No. 8
def experiment(variant):
    env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env,
                     qf=qf,
                     policy=policy,
                     exploration_policy=exploration_policy,
                     **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
Example No. 9
def example(variant):
    import mujoco_py
    import torch
    logger.log(torch.__version__)
    date_format = '%m/%d/%Y %H:%M:%S %Z'
    date = datetime.now(tz=pytz.utc)
    logger.log("start")
    logger.log('Current date & time is: {}'.format(date.strftime(date_format)))
    if torch.cuda.is_available():
        x = torch.randn(3)
        logger.log(str(x.to(ptu.device)))

    date = date.astimezone(timezone('US/Pacific'))
    logger.log('Local date & time is: {}'.format(date.strftime(date_format)))
    for i in range(variant['num_seconds']):
        logger.log("Tick, {}".format(i))
        time.sleep(1)
    logger.log("end")
    logger.log('Local date & time is: {}'.format(date.strftime(date_format)))

    logger.log("start mujoco")
    from gym.envs.mujoco import HalfCheetahEnv
    e = HalfCheetahEnv()
    img = e.sim.render(32, 32)
    logger.log(str(sum(img)))
    logger.log("end mujocoy")
Example No. 10
File: ddpg.py Project: jesbu1/rlkit
def experiment(variant):
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example No. 11
def experiment(variant):

    root = 0

    E = 20
    R = 84
    U = 6
    cuda = True

    envs = []

    for e in range(E):
        env = HalfCheetahEnv()
        envs.append(env)

    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    if cuda:
        c.cuda()

    # viewer = mujoco_py.MjRenderContextOffscreen(env.sim, device_id=1)
    # env.sim.add_render_context(viewer)

    def step(i, stamp=True):
        imgs = []
        if i % 100 == 0:
            for e in envs:
                e.reset()
        for e in envs:
            img = e.sim.render(R, R, device_id=0).transpose()
            imgs.append(img)
        gt.stamp('render') if stamp else 0

        imgs = np.array(imgs)

        torch_img = np_to_var(imgs)
        if cuda:
            torch_img = torch_img.cuda()
            torch.cuda.synchronize()
        gt.stamp('transfer') if stamp else 0

        u = get_numpy(c.forward(torch_img).cpu())
        torch.cuda.synchronize()
        gt.stamp('forward') if stamp else 0

        for i, e in enumerate(envs):
            e.step(u[i, :])
        gt.stamp('step') if stamp else 0

    for i in range(10):
        step(i, False)

    gt.stamp('start')
    for i in gt.timed_for(range(100)):
        step(i)
    gt.stamp('end')

    print(gt.report(include_itrs=False, format_options=dict(itr_num_width=10)))
Example No. 12
def experiment(variant):
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant["layer_size"]
    qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     hidden_sizes=[M, M])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     hidden_sizes=[M, M])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            hidden_sizes=[M, M])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            hidden_sizes=[M, M])
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                action_dim=action_dim,
                                hidden_sizes=[M, M])
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, policy)
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant["trainer_kwargs"])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
Example No. 13
def test_trpo_pipeline():
    with LocalRunner() as runner:
        env = GarageEnv(HalfCheetahEnv())

        baseline = LinearFeatureBaseline()
        policy = GaussianMLPPolicy(env_spec=env.spec)

        algo = TRPO(policy=policy, baseline=baseline)

        runner.setup(algo=algo, env=env)
        runner.train(n_epochs=100, batch_size=512)
Example No. 14
def run_exp(snapshot_config, *_):
    with LocalRunner(snapshot_config) as runner:
        env = GarageEnv(HalfCheetahEnv())

        baseline = LinearFeatureBaseline()
        policy = GaussianMLPPolicy(env_spec=env.spec)

        algo = TRPO(policy=policy, baseline=baseline)

        runner.setup(algo=algo, env=env)
        runner.train(n_epochs=100, batch_size=512)
Example No. 15
def experiment(variant):
    E = 10
    R = 84
    cuda = True

    envs = []

    renderer = MjCudaRender(R, R)


    for e in range(E):
        env = HalfCheetahEnv()
        envs.append(env)
    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    if cuda:
        c.cuda()

    # viewer = mujoco_py.MjRenderContextOffscreen(env.sim, device_id=1)
    # env.sim.add_render_context(viewer)

    def step(stamp=True):
        imgs = []
        if i % 100 == 0:
            for e in range(E):
                envs[e].reset()
        for e in range(E):
            # img = renderer.get_cuda_tensor(envs[e].sim)
            img = envs[e].sim.render(R, R, device_id=1).transpose()
        gt.stamp('render') if stamp else 0

        # imgs =np.array(imgs)
        # torch_img = np_to_var(imgs)
        # if cuda:
        #     torch_img = torch_img.cuda()
        #     torch.cuda.synchronize()
        # gt.stamp('transfer') if stamp else 0

        # u = get_numpy(c.forward(torch_img).cpu())
        # torch.cuda.synchronize()
        # gt.stamp('forward') if stamp else 0

        # for e in range(E):
        #     envs[e].step(u[e, :])
        # gt.stamp('step') if stamp else 0

    for i in range(10):
        step(False)

    gt.stamp('start')
    for i in gt.timed_for(range(100)):
        step()
    gt.stamp('end')
Example No. 16
def experiment(variant):
    from gym.envs.mujoco import HalfCheetahEnv
    from mujoco_torch.core.bridge import MjCudaRender
    renderer = MjCudaRender(84, 84)
    env = HalfCheetahEnv()

    gt.stamp("start")
    for i in range(100):
        tensor, img = renderer.get_cuda_tensor(env.sim, False)

    gt.stamp("warmstart")
    for i in range(1000):
        env.step(np.random.rand(6))
        tensor, img = renderer.get_cuda_tensor(env.sim, True)

        x = np_to_var(img).cuda()
        torch.cuda.synchronize()
        # cv2.imshow("img", img)
        # cv2.waitKey(1)
    gt.stamp("end")

    print(img)

    print(gt.report())
Example No. 17
def experiment(variant):
    from gym.envs.mujoco import HalfCheetahEnv
    from mujoco_torch.core.bridge import MjCudaRender
    renderer = MjCudaRender(84, 84)
    env = HalfCheetahEnv()

    gt.stamp("start")
    for i in range(100):
        tensor, img = renderer.get_cuda_tensor(env.sim, False)

    gt.stamp("warmstart")
    for i in gt.timed_for(range(1000)):
        env.step(np.random.rand(6))
        gt.stamp('step')

        tensor, img = renderer.get_cuda_tensor(env.sim, False)
        gt.stamp('render')
        # cv2.imshow("img", img)
        # cv2.waitKey(1)
    gt.stamp("end")

    print(img)

    print(gt.report(include_itrs=False))
Example No. 18
def run_exp(*_):
    with LocalRunner() as runner:
        env = GarageEnv(HalfCheetahEnv())
        # q-functions
        qf1 = ContinuousMLPQFunction(env_spec=env.spec)
        qf2 = ContinuousMLPQFunction(env_spec=env.spec)
        # replay buffer
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
        # policy
        policy = GaussianMLPPolicy(env_spec=env.spec)
        # algorithm
        algo = SAC(
            env_spec=env.spec,
            policy=policy,
            qfs=[qf1, qf2],
            replay_buffer=replay_buffer,
        )
        # setup and train
        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=1000)
Example No. 19
def simulate_policy(args):
    data = torch.load(str(args.file))
    #data = joblib.load(str(args.file))
    policy = data['evaluation/policy']
    env = NormalizedBoxEnv(HalfCheetahEnv())
    #env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            render=True,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
Example No. 20
def example(variant):
    import torch

    import rlkit.torch.pytorch_util as ptu

    print("Starting")
    logger.log(torch.__version__)
    date_format = "%m/%d/%Y %H:%M:%S %Z"
    date = datetime.now(tz=pytz.utc)
    logger.log("start")
    logger.log("Current date & time is: {}".format(date.strftime(date_format)))
    logger.log("Cuda available: {}".format(torch.cuda.is_available()))
    if torch.cuda.is_available():
        x = torch.randn(3)
        logger.log(str(x.to(ptu.device)))

    date = date.astimezone(timezone("US/Pacific"))
    logger.log("Local date & time is: {}".format(date.strftime(date_format)))
    for i in range(variant["num_seconds"]):
        logger.log("Tick, {}".format(i))
        time.sleep(1)
    logger.log("end")
    logger.log("Local date & time is: {}".format(date.strftime(date_format)))

    logger.log("start mujoco")
    from gym.envs.mujoco import HalfCheetahEnv

    e = HalfCheetahEnv()
    img = e.sim.render(32, 32)
    logger.log(str(sum(img)))
    logger.log("end mujoco")

    logger.record_tabular("Epoch", 1)
    logger.dump_tabular()
    logger.record_tabular("Epoch", 2)
    logger.dump_tabular()
    logger.record_tabular("Epoch", 3)
    logger.dump_tabular()
    print("Done")
Example No. 21
def experiment(variant):
    env = NormalizedBoxEnv(HalfCheetahEnv())
    es = GaussianStrategy(
        action_space=env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = PERTD3(env,
                       qf1=qf1,
                       qf2=qf2,
                       policy=policy,
                       exploration_policy=exploration_policy,
                       **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example No. 22
    def __init__(self, **kwargs):
        HalfCheetahEnv.__init__(self)
        offline_env.OfflineEnv.__init__(self, **kwargs)
Example No. 23
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        h = x.view(-1, 128)  # flatten
        return self.output_activation(self.fc1(h))


if __name__ == "__main__":
    E = 10
    R = 84
    cuda = True

    envs = []

    for e in range(E):
        env = HalfCheetahEnv()
        envs.append(env)
    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    if cuda:
        c.cuda()

    # viewer = mujoco_py.MjRenderContextOffscreen(env.sim, device_id=1)
    # env.sim.add_render_context(viewer)

    def step(stamp=True):
        imgs = []
        if i % 100 == 0:
            for e in range(E):
                envs[e].reset()
        for e in range(E):
            img = envs[e].sim.render(R, R, device_id=1).transpose()
Example No. 24
def experiment(variant):
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    skills_dim = variant['skills_dim']

    # Define the networks

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + skills_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + skills_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + skills_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + skills_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim + skills_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )

    higher_level_policy = categorical_mlp.CategoricalMLPPolicy(
        input_size=obs_dim,
        output_size=skills_dim,
        hidden_sizes=(M, M),
    )

    value_function = FlattenMlp(
        hidden_sizes=[M, M],
        input_size=obs_dim,
        output_size=1,
    )

    discriminator_function = FlattenMlp(
        hidden_sizes=[M, M],
        input_size=obs_dim,
        output_size=skills_dim
    )

    target_vf = FlattenMlp(
        hidden_sizes=[M, M],
        input_size=obs_dim,
        output_size=1,
    )

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        higher_level_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
        higher_level_policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )

    trainer = emp_skills_trainer.EmpowermentSkillsTrainer(
        env=eval_env,
        higher_level_policy=higher_level_policy,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_vf=target_vf,
        value_function=value_function,
        discriminator=discriminator_function,
        **variant['trainer_kwargs']
    )

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example No. 25
def experiment(variant):
    # expl_env = NormalizedBoxEnv(gym.make('activesearchrl-v0'))
    # eval_env = NormalizedBoxEnv(gym.make('activesearchrl-v0'))
    # obs_dim = expl_env.observation_space.low.size
    # action_dim = eval_env.action_space.low.size

    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    # policy = TanhGaussianPolicy(
    #     obs_dim=obs_dim,
    #     action_dim=action_dim,
    #     hidden_sizes=[M, M],
    # )
    policy = GaussianMixturePolicy(obs_dim=obs_dim,
                                   action_dim=action_dim,
                                   hidden_sizes=[M, M],
                                   num_gaussians=2)

    # data = torch.load('/Users/conor/Documents/PHD_RESEARCH/ACTIVE_SEARCH_AS_RL/rlkit/data/tabular-active-search-k1/tabular_active_search_k1_2020_11_10_16_18_25_0000--s-0/params.pkl')
    # qf1 = data['trainer/qf1']
    # qf2 = data['trainer/qf2']
    # target_qf1 = data['trainer/target_qf1']
    # target_qf2 = data['trainer/target_qf2']
    # policy = data['trainer/policy']

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example No. 26
    def __init__(self):
        self._base_pos = 15.
        HalfCheetahEnv.__init__(self)
Example No. 27
def experiment(variant):

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    n_proc = comm.Get_size()
    root = 0
    gpus = GPUtil.getGPUs()
    n_gpu = len(gpus)
    torch.distributed.init_process_group(backend='mpi', world_size=n_proc)

    E = 20
    R = 84
    U = 6
    cuda = True

    envs = []

    for e in range(rank, E, n_proc):
        env = HalfCheetahEnv()
        envs.append(env)

    sendcounts = np.array(comm.gather(len(envs), root))

    i_sendcounts = None
    u_sendcounts = None

    if rank == root:
        i_sendcounts = sendcounts * 3 * R * R
        u_sendcounts = sendcounts * U
    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    c = torch.nn.parallel.DistributedDataParallel(c)
    if cuda:
        c.cuda()

    # viewer = mujoco_py.MjRenderContextOffscreen(env.sim, device_id=1)
    # env.sim.add_render_context(viewer)

    def step(i, stamp=True):
        imgs = []
        if i % 100 == 0:
            for e in envs:
                e.reset()
        for e in envs:
            img = e.sim.render(R, R, device_id=rank % n_gpu).transpose()
            imgs.append(img)
        comm.Barrier()
        if rank == 0:
            gt.stamp('render') if stamp else 0

        imgs = np.array(imgs)
        r_imgs = None
        if rank == 0:
            r_imgs = np.empty((E, 3, R, R), dtype='uint8')

        comm.Gatherv(sendbuf=imgs, recvbuf=(r_imgs, i_sendcounts), root=root)
        if rank == 0:
            gt.stamp('comm1') if stamp else 0

        u = None
        if rank == 0:
            torch_img = np_to_var(r_imgs)
            if cuda:
                torch_img = torch_img.cuda()
                torch.cuda.synchronize()
            gt.stamp('transfer') if stamp else 0

            u = get_numpy(c.forward(torch_img).cpu())
            torch.cuda.synchronize()
            gt.stamp('forward') if stamp else 0

        r_u = np.empty((len(envs), U), dtype='float32')
        comm.Scatterv(sendbuf=(u, u_sendcounts), recvbuf=r_u, root=root)
        if rank == 0:
            gt.stamp('comm2') if stamp else 0
        for i, e in enumerate(envs):
            e.step(r_u[i, :])
        comm.Barrier()
        if rank == 0:
            gt.stamp('step') if stamp else 0

    for i in range(10):
        step(i, False)

    if rank == 0:
        gt.stamp('start')
    for i in gt.timed_for(range(100)):
        step(i)
    if rank == 0:
        gt.stamp('end')

        print(
            gt.report(include_itrs=False,
                      format_options=dict(itr_num_width=10)))
Example No. 28
def experiment(variant):
    # Or for a specific version (Daniel: doesn't work):
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))

    if 'Ant' in args.env:
        expl_env = NormalizedBoxEnv(AntEnv())
        eval_env = NormalizedBoxEnv(AntEnv())
    elif 'InvertedPendulum' in args.env:
        expl_env = NormalizedBoxEnv(InvertedPendulumEnv())
        eval_env = NormalizedBoxEnv(InvertedPendulumEnv())
    elif 'HalfCheetah' in args.env:
        expl_env = NormalizedBoxEnv(HalfCheetahEnv())
        eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    elif 'Hopper' in args.env:
        expl_env = NormalizedBoxEnv(HopperEnv())
        eval_env = NormalizedBoxEnv(HopperEnv())
    elif 'Reacher' in args.env:
        expl_env = NormalizedBoxEnv(ReacherEnv())
        eval_env = NormalizedBoxEnv(ReacherEnv())
    elif 'Swimmer' in args.env:
        expl_env = NormalizedBoxEnv(SwimmerEnv())
        eval_env = NormalizedBoxEnv(SwimmerEnv())
    elif 'Walker2d' in args.env:
        expl_env = NormalizedBoxEnv(Walker2dEnv())
        eval_env = NormalizedBoxEnv(Walker2dEnv())
    else:
        raise ValueError(args.env)

    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example No. 29
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.
    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters
    """

    th = 1.8
    g_max = 0.1
    #delta = 1e-7
    if args.env == 'CartPole':
        #CartPole

        env = TfEnv(normalize(CartPoleEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 5000
        max_length = 100
        n_timestep = 5e5
        n_counts = 5
        name = 'CartPole'
        grad_factor = 5
        th = 1.2
        #batchsize: 1
        # lr = 0.1
        # w = 2
        # c = 50

        #batchsize: 50
        lr = 0.75
        c = 3
        w = 2

        discount = 0.995
        path = './init/CartPole_policy.pth'

    if args.env == 'Walker':
        #Walker_2d
        env = TfEnv(normalize(Walker2dEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 50000
        max_length = 500

        n_timestep = 1e7
        n_counts = 5
        lr = 0.75
        w = 2
        c = 12
        grad_factor = 6

        discount = 0.999

        name = 'Walk'
        path = './init/Walk_policy.pth'

    if args.env == 'HalfCheetah':
        env = TfEnv(normalize(HalfCheetahEnv()))
        runner = LocalRunner(snapshot_config)

        batch_size = 50000
        max_length = 500

        n_timestep = 1e7
        n_counts = 5
        lr = 0.6
        w = 1
        c = 4
        grad_factor = 5
        th = 1.2
        g_max = 0.06

        discount = 0.999

        name = 'HalfCheetah'
        path = './init/HalfCheetah_policy.pth'

    if args.env == 'Hopper':
        #Hopper
        env = TfEnv(normalize(HopperEnv()))
        runner = LocalRunner(snapshot_config)

        batch_size = 50000
        max_length = 1000
        th = 1.5
        n_timestep = 1e7
        n_counts = 5
        lr = 0.75
        w = 1
        c = 3
        grad_factor = 6
        g_max = 0.15
        discount = 0.999

        name = 'Hopper'
        path = './init/Hopper_policy.pth'

    for i in range(n_counts):
        # print(env.spec)
        if args.env == 'CartPole':
            policy = CategoricalMLPPolicy(env.spec,
                                          hidden_sizes=[8, 8],
                                          hidden_nonlinearity=torch.tanh,
                                          output_nonlinearity=None)
        else:
            policy = GaussianMLPPolicy(env.spec,
                                       hidden_sizes=[64, 64],
                                       hidden_nonlinearity=torch.tanh,
                                       output_nonlinearity=None)


        policy.load_state_dict(torch.load(path))
        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = MBPG_HA(env_spec=env.spec,
                       env=env,
                       env_name=name,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=max_length,
                       discount=discount,
                       grad_factor=grad_factor,
                       policy_lr=lr,
                       c=c,
                       w=w,
                       th=th,
                       g_max=g_max,
                       n_timestep=n_timestep,
                       batch_size=batch_size,
                       center_adv=True,
                       # delta=delta,
                       # decay_learning_rate=d_lr,
                       )

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=batch_size)
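
run_task reads a module-level args.env. A hedged sketch of how that global might be set up follows; only the --env values are implied by the branches above, everything else is illustrative.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env', type=str, default='HalfCheetah',
                    choices=['CartPole', 'Walker', 'HalfCheetah', 'Hopper'])
args = parser.parse_args()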
Example No. 30
def experiment(variant):
    cuda = True
    ingpu = False
    R = 84
    E = 100
    N = 100

    if ingpu:
        from mujoco_torch.core.bridge import MjCudaRender
        renderer = MjCudaRender(84, 84, E)

    envs = []
    for e in range(E):
        env = HalfCheetahEnv()
        envs.append(env)

    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    if cuda:
        c.cuda()

    def step(stamp=True):
        for e in range(E):
            env = envs[e]
            env.step(np.random.rand(6))
        gt.stamp('step') if stamp else 0

        if ingpu:
            sims = [env.sim for env in envs]
            tensor, img = renderer.get_batch_cuda_tensor(sims, False)
            tensor = Variable(tensor).float()
            gt.stamp('render') if stamp else 0

        else:
            imgs = []
            for e in range(E):
                env = envs[e]
                img = env.sim.render(R, R, device_id=1)
                imgs.append(img)
            gt.stamp('render') if stamp else 0

            imgs = np.array(imgs)
            tensor = np_to_var(imgs)
            if cuda:
                tensor = tensor.cuda()
                torch.cuda.synchronize()
            gt.stamp('transfer') if stamp else 0

        u = get_numpy(c.forward(tensor).cpu())
        torch.cuda.synchronize()
        gt.stamp('forward') if stamp else 0

        # cv2.imshow("img", img)
        # cv2.waitKey(1)

    gt.stamp("start")
    for i in range(10):
        step(False)

    gt.stamp("warmstart")
    for i in gt.timed_for(range(N)):
        step()

    gt.stamp("end")

    print(gt.report(include_itrs=False))