Example #1
def experiment(variant):
    env_class = variant['env_class']
    env = env_class(**variant['env_params'])
    env = NormalizedBoxEnv(
        env,
        **variant['normalize_params']
    )
    observation_space = convert_gym_space(env.observation_space)
    action_space = convert_gym_space(env.action_space)
    qf = variant['qf_class'](
        int(observation_space.flat_dim),
        int(action_space.flat_dim),
        env.goal_dim,
        **variant['qf_params']
    )
    policy = FFUniversalPolicy(
        int(observation_space.flat_dim),
        int(action_space.flat_dim),
        env.goal_dim,
        **variant['policy_params']
    )
    epoch_discount_schedule = None
    epoch_discount_schedule_class = variant['epoch_discount_schedule_class']
    if epoch_discount_schedule_class is not None:
        epoch_discount_schedule = epoch_discount_schedule_class(
            **variant['epoch_discount_schedule_params']
        )
    qf_criterion = variant['qf_criterion_class'](
        **variant['qf_criterion_params']
    )
    es = variant['sampler_es_class'](
        action_space=action_space,
        **variant['sampler_es_params']
    )
    # Explore either with the learned policy itself or with a sample-based
    # policy derived from the Q-function.
    if variant['explore_with_ddpg_policy']:
        raw_exploration_policy = policy
    else:
        raw_exploration_policy = TerminalRewardSampleOCPolicy(
            qf,
            env,
            5,
        )
    exploration_policy = UniversalPolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=raw_exploration_policy,
    )
    algo = variant['algo_class'](
        env,
        qf,
        policy,
        exploration_policy,
        epoch_discount_schedule=epoch_discount_schedule,
        qf_criterion=qf_criterion,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algo.cuda()
    algo.train()
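To make the wiring above concrete, here is a minimal sketch of the variant dictionary that experiment() expects. The key names are read directly from the function body; the class choices and parameter values are illustrative assumptions (OUStrategy and the empty parameter dicts are placeholders, not library defaults), reusing classes that appear in the other examples on this page where possible.

variant = dict(
    env_class=MultiTaskBaxterEnv,        # placeholder: any goal-conditioned env class
    env_params=dict(),                   # constructor kwargs for the env (assumed empty)
    normalize_params=dict(),
    qf_class=FlatUniversalQfunction,
    qf_params=dict(),
    policy_params=dict(),
    epoch_discount_schedule_class=None,  # optional; pass a schedule class to enable it
    epoch_discount_schedule_params=dict(),
    qf_criterion_class=HuberLoss,
    qf_criterion_params=dict(),
    sampler_es_class=OUStrategy,         # assumed exploration strategy class
    sampler_es_params=dict(),
    explore_with_ddpg_policy=True,
    algo_class=HorizonFedStateDistanceQLearning,
    algo_params=dict(),                  # e.g. num_epochs, batch_size (values omitted)
)
experiment(variant)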
Example #2
def example(variant):
    env_class = variant['env_class']
    env_params = variant['env_params']
    env = env_class(**env_params)
    obs_space = convert_gym_space(env.observation_space)
    action_space = convert_gym_space(env.action_space)
    es_class = variant['es_class']
    es_params = dict(
        action_space=action_space,
    )
    policy_class = variant['policy_class']
    use_gpu = variant['use_gpu']

    if variant['normalize_env']:
        env = normalize(env)

    es = es_class(**es_params)
    qf = FeedForwardQFunction(
        int(obs_space.flat_dim),
        int(action_space.flat_dim),
        100,
        100,
    )
    policy_params = dict(
        obs_dim=int(obs_space.flat_dim),
        action_dim=int(action_space.flat_dim),
        fc1_size=100,
        fc2_size=100,
    )
    policy = policy_class(**policy_params)
    remote_env = RemoteRolloutEnv(
        env_class,
        env_params,
        policy_class,
        policy_params,
        es_class,
        es_params,
        variant['max_path_length'],
        variant['normalize_env'],
    )

    algorithm = ParallelDDPG(
        remote_env,
        exploration_strategy=es,
        qf=qf,
        policy=policy,
        **variant['algo_params'],
    )
    if use_gpu:
        algorithm.cuda()
    algorithm.train()
Example #3
def example(variant):
    env_class = variant['env_class']
    env_params = variant['env_params']
    env = env_class(**env_params)
    obs_space = convert_gym_space(env.observation_space)
    action_space = convert_gym_space(env.action_space)
    es_class = variant['es_class']
    es_params = dict(action_space=action_space, **variant['es_params'])
    use_gpu = variant['use_gpu']
    es = es_class(**es_params)
    policy_class = variant['policy_class']
    policy_params = dict(
        obs_dim=int(obs_space.flat_dim),
        action_dim=int(action_space.flat_dim),
        fc1_size=100,
        fc2_size=100,
    )
    policy = policy_class(**policy_params)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    remote_env = RemoteRolloutEnv(
        env,
        policy,
        exploration_policy,
        variant['max_path_length'],
        variant['normalize_env'],
    )
    qf = FeedForwardQFunction(
        int(remote_env.observation_space.flat_dim),
        int(remote_env.action_space.flat_dim),
        100,
        100,
    )
    algorithm = ParallelDDPG(
        remote_env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params'],
    )
    if use_gpu and ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
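The two ParallelDDPG examples above read a flatter variant dictionary. A sketch under the same caveats (FeedForwardPolicy, OUStrategy, and all numeric values are assumptions for illustration, not library defaults):

variant = dict(
    env_class=MultiTaskBaxterEnv,    # placeholder env class
    env_params=dict(),
    es_class=OUStrategy,             # assumed exploration strategy class
    es_params=dict(),
    policy_class=FeedForwardPolicy,  # assumed: accepts obs_dim, action_dim, fc1_size, fc2_size
    normalize_env=True,
    use_gpu=True,
    max_path_length=100,             # assumed rollout length
    algo_params=dict(),              # e.g. num_epochs, batch_size (values omitted)
)
example(variant)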
Example #4
def experiment(variant):
    env_params = variant['env_params']
    env = MultiTaskBaxterEnv(**env_params)

    observation_space = convert_gym_space(env.observation_space)
    action_space = convert_gym_space(env.action_space)
    qf = FlatUniversalQfunction(
        int(observation_space.flat_dim),
        int(action_space.flat_dim),
        env.goal_dim,
        **variant['qf_params'],
    )

    policy = FFUniversalPolicy(int(observation_space.flat_dim),
                               int(action_space.flat_dim), env.goal_dim,
                               **variant['policy_params'])

    es = variant['sampler_es_class'](action_space=action_space,
                                     **variant['sampler_es_params'])

    exploration_policy = UniversalPolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    epoch_discount_schedule = variant['epoch_discount_schedule_class'](
        **variant['epoch_discount_schedule_params'])

    algo = HorizonFedStateDistanceQLearning(
        env,
        qf,
        policy,
        exploration_policy,
        qf_criterion=HuberLoss(),
        epoch_discount_schedule=epoch_discount_schedule,
        **variant['algo_params'])
    if ptu.gpu_enabled():
        algo.cuda()
    algo.train()