# NOTE: imports assume the railrl codebase; the remaining project-specific
# classes used below (FFUniversalPolicy, TerminalRewardSampleOCPolicy,
# UniversalPolicyWrappedWithExplorationStrategy, ...) are assumed importable
# from the same codebase, though their exact module paths may differ.
import railrl.torch.pytorch_util as ptu
from railrl.envs.wrappers import NormalizedBoxEnv, convert_gym_space


def experiment(variant):
    # Build and normalize the environment.
    env_class = variant['env_class']
    env = env_class(**variant['env_params'])
    env = NormalizedBoxEnv(env, **variant['normalize_params'])
    observation_space = convert_gym_space(env.observation_space)
    action_space = convert_gym_space(env.action_space)

    # Goal-conditioned ("universal") Q-function and policy: both take the
    # goal dimension in addition to the observation/action dimensions.
    qf = variant['qf_class'](
        int(observation_space.flat_dim),
        int(action_space.flat_dim),
        env.goal_dim,
        **variant['qf_params']
    )
    policy = FFUniversalPolicy(
        int(observation_space.flat_dim),
        int(action_space.flat_dim),
        env.goal_dim,
        **variant['policy_params']
    )

    # Optionally anneal the discount factor over epochs.
    epoch_discount_schedule = None
    epoch_discount_schedule_class = variant['epoch_discount_schedule_class']
    if epoch_discount_schedule_class is not None:
        epoch_discount_schedule = epoch_discount_schedule_class(
            **variant['epoch_discount_schedule_params']
        )
    qf_criterion = variant['qf_criterion_class'](
        **variant['qf_criterion_params']
    )

    # Exploration: either perturb the learned policy, or sample actions with
    # a terminal-reward optimal-control policy built on top of the Q-function.
    es = variant['sampler_es_class'](
        action_space=action_space,
        **variant['sampler_es_params']
    )
    if variant['explore_with_ddpg_policy']:
        raw_exploration_policy = policy
    else:
        raw_exploration_policy = TerminalRewardSampleOCPolicy(qf, env, 5)
    exploration_policy = UniversalPolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=raw_exploration_policy,
    )

    algo = variant['algo_class'](
        env,
        qf,
        policy,
        exploration_policy,
        epoch_discount_schedule=epoch_discount_schedule,
        qf_criterion=qf_criterion,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algo.cuda()
    algo.train()
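# The discount schedule above only needs to expose some per-epoch lookup;
# a `get_value(epoch)` method is assumed here from how railrl-style
# schedules are typically used, so this is a sketch, not the project's class.
class RampUpDiscountSchedule:
    """Linearly ramp the discount from `start` to `end` over `duration`
    epochs, then hold it at `end`."""

    def __init__(self, start, end, duration):
        self.start = start
        self.end = end
        self.duration = duration

    def get_value(self, epoch):
        frac = min(float(epoch) / self.duration, 1.0)
        return self.start + frac * (self.end - self.start)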
def example(variant):
    env_class = variant['env_class']
    env_params = variant['env_params']
    env = env_class(**env_params)
    # Normalize before reading off the spaces, so the dimensions below match
    # the environment the policy will actually act in.
    if variant['normalize_env']:
        env = normalize(env)
    obs_space = convert_gym_space(env.observation_space)
    action_space = convert_gym_space(env.action_space)

    es_class = variant['es_class']
    es_params = dict(
        action_space=action_space,
    )
    es = es_class(**es_params)

    policy_class = variant['policy_class']
    use_gpu = variant['use_gpu']
    qf = FeedForwardQFunction(
        int(obs_space.flat_dim),
        int(action_space.flat_dim),
        100,
        100,
    )
    policy_params = dict(
        obs_dim=int(obs_space.flat_dim),
        action_dim=int(action_space.flat_dim),
        fc1_size=100,
        fc2_size=100,
    )
    policy = policy_class(**policy_params)

    # The remote env is handed classes and constructor kwargs (not live
    # objects) so that rollout workers can rebuild the env/policy themselves.
    remote_env = RemoteRolloutEnv(
        env_class,
        env_params,
        policy_class,
        policy_params,
        es_class,
        es_params,
        variant['max_path_length'],
        variant['normalize_env'],
    )
    algorithm = ParallelDDPG(
        remote_env,
        exploration_strategy=es,
        qf=qf,
        policy=policy,
        **variant['algo_params']
    )
    if use_gpu and ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
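# Why RemoteRolloutEnv takes classes and kwargs rather than live objects:
# rollout workers generally reconstruct the env/policy in their own process,
# since live objects (GPU tensors, simulator handles) may not pickle cleanly.
# A minimal sketch of that reconstruction step (`_rebuild_in_worker` is a
# hypothetical name, not part of the codebase):
def _rebuild_in_worker(env_class, env_params, policy_class, policy_params,
                       es_class, es_params):
    env = env_class(**env_params)
    policy = policy_class(**policy_params)
    es = es_class(**es_params)
    return env, policy, es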
def example(variant):
    env_class = variant['env_class']
    env_params = variant['env_params']
    env = env_class(**env_params)
    obs_space = convert_gym_space(env.observation_space)
    action_space = convert_gym_space(env.action_space)

    es_class = variant['es_class']
    es_params = dict(action_space=action_space, **variant['es_params'])
    use_gpu = variant['use_gpu']
    es = es_class(**es_params)

    policy_class = variant['policy_class']
    policy_params = dict(
        obs_dim=int(obs_space.flat_dim),
        action_dim=int(action_space.flat_dim),
        fc1_size=100,
        fc2_size=100,
    )
    policy = policy_class(**policy_params)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    # Rollouts happen in the remote env; take the (possibly normalized)
    # spaces from it so the Q-function dimensions match.
    remote_env = RemoteRolloutEnv(
        env,
        policy,
        exploration_policy,
        variant['max_path_length'],
        variant['normalize_env'],
    )
    qf = FeedForwardQFunction(
        int(remote_env.observation_space.flat_dim),
        int(remote_env.action_space.flat_dim),
        100,
        100,
    )
    algorithm = ParallelDDPG(
        remote_env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if use_gpu and ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
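# A sketch of what PolicyWrappedWithExplorationStrategy does at rollout
# time, assuming the usual interface where the strategy perturbs the raw
# action (the class and method names here are illustrative assumptions):
class ExplorationWrapperSketch:
    def __init__(self, exploration_strategy, policy):
        self.es = exploration_strategy
        self.policy = policy

    def get_action(self, obs):
        action, agent_info = self.policy.get_action(obs)
        # e.g. OU or Gaussian noise is injected by the strategy
        return self.es.get_action_from_raw_action(action), agent_info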
def experiment(variant):
    env_params = variant['env_params']
    env = MultiTaskBaxterEnv(**env_params)
    observation_space = convert_gym_space(env.observation_space)
    action_space = convert_gym_space(env.action_space)

    # Goal-conditioned ("universal") Q-function and policy.
    qf = FlatUniversalQfunction(
        int(observation_space.flat_dim),
        int(action_space.flat_dim),
        env.goal_dim,
        **variant['qf_params']
    )
    policy = FFUniversalPolicy(
        int(observation_space.flat_dim),
        int(action_space.flat_dim),
        env.goal_dim,
        **variant['policy_params']
    )
    es = variant['sampler_es_class'](
        action_space=action_space,
        **variant['sampler_es_params']
    )
    exploration_policy = UniversalPolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    epoch_discount_schedule = variant['epoch_discount_schedule_class'](
        **variant['epoch_discount_schedule_params']
    )
    algo = HorizonFedStateDistanceQLearning(
        env,
        qf,
        policy,
        exploration_policy,
        qf_criterion=HuberLoss(),
        epoch_discount_schedule=epoch_discount_schedule,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algo.cuda()
    algo.train()
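# HuberLoss above is whatever the codebase ships; the standard definition it
# presumably implements is below (a self-contained PyTorch sketch, quadratic
# near zero and linear beyond the `delta` crossover, which keeps Q-learning
# gradients bounded for large TD errors):
import torch
import torch.nn as nn


class HuberLossSketch(nn.Module):
    def __init__(self, delta=1.0):
        super().__init__()
        self.delta = delta

    def forward(self, prediction, target):
        diff = torch.abs(prediction - target)
        quadratic = 0.5 * diff ** 2
        linear = self.delta * (diff - 0.5 * self.delta)
        return torch.where(diff <= self.delta, quadratic, linear).mean()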