def example(variant):
    env = HalfCheetahEnv()
    if variant['normalize']:
        env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = gym.make(variant['env_id'])
    env = NormalizedBoxEnv(env)
    es = GaussianStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[128, 128],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[128, 128],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
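# A minimal, hedged usage sketch for the gym-based launcher above. The
# top-level keys ('env_id', 'algo_params') are the ones experiment() reads;
# the hyperparameter names inside 'algo_params' (num_epochs,
# num_steps_per_epoch, num_steps_per_eval, max_path_length, batch_size) are
# assumptions about what this codebase's DDPG constructor accepts, not
# confirmed values.
if __name__ == "__main__":
    example_variant = dict(
        env_id='HalfCheetah-v2',  # any continuous-control gym id
        algo_params=dict(
            num_epochs=100,
            num_steps_per_epoch=1000,
            num_steps_per_eval=1000,
            max_path_length=1000,
            batch_size=128,
        ),
    )
    experiment(example_variant)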
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    env = NormalizedBoxEnv(env, **variant['normalize_kwargs'])
    if variant['multitask']:
        env = MultitaskToFlatEnv(env)
    es = OUStrategy(action_space=env.action_space, **variant['ou_kwargs'])
    obs_dim = int(env.observation_space.flat_dim)
    action_dim = int(env.action_space.flat_dim)
    obs_normalizer = TorchFixedNormalizer(obs_dim)
    action_normalizer = TorchFixedNormalizer(action_dim)
    qf = MlpQf(
        input_size=obs_dim + action_dim,
        output_size=1,
        obs_normalizer=obs_normalizer,
        action_normalizer=action_normalizer,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        obs_normalizer=obs_normalizer,
        **variant['policy_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf,
        policy,
        exploration_policy,
        obs_normalizer=obs_normalizer,
        action_normalizer=action_normalizer,
        **variant['algo_kwargs']
    )
    algorithm.train()
def experiment(variant):
    env_params = variant['env_params']
    env = SawyerXYZReachingEnv(**env_params)
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    imsize = variant['imsize']
    history = variant['history']
    env = RandomGoalPusher2DEnv()
    partial_obs_size = env.obs_dim
    env = NormalizedBoxEnv(
        ImageMujocoWithObsEnv(
            env,
            imsize=imsize,
            keep_prev=history - 1,
            init_camera=variant['init_camera'],
        )
    )
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    # The Q-function convolves the image history and takes the action plus the
    # low-dimensional state as extra fully-connected input.
    qf = MergedCNN(
        input_width=imsize,
        input_height=imsize,
        output_size=1,
        input_channels=history,
        added_fc_input_size=action_dim + partial_obs_size,
        **variant['cnn_params']
    )
    policy = CNNPolicy(
        input_width=imsize,
        input_height=imsize,
        added_fc_input_size=partial_obs_size,
        output_size=action_dim,
        input_channels=history,
        output_activation=torch.tanh,
        **variant['cnn_params']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    env = MultitaskToFlatEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
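# Hedged sketch of a variant for the multitask launcher above. The key names
# mirror what experiment() reads; MultitaskPusher2DEnv is a hypothetical
# placeholder for whatever multitask env class this repo provides, and the
# contents of 'algo_kwargs' are illustrative assumptions.
multitask_variant = dict(
    env_class=MultitaskPusher2DEnv,  # hypothetical; substitute a real env class
    env_kwargs=dict(),
    normalize=True,
    exploration_type='gaussian',  # one of 'ou', 'gaussian', 'epsilon'
    algo_kwargs=dict(
        num_epochs=100,
        batch_size=128,
    ),
)
# experiment(multitask_variant)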
def experiment(variant):
    env = NormalizedBoxEnv(
        MultiGoalEnv(
            actuation_cost_coeff=10,
            distance_cost_coeff=1,
            goal_reward=10,
        )
    )
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    # Visualize the Q-function and the exploration policy at a few fixed
    # observations while evaluation paths are rendered.
    plotter = QFPolicyPlotter(
        qf=qf,
        policy=exploration_policy,
        obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]]),
        default_action=[np.nan, np.nan],
        n_samples=100,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        render_eval_paths=True,
        plotter=plotter,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def example(variant):
    env_class = variant['env_class']
    env_params = variant['env_params']
    env = env_class(**env_params)
    env = normalize(env)
    es_class = variant['es_class']
    es_params = dict(action_space=env.action_space, **variant['es_params'])
    use_gpu = variant['use_gpu']
    es = es_class(**es_params)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    policy_class = variant['policy_class']
    policy_params = dict(
        obs_dim=get_dim(env.observation_space),
        action_dim=get_dim(env.action_space),
        fc1_size=100,
        fc2_size=100,
    )
    policy = policy_class(**policy_params)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params'],
    )
    if use_gpu and ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    data = joblib.load(GOOD_DDPG_POLICY_PATH)
    expert_policy = data['policy']
    env = NormalizedBoxEnv(variant['env_class']())
    es = OUStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    # Exploration data is collected with the pre-trained expert policy, while
    # DDPG trains the freshly initialized `policy` above.
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=expert_policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        expert_policy.to(ptu.device)
        algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env_params = variant['env_params']
    es_params = variant['es_params']
    env = SawyerXYZReachingEnv(**env_params)
    es = OUStrategy(action_space=env.action_space, **es_params)
    hidden_size = variant['hidden_sizes']
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[hidden_size, hidden_size],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[hidden_size, hidden_size],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    # Relative position control uses very short episodes, so shrink the
    # per-epoch step counts accordingly.
    if variant['env_params']['relative_pos_control']:
        variant['algo_params']['max_path_length'] = 3
        variant['algo_params']['num_steps_per_epoch'] = 15
        variant['algo_params']['num_steps_per_eval'] = 15
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
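# Hedged example variant for the Sawyer reaching launcher above, showing how
# the relative-position-control override interacts with 'algo_params'. The
# env_params / es_params contents and the algo hyperparameter values are
# illustrative assumptions.
sawyer_variant = dict(
    env_params=dict(relative_pos_control=True),
    es_params=dict(),
    hidden_sizes=400,  # both hidden layers use this width
    algo_params=dict(
        num_epochs=50,
        batch_size=128,
        # max_path_length, num_steps_per_epoch, and num_steps_per_eval are
        # overwritten to 3 / 15 / 15 inside experiment() when
        # relative_pos_control is True.
    ),
)
# experiment(sawyer_variant)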