def get_td3_trainer(env, hidden_sizes=[256, 256], **kwargs):
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                    hidden_sizes=hidden_sizes)
    qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                    hidden_sizes=hidden_sizes)
    target_qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                           hidden_sizes=hidden_sizes)
    target_qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                           hidden_sizes=hidden_sizes)
    policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim,
                           hidden_sizes=hidden_sizes)
    target_policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim,
                                  hidden_sizes=hidden_sizes)
    # hidden_sizes only configures the networks; extra keyword arguments are
    # forwarded to the trainer instead.
    trainer = TD3Trainer(policy=policy, qf1=qf1, qf2=qf2,
                         target_qf1=target_qf1, target_qf2=target_qf2,
                         target_policy=target_policy, **kwargs)
    return trainer
def get_td3pg(evaluation_environment, parameters):
    """
    Build the networks, exploration policy, and TD3 trainer.

    :param evaluation_environment: Environment used to read observation/action dimensions
        and the action space for the exploration strategy.
    :param parameters: Dict with 'hidden_sizes_qf', 'hidden_sizes_policy', and 'trainer_params'.
    :return: (exploration_policy, policy, trainer)
    """
    obs_dim = evaluation_environment.observation_space.low.size
    action_dim = evaluation_environment.action_space.low.size
    hidden_sizes_qf = parameters['hidden_sizes_qf']
    hidden_sizes_policy = parameters['hidden_sizes_policy']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=hidden_sizes_policy,
    )
    target_policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=hidden_sizes_policy,
    )
    es = GaussianStrategy(
        action_space=evaluation_environment.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    trainer = TD3Trainer(policy=policy, qf1=qf1, qf2=qf2,
                         target_qf1=target_qf1, target_qf2=target_qf2,
                         target_policy=target_policy,
                         **parameters['trainer_params'])
    return exploration_policy, policy, trainer
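# Illustrative `parameters` dict for get_td3pg. The key names are the ones the function
# reads; the trainer_params entries shown are common TD3Trainer options and the values
# are placeholders, not tuned settings.
example_td3pg_parameters = dict(
    hidden_sizes_qf=[400, 300],
    hidden_sizes_policy=[400, 300],
    trainer_params=dict(
        discount=0.99,
        tau=0.005,
    ),
)
# exploration_policy, policy, trainer = get_td3pg(some_env, example_td3pg_parameters)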
def train_distilled_policy(num_tasks, policies=None, epochs_per_task=500,
                           batch_size=100, lr=1e-3):
    """
    Trains a distilled policy (using an optimal expert or a trained expert).
    Saves the policy in a .pkl file along with the env and the loss history.

    :param num_tasks: Number of tasks/policies to distill.
    :param policies: A list of length `num_tasks` containing all the individual experts.
    :param epochs_per_task: Number of training epochs per task.
    :param batch_size: Batch sample size per update step.
    :param lr: Learning rate of the optimizer.
    :return: The trained policy and the environment.
    """
    base_env = PointMassEnv(n=num_tasks)
    env = FlatGoalEnv(base_env, append_goal_to_obs=True)
    obs_dim = env.observation_space.low.size
    act_dim = env.action_space.low.size
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=act_dim,
        hidden_sizes=[64, 64],
        # hidden_sizes=[64, 64, 64]
    )
    loss_history = []
    criterion = nn.MSELoss()
    optim = Adam(policy.parameters(), lr=lr)
    if policies:
        assert len(policies) == num_tasks, \
            "Number of expert policies needs to be equal to the number of tasks"
    for epoch in range(epochs_per_task * num_tasks):
        obs, act_labels = get_batch(env, batch_size, policies)
        obs_var = Variable(torch.from_numpy(obs)).float()
        act_labels_var = Variable(torch.from_numpy(act_labels)).float()
        acts = policy(obs_var)
        optim.zero_grad()
        loss = criterion(acts, act_labels_var)
        loss.backward()
        optim.step()
        loss_val = loss.data.item()
        loss_history.append(loss_val)
        if epoch % 50 == 0:
            print("epoch: {0} \t loss: {1}".format(epoch, loss_val))
    print("FINAL loss: {0}".format(loss_history[-1]))
    out = dict(policy=policy, env=env, loss_history=loss_history)
    appended_path = "-from_expert_policies" if policies else ""
    path = "./logs/policy-distillation/model-{0}{1}.pkl".format(
        num_tasks, appended_path)
    with open(path, "wb") as f:
        pickle.dump(out, f, protocol=pickle.HIGHEST_PROTOCOL)
    return policy, env
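# Usage sketch for train_distilled_policy (assumes PointMassEnv, FlatGoalEnv, and the
# get_batch helper referenced above are importable from this project; the numbers are
# illustrative only):
# distilled_policy, distill_env = train_distilled_policy(
#     num_tasks=4,
#     policies=None,          # None presumably lets get_batch fall back to the optimal expert
#     epochs_per_task=500,
#     batch_size=100,
#     lr=1e-3,
# )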
def experiment(variant):
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                     **variant["qf_kwargs"])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                     **variant["qf_kwargs"])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                            **variant["qf_kwargs"])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                            **variant["qf_kwargs"])
    policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim,
                           **variant["policy_kwargs"])
    target_policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim,
                                  **variant["policy_kwargs"])
    es = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es, policy=policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    trainer = TD3Trainer(policy=policy, qf1=qf1, qf2=qf2,
                         target_qf1=target_qf1, target_qf2=target_qf2,
                         target_policy=target_policy,
                         **variant["trainer_kwargs"])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
def get_ddpg(evaluation_environment, parameters):
    obs_dim = evaluation_environment.observation_space.low.size
    action_dim = evaluation_environment.action_space.low.size
    hidden_sizes_qf = parameters['hidden_sizes_qf']
    hidden_sizes_policy = parameters['hidden_sizes_policy']
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=hidden_sizes_policy,
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(
            action_space=evaluation_environment.action_space),
        policy=policy,
    )
    trainer = DDPGTrainer(qf=qf, target_qf=target_qf, policy=policy,
                          target_policy=target_policy,
                          **parameters['trainer_params'])
    return exploration_policy, policy, trainer
def experiment(variant):
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    env = NormalizedBoxEnv(create_swingup())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class']())
    es = GaussianStrategy(action_space=env.action_space, **variant['es_kwargs'])
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                    **variant['qf_kwargs'])
    qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim,
                           **variant['policy_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(env, qf1=qf1, qf2=qf2, policy=policy,
                    exploration_policy=exploration_policy,
                    **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def example(variant):
    env = variant['env_class']()
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    qf = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                    **variant['vf_params'])
    vf = FlattenMlp(input_size=obs_dim, output_size=1, **variant['vf_params'])
    policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim,
                           **variant['policy_params'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = N3DPG(env, qf=qf, vf=vf, policy=policy,
                      exploration_policy=exploration_policy,
                      **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    farmlist_base = [('123.123.123.123', 4)]
    farmer = Farmer(farmlist_base)
    environment = acq_remote_env(farmer)
    env = NormalizedBoxEnv(environment)
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env, qf=qf, policy=policy,
                     exploration_policy=exploration_policy,
                     **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    env = gym.make('replab-v0')._start_rospy(goal_oriented=False)
    # SIM
    # env = gym.make('replab-v0')._start_sim(goal_oriented=False, render=False)
    env = NormalizedBoxEnv(env)
    es = GaussianStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env, qf=qf, policy=policy,
                     exploration_policy=exploration_policy,
                     **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    '''
    1. Set up the experiment environments (eval, expl).
    2. Determine the input/output dimensions and build the qf and policy networks.
    3. Copy them to create the target qf and target policy networks.
    4. Build a path collector for evaluation.
    5. For exploration, build the exploration strategy, path collector, and replay buffer.
    6. Build the DDPGTrainer (qf, policy).
    7. Build the algorithm (trainer, envs, replay buffer, path collectors, and the
       evaluation components).
    8. Start training.
    A hypothetical example `variant` is sketched after this function.

    :param variant: config parameters
    :return:
    '''
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim,
                           **variant['policy_kwargs'])
    # Create the target networks via copy
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    # Evaluation
    eval_path_collector = MdpPathCollector(eval_env, policy)
    # Exploration (exploration strategy, path collection, replay buffer)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(qf=qf, target_qf=target_qf, policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    # Move everything to the configured device
    algorithm.to(ptu.device)
    algorithm.train()
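# A hypothetical `variant` for the DDPG experiment above. The key names are the ones
# the function reads; the values mirror common rlkit DDPG example settings and are
# illustrative rather than tuned.
example_ddpg_variant = dict(
    algorithm_kwargs=dict(
        num_epochs=100,
        num_eval_steps_per_epoch=1000,
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop=1000,
        min_num_steps_before_training=1000,
        max_path_length=1000,
        batch_size=128,
    ),
    trainer_kwargs=dict(
        use_soft_update=True,
        tau=1e-2,
        discount=0.99,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
    ),
    qf_kwargs=dict(hidden_sizes=[400, 300]),
    policy_kwargs=dict(hidden_sizes=[400, 300]),
    replay_buffer_size=int(1E6),
)
# experiment(example_ddpg_variant)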
def experiment(variant):
    if variant['multitask']:
        env = CylinderXYPusher2DEnv(**variant['env_kwargs'])
        env = MultitaskToFlatEnv(env)
    else:
        env = Pusher2DEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def td3_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    env = MultitaskToFlatEnv(env)
    if variant.get('make_silent_env', True):
        env = MultitaskEnvToSilentMultitaskEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_dim
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = variant['replay_buffer_class'](
        env=env,
        **variant['replay_buffer_kwargs'])
    algorithm = HerTd3(env, qf1=qf1, qf2=qf2, policy=policy,
                       exploration_policy=exploration_policy,
                       replay_buffer=replay_buffer,
                       **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    action_dim = env.action_space.low.size
    obs_dim = env.observation_space.low.size
    qf = ConcatMlp(input_size=action_dim + obs_dim, output_size=1,
                   **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim,
                           **variant['policy_kwargs'])
    algorithm = FiniteHorizonDDPG(env, qf, policy, **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = gym.make('SawyerReachXYZEnv-v0')
    es = GaussianAndEpislonStrategy(
        action_space=env.action_space,
        max_sigma=.2,
        min_sigma=.2,  # constant sigma
        epsilon=.3,
    )
    obs_dim = env.observation_space.spaces['observation'].low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        achieved_goal_key='state_achieved_goal',
        desired_goal_key='state_desired_goal',
        **variant['replay_buffer_kwargs']
    )
    algorithm = HerTd3(
        env=env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = gym.make('FetchReach-v1')
    es = GaussianAndEpsilonStrategy(
        action_space=env.action_space,
        max_sigma=.2,
        min_sigma=.2,  # constant sigma
        epsilon=.3,
    )
    obs_dim = env.observation_space.spaces['observation'].low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = ObsDictRelabelingBuffer(env=env,
                                            **variant['replay_buffer_kwargs'])
    algorithm = HerTd3(
        her_kwargs={
            "observation_key": "observation",
            "desired_goal_key": "desired_goal",
        },
        td3_kwargs={
            "env": env,
            "qf1": qf1,
            "qf2": qf2,
            "policy": policy,
            "exploration_policy": exploration_policy,
            "replay_buffer": replay_buffer,
        },
        **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = gym.make('replab-v0')._start_rospy(goal_oriented=True)
    # SIM
    # env = gym.make('replab-v0')._start_sim(goal_oriented=True, render=False)
    env = NormalizedBoxEnv(env)
    es = GaussianAndEpislonStrategy(
        action_space=env.action_space,
        max_sigma=.2,
        min_sigma=.2,  # constant sigma
        epsilon=.3,
    )
    obs_dim = env.observation_space.spaces['observation'].low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = ObsDictRelabelingBuffer(env=env,
                                            **variant['replay_buffer_kwargs'])
    algorithm = HerTd3(
        her_kwargs=dict(observation_key='observation',
                        desired_goal_key='desired_goal'),
        td3_kwargs=dict(env=env, qf1=qf1, qf2=qf2, policy=policy,
                        exploration_policy=exploration_policy),
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    logger.add_text_output('./d_text.txt')
    logger.add_tabular_output('./d_tabular.txt')
    logger.set_snapshot_dir('./snaps')
    farmer = Farmer([('0.0.0.0', 1)])
    remote_env = farmer.force_acq_env()
    remote_env.set_spaces()
    env = NormalizedBoxEnv(remote_env)
    es = GaussianStrategy(
        action_space=env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[256, 256],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[256, 256],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[256, 256],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(env, qf1=qf1, qf2=qf2, policy=policy,
                    exploration_policy=exploration_policy,
                    **variant['algo_kwargs'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = NormalizedBoxEnv(CartPoleEnv(mode=0))
    eval_env = NormalizedBoxEnv(CartPoleEnv(mode=0))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    # Robot
    env = gym.make('replab-v0')._start_rospy(goal_oriented=False)
    # SIM
    # env = gym.make('replab-v0')._start_sim(goal_oriented=False, render=False)
    env.action_space.low *= 10
    env.action_space.high *= 10
    env = NormalizedBoxEnv(env)
    es = GaussianStrategy(
        action_space=env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant, env=None):
    if env is None:
        # default setting of environment
        env = NormalizedBoxEnv(HopperEnv())
    es = GaussianStrategy(
        action_space=env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    # Note: qf2 is constructed but not used below; DDPG only takes a single qf.
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf1,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    env = gym.make('RLkitUR-v0')._start_ros_services()
    eval_env = gym.make('RLkitUR-v0')
    expl_env = gym.make('RLkitUR-v0')
    eval_env = NormalizedBoxEnv(eval_env)
    expl_env = NormalizedBoxEnv(expl_env)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    print("obs_dim: ", obs_dim)
    print("action_dim: ", action_dim)
    qf = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim,
                           **variant['policy_kwargs'])
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(qf=qf, target_qf=target_qf, policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = CylinderXYPusher2DEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    es = EpsilonGreedy(
        action_space=env.action_space,
        prob_random_action=0.1,
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_dim
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = SimpleHerReplayBuffer(env=env,
                                          **variant['replay_buffer_kwargs'])
    algorithm = HerTd3(env, qf1=qf1, qf2=qf2, policy=policy,
                       exploration_policy=exploration_policy,
                       replay_buffer=replay_buffer,
                       **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def get_ddpg_trainer(env, hidden_sizes=[256, 256]):
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                   hidden_sizes=hidden_sizes)
    policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim,
                           hidden_sizes=hidden_sizes)
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    trainer = DDPGTrainer(qf=qf, target_qf=target_qf, policy=policy,
                          target_policy=target_policy,
                          use_soft_update=True,
                          tau=1e-2,
                          discount=0.99,
                          qf_learning_rate=1e-3,
                          policy_learning_rate=1e-4)
    return trainer
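# Usage sketch for the factory above (the environment choice is illustrative; any
# Box-action gym env wrapped in NormalizedBoxEnv should work the same way):
# import gym
# env = NormalizedBoxEnv(gym.make('HalfCheetah-v2'))
# ddpg_trainer = get_ddpg_trainer(env, hidden_sizes=[256, 256])
# The returned trainer can then be combined with path collectors and a replay buffer
# in a TorchBatchRLAlgorithm, as in the full experiment functions elsewhere in this file.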
def experiment(variant):
    logger.add_text_output('./d_text.txt')
    logger.add_tabular_output('./d_tabular.txt')
    logger.set_snapshot_dir('./snaps')
    farmer = Farmer([('0.0.0.0', 1)])
    remote_env = farmer.force_acq_env()
    remote_env.set_spaces()
    env = NormalizedBoxEnv(remote_env)
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    net_size = variant['net_size']
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[net_size, net_size],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env, qf=qf, policy=policy,
                     exploration_policy=exploration_policy,
                     **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    env = NormalizedBoxEnv(HopperEnv())
    es = GaussianStrategy(
        action_space=env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def get_danger(env, parameters):
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    hidden_sizes_qf = parameters['hidden_sizes_qf']
    hidden_sizes_policy = parameters['hidden_sizes_policy']
    qf_danger = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
        output_activation=torch.sigmoid,
    )
    policy_danger = TanhMlpPolicy(
        input_size=obs_dim,
        hidden_sizes=hidden_sizes_policy,
        output_size=action_dim,
    )
    danger_trainer = DangerTrainer(qf_danger=qf_danger,
                                   policy_danger=policy_danger,
                                   **parameters['trainer_danger_params'])
    return policy_danger, qf_danger, danger_trainer