def experiment(variant):
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v2'))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def experiment(variant):
    # env = NormalizedBoxEnv(Reacher7DofXyzGoalState())
    env = NormalizedBoxEnv(MultitaskPoint2DEnv())
    vectorized = True
    policy = StochasticTdmPolicy(
        env=env,
        **variant['policy_kwargs']
    )
    qf = TdmQf(
        env=env,
        vectorized=vectorized,
        norm_order=2,
        **variant['qf_kwargs']
    )
    vf = TdmVf(
        env=env,
        vectorized=vectorized,
        **variant['vf_kwargs']
    )
    replay_buffer_size = variant['algo_params']['base_kwargs'][
        'replay_buffer_size']
    replay_buffer = HerReplayBuffer(replay_buffer_size, env)
    algorithm = TdmSac(
        env,
        qf,
        vf,
        variant['algo_params']['sac_kwargs'],
        variant['algo_params']['tdm_kwargs'],
        variant['algo_params']['base_kwargs'],
        supervised_weight=variant['algo_params']['supervised_weight'],
        policy=policy,
        replay_buffer=replay_buffer,
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def __init__(
        self,
        train_dataset,
        test_dataset,
        model,
        batch_size=128,
        log_interval=0,
        lr=1e-3,
        **kwargs
):
    self.log_interval = log_interval
    self.batch_size = batch_size
    if ptu.gpu_enabled():
        model.cuda()
    self.model = model
    self.representation_size = model.representation_size
    self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
    self.train_dataset, self.test_dataset = train_dataset, test_dataset
    assert self.train_dataset['z'].dtype == np.float32
    assert self.test_dataset['z'].dtype == np.float32
    assert self.train_dataset['z_proj'].dtype == np.float32
    assert self.test_dataset['z_proj'].dtype == np.float32
    self.mse = nn.MSELoss()

def update_networks_func(algo, epoch):
    # `variant` is captured from the enclosing experiment scope.
    # Skip epochs for which no checkpoint was saved.
    if epoch % algo.epoch_freq != 0 and epoch != algo.num_epochs - 1:
        return
    if epoch == algo.num_epochs - 1:
        filename = local_path_from_s3_or_local_path(
            osp.join(variant['ckpt'], 'params.pkl'))
    else:
        filename = local_path_from_s3_or_local_path(
            osp.join(variant['ckpt'], 'itr_%d.pkl' % epoch))
    print("updating networks from {}".format(filename))
    data = joblib.load(filename)
    assert data["epoch"] == epoch
    algo.qf1 = data['qf1']
    algo.qf2 = data['qf2']
    algo.policy = data['trained_policy']
    algo.target_policy = data["target_policy"]
    algo.exploration_policy = data["exploration_policy"]
    if 'n_env_steps_total' in data:
        algo._n_env_steps_total = data["n_env_steps_total"]
    if isinstance(algo.eval_policy, SubgoalPlanner):
        algo.eval_policy.qf = algo.qf1
        algo.eval_policy.mf_policy = algo.policy
    else:
        algo.eval_policy = data["eval_policy"]
    if ptu.gpu_enabled():
        algo.cuda()
    if hasattr(algo, "update_sampler_and_rollout_function"):
        algo.update_sampler_and_rollout_function()

def experiment(variant):
    env_params = variant['env_params']
    env = SawyerXYZReachingEnv(**env_params)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    hidden_size = variant['hidden_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[hidden_size, hidden_size],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[hidden_size, hidden_size],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[hidden_size, hidden_size],
    )
    es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def experiment(variant): from railrl.core import logger import railrl.torch.pytorch_util as ptu beta = variant["beta"] representation_size = variant["representation_size"] train_data, test_data, info = generate_vae_dataset( **variant['get_data_kwargs']) logger.save_extra_data(info) logger.get_snapshot_dir() if 'beta_schedule_kwargs' in variant: beta_schedule = PiecewiseLinearSchedule( **variant['beta_schedule_kwargs']) else: beta_schedule = None m = ConvVAE(representation_size, input_channels=3) if ptu.gpu_enabled(): m.to(ptu.device) gpu_id = variant.get("gpu_id", None) if gpu_id is not None: ptu.set_device(gpu_id) t = ConvVAETrainer(train_data, test_data, m, beta=beta, beta_schedule=beta_schedule, **variant['algo_kwargs']) save_period = variant['save_period'] for epoch in range(variant['num_epochs']): should_save_imgs = (epoch % save_period == 0) t.train_epoch(epoch) t.test_epoch(epoch, save_reconstruction=should_save_imgs, save_scatterplot=should_save_imgs) if should_save_imgs: t.dump_samples(epoch)
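# Illustrative only: the keys the VAE-training launcher above reads from
# `variant`. The values and sub-dict contents below are assumptions for a
# quick sketch, not the settings used in any original experiment.
example_vae_variant = dict(
    beta=5.0,
    representation_size=16,
    get_data_kwargs=dict(),   # forwarded to generate_vae_dataset
    algo_kwargs=dict(),       # forwarded to ConvVAETrainer
    save_period=5,
    num_epochs=100,
    # Optional: beta_schedule_kwargs=dict(x_values=[...], y_values=[...])
    # Optional: gpu_id=0
)
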
def experiment(variant):
    env_params = variant['env_params']
    env = SawyerXYZReachingEnv(**env_params)
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def experiment(variant):
    num_rollouts = variant['num_rollouts']
    H = variant['H']
    render = variant['render']
    data = joblib.load(variant['qf_path'])
    qf = data['qf']
    env = data['env']
    qf_policy = data['policy']
    if ptu.gpu_enabled():
        qf.to(ptu.device)
        qf_policy.to(ptu.device)
    policy_class = variant['policy_class']
    if policy_class == StateOnlySdqBasedSqpOcPolicy:
        policy = policy_class(qf, env, qf_policy, **variant['policy_params'])
    else:
        policy = policy_class(qf, env, **variant['policy_params'])
    paths = []
    for _ in range(num_rollouts):
        goal = env.sample_goal_for_rollout()
        path = multitask_rollout(
            env,
            policy,
            goal,
            discount=variant['discount'],
            max_path_length=H,
            animated=render,
        )
        paths.append(path)
    env.log_diagnostics(paths)
    logger.dump_tabular(with_timestamp=False)

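# Illustrative only: a variant for the rollout-evaluation launcher above. The
# checkpoint path is a hypothetical placeholder, and policy_params depends on
# the chosen policy class; values are assumptions, not original settings.
example_eval_variant = dict(
    qf_path='/path/to/params.pkl',   # hypothetical checkpoint path
    policy_class=StateOnlySdqBasedSqpOcPolicy,
    policy_params=dict(),
    num_rollouts=10,
    H=100,
    discount=0.99,
    render=False,
)
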
def experiment(variant):
    env_params = variant['env_params']
    env = SawyerXYZReachingEnv(**env_params)
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def experiment(variant):
    num_rollouts = variant['num_rollouts']
    H = variant['H']
    render = variant['render']
    data = joblib.load(variant['qf_path'])
    policy_params = variant['policy_params']
    if 'model' in data:
        model = data['model']
    else:
        qf = data['qf']
        model = ModelExtractor(qf)
        policy_params['model_learns_deltas'] = False
    env = data['env']
    if ptu.gpu_enabled():
        model.to(ptu.device)
    policy = variant['policy_class'](
        model,
        env,
        **policy_params
    )
    paths = []
    for _ in range(num_rollouts):
        goal = env.sample_goal_for_rollout()
        path = multitask_rollout(
            env,
            policy,
            goal,
            discount=0,
            max_path_length=H,
            animated=render,
        )
        paths.append(path)
    env.log_diagnostics(paths)
    logger.dump_tabular(with_timestamp=False)

def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if variant['multitask']:
        env = MultitaskEnvToSilentMultitaskEnv(env)
    env = NormalizedBoxEnv(env, **variant['normalize_kwargs'])
    observation_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    obs_normalizer = TorchFixedNormalizer(observation_dim)
    action_normalizer = TorchFixedNormalizer(action_dim)
    delta_normalizer = TorchFixedNormalizer(observation_dim)
    model = DynamicsModel(
        observation_dim=observation_dim,
        action_dim=action_dim,
        obs_normalizer=obs_normalizer,
        action_normalizer=action_normalizer,
        delta_normalizer=delta_normalizer,
        **variant['model_kwargs']
    )
    mpc_controller = MPCController(
        env,
        model,
        env.cost_fn,
        **variant['mpc_controller_kwargs']
    )
    es = OUStrategy(action_space=env.action_space, **variant['ou_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=mpc_controller,
    )
    algo = DistanceModelTrainer(
        env,
        model,
        mpc_controller,
        exploration_policy=exploration_policy,
        obs_normalizer=obs_normalizer,
        action_normalizer=action_normalizer,
        delta_normalizer=delta_normalizer,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        algo.to(ptu.device)
    algo.train()

def experiment(variant):
    # if variant['multitask']:
    #     env = MultitaskPoint2DEnv(**variant['env_kwargs'])
    #     env = MultitaskToFlatEnv(env)
    # else:
    #     env = Pusher2DEnv(**variant['env_kwargs'])
    env_name = variant["env_name"]
    env = gym.make(env_name)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

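# Illustrative only: the variant keys the TD3 launcher above expects.
# exploration_type must be one of 'ou', 'gaussian', or 'epsilon'; algo_kwargs
# is forwarded to TD3 and is left empty here since its contents are not shown
# in the snippet above.
example_td3_variant = dict(
    env_name='HalfCheetah-v2',
    normalize=True,
    exploration_type='ou',
    algo_kwargs=dict(),
)
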
def experiment(variant):
    env_class = variant['env_class']
    env = env_class(**variant['env_params'])
    env = NormalizedBoxEnv(
        env,
        **variant['normalize_params']
    )
    observation_space = convert_gym_space(env.observation_space)
    action_space = convert_gym_space(env.action_space)
    qf = variant['qf_class'](
        int(observation_space.flat_dim),
        int(action_space.flat_dim),
        env.goal_dim,
        **variant['qf_params']
    )
    policy = FFUniversalPolicy(
        int(observation_space.flat_dim),
        int(action_space.flat_dim),
        env.goal_dim,
        **variant['policy_params']
    )
    epoch_discount_schedule = None
    epoch_discount_schedule_class = variant['epoch_discount_schedule_class']
    if epoch_discount_schedule_class is not None:
        epoch_discount_schedule = epoch_discount_schedule_class(
            **variant['epoch_discount_schedule_params']
        )
    qf_criterion = variant['qf_criterion_class'](
        **variant['qf_criterion_params']
    )
    es = variant['sampler_es_class'](
        action_space=action_space,
        **variant['sampler_es_params']
    )
    if variant['explore_with_ddpg_policy']:
        raw_exploration_policy = policy
    else:
        raw_exploration_policy = TerminalRewardSampleOCPolicy(
            qf,
            env,
            5,
        )
    exploration_policy = UniversalPolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=raw_exploration_policy,
    )
    algo = variant['algo_class'](
        env,
        qf,
        policy,
        exploration_policy,
        epoch_discount_schedule=epoch_discount_schedule,
        qf_criterion=qf_criterion,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algo.cuda()
    algo.train()

def example(variant):
    load_policy_file = variant.get('load_policy_file', None)
    if load_policy_file is not None and exists(load_policy_file):
        data = joblib.load(load_policy_file)
        algorithm = data['algorithm']
        epochs = algorithm.num_epochs - data['epoch']
        algorithm.num_epochs = epochs
        use_gpu = variant['use_gpu']
        if use_gpu and ptu.gpu_enabled():
            algorithm.cuda()
        algorithm.train()
    else:
        es_min_sigma = variant['es_min_sigma']
        es_max_sigma = variant['es_max_sigma']
        num_epochs = variant['num_epochs']
        batch_size = variant['batch_size']
        use_gpu = variant['use_gpu']
        dueling = variant['dueling']

        env = normalize(gym_env('Reacher-v1'))
        es = OUStrategy(
            max_sigma=es_max_sigma,
            min_sigma=es_min_sigma,
            action_space=env.action_space,
        )
        if dueling:
            qf = FeedForwardDuelingQFunction(
                int(env.observation_space.flat_dim),
                int(env.action_space.flat_dim),
                100,
                100,
            )
        else:
            qf = FeedForwardQFunction(
                int(env.observation_space.flat_dim),
                int(env.action_space.flat_dim),
                100,
                100,
            )
        policy = FeedForwardPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            100,
            100,
        )
        algorithm = DDPG(
            env,
            qf,
            policy,
            es,
            num_epochs=num_epochs,
            batch_size=batch_size,
        )
        if use_gpu:
            algorithm.cuda()
        algorithm.train()

def resume_torch_algorithm_simple(variant):
    from railrl.torch import pytorch_util as ptu
    load_file = variant.get('params_file', None)
    if load_file is not None and osp.exists(load_file):
        data = joblib.load(load_file)
        algorithm = data['algorithm']
        # Resume from the epoch after the one stored in the snapshot.
        epoch = data['epoch'] + 1
        if ptu.gpu_enabled():
            algorithm.cuda()
        algorithm.train(start_epoch=epoch)

def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    variant['algo_kwargs'] = dict(
        num_epochs=variant['num_epochs'],
        num_steps_per_epoch=variant['num_steps_per_epoch'],
        num_steps_per_eval=variant['num_steps_per_eval'],
        max_path_length=variant['max_path_length'],
        min_num_steps_before_training=variant['min_num_steps_before_training'],
        batch_size=variant['batch_size'],
        discount=variant['discount'],
        replay_buffer_size=variant['replay_buffer_size'],
        soft_target_tau=variant['soft_target_tau'],
        target_update_period=variant['target_update_period'],
        train_policy_with_reparameterization=variant[
            'train_policy_with_reparameterization'],
        policy_lr=variant['policy_lr'],
        qf_lr=variant['qf_lr'],
        vf_lr=variant['vf_lr'],
        reward_scale=variant['reward_scale'],
        use_automatic_entropy_tuning=variant.get(
            'use_automatic_entropy_tuning', False),
    )

    M = variant['layer_size']
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
        # **variant['qf_kwargs']
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
        # **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
        # **variant['policy_kwargs']
    )
    algorithm = SoftActorCritic(
        env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        qf.cuda()
        vf.cuda()
        policy.cuda()
        algorithm.cuda()
    algorithm.train()

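# Illustrative only: a variant dict with the flat keys the SAC launcher above
# reads when assembling algo_kwargs. The values are assumptions suitable for a
# quick smoke test, not the hyperparameters of any original experiment, and
# env_class must be replaced with a Box-observation env class.
example_sac_variant = dict(
    env_class=None,  # e.g. a multitask env class from this codebase
    layer_size=256,
    num_epochs=100,
    num_steps_per_epoch=1000,
    num_steps_per_eval=1000,
    max_path_length=100,
    min_num_steps_before_training=1000,
    batch_size=128,
    discount=0.99,
    replay_buffer_size=int(1e6),
    soft_target_tau=1e-2,
    target_update_period=1,
    train_policy_with_reparameterization=True,
    policy_lr=3e-4,
    qf_lr=3e-4,
    vf_lr=3e-4,
    reward_scale=1.0,
    use_automatic_entropy_tuning=False,
)
# experiment(example_sac_variant)
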
def experiment(variant):
    env = DiscreteReacherEnv(**variant['env_params'])
    qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=int(np.prod(env.observation_space.shape)),
        output_size=env.action_space.n,
    )
    algorithm = DQN(env, qf=qf, **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def her_twin_sac_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    observation_key = variant.get('observation_key', 'observation')
    desired_goal_key = variant.get('desired_goal_key', 'desired_goal')
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['replay_buffer_kwargs']
    )
    obs_dim = env.observation_space.spaces['observation'].low.size
    action_dim = env.action_space.low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    vf = FlattenMlp(
        input_size=obs_dim + goal_dim,
        output_size=1,
        **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim + goal_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    algorithm = HerTwinSac(
        env,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        policy=policy,
        replay_buffer=replay_buffer,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        qf1.to(ptu.device)
        qf2.to(ptu.device)
        vf.to(ptu.device)
        policy.to(ptu.device)
        algorithm.to(ptu.device)
    algorithm.train()

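# Illustrative only: the variant layout the HER + Twin SAC launcher above
# expects. Sub-dict contents are assumptions; hidden_sizes mirrors how the
# other launchers in this file configure FlattenMlp networks.
example_her_twin_sac_variant = dict(
    env_class=None,  # replace with a goal-conditioned (dict-observation) env class
    env_kwargs=dict(),
    observation_key='observation',
    desired_goal_key='desired_goal',
    normalize=False,
    replay_buffer_kwargs=dict(),
    qf_kwargs=dict(hidden_sizes=[400, 300]),
    vf_kwargs=dict(hidden_sizes=[400, 300]),
    policy_kwargs=dict(hidden_sizes=[400, 300]),
    algo_kwargs=dict(),
)
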
def experiment(variant): from railrl.core import logger import railrl.torch.pytorch_util as ptu beta = variant["beta"] representation_size = variant["representation_size"] # train_data, test_data, info = generate_vae_dataset( # **variant['get_data_kwargs'] # ) num_divisions = 5 images = np.zeros((num_divisions * 10000, 21168)) for i in range(num_divisions): imgs = np.load( '/home/murtaza/vae_data/sawyer_torque_control_images100000_' + str(i + 1) + '.npy') images[i * 10000:(i + 1) * 10000] = imgs print(i) mid = int(num_divisions * 10000 * .9) train_data, test_data = images[:mid], images[mid:] info = dict() logger.save_extra_data(info) logger.get_snapshot_dir() if 'beta_schedule_kwargs' in variant: kwargs = variant['beta_schedule_kwargs'] kwargs['y_values'][2] = variant['beta'] kwargs['x_values'][1] = variant['flat_x'] kwargs['x_values'][2] = variant['ramp_x'] + variant['flat_x'] beta_schedule = PiecewiseLinearSchedule( **variant['beta_schedule_kwargs']) else: beta_schedule = None m = ConvVAE(representation_size, input_channels=3, **variant['conv_vae_kwargs']) if ptu.gpu_enabled(): m.cuda() t = ConvVAETrainer(train_data, test_data, m, beta=beta, beta_schedule=beta_schedule, **variant['algo_kwargs']) save_period = variant['save_period'] for epoch in range(variant['num_epochs']): should_save_imgs = (epoch % save_period == 0) t.train_epoch(epoch) t.test_epoch(epoch, save_reconstruction=should_save_imgs, save_scatterplot=should_save_imgs) if should_save_imgs: t.dump_samples(epoch)
def __init__(
        self,
        train_dataset,
        test_dataset,
        model,
        batch_size=128,
        beta=0.5,
        beta_schedule=None,
        lr=1e-3,
        extra_recon_logging=dict(),
        recon_weights=None,
        recon_loss_type='mse',
        **kwargs
):
    assert recon_loss_type in ['mse', 'wse']
    self.batch_size = batch_size
    self.beta = beta
    self.beta_schedule = beta_schedule
    if self.beta_schedule is None:
        self.beta_schedule = ConstantSchedule(self.beta)
    if ptu.gpu_enabled():
        model.cuda()
    self.model = model
    self.representation_size = model.representation_size
    self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
    self.train_dataset, self.test_dataset = train_dataset, test_dataset
    assert self.train_dataset['next_obs'].dtype == np.float32
    assert self.test_dataset['next_obs'].dtype == np.float32
    assert self.train_dataset['obs'].dtype == np.float32
    assert self.test_dataset['obs'].dtype == np.float32
    self.normalize = model.normalize
    self.mse = nn.MSELoss()
    if self.normalize:
        self.train_data_mean = ptu.np_to_var(
            np.mean(self.train_dataset['next_obs'], axis=0))
        np_std = np.std(self.train_dataset['next_obs'], axis=0)
        for i in range(len(np_std)):
            if np_std[i] < 1e-3:
                np_std[i] = 1.0
        self.train_data_std = ptu.np_to_var(np_std)
        self.model.train_data_mean = self.train_data_mean
        self.model.train_data_std = self.train_data_std
    self.extra_recon_logging = extra_recon_logging
    self.recon_weights = recon_weights
    self.recon_loss_type = recon_loss_type

def experiment(variant):
    env_params = variant['env_params']
    env = SawyerXYReachingEnv(**env_params)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[100, 100],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[100, 100],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[100, 100],
    )
    # es = GaussianStrategy(
    #     action_space=env.action_space,
    #     **variant['es_kwargs']
    # )
    # es = EpsilonGreedy(
    #     action_space=env.action_space,
    #     prob_random_action=0.2,
    # )
    es = OUStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def experiment(variant):
    env_params = variant['env_params']
    env = MultiTaskSawyerXYZReachingEnv(env_params)
    tdm_normalizer = TdmNormalizer(
        env,
        vectorized=True,
        max_tau=variant['ddpg_tdm_kwargs']['tdm_kwargs']['max_tau'],
    )
    qf = TdmQf(
        env=env,
        vectorized=True,
        hidden_sizes=[variant['hidden_sizes'], variant['hidden_sizes']],
        structure='norm_difference',
        tdm_normalizer=tdm_normalizer,
    )
    policy = TdmPolicy(
        env=env,
        hidden_sizes=[variant['hidden_sizes'], variant['hidden_sizes']],
        tdm_normalizer=tdm_normalizer,
    )
    es = OUStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_kwargs']
    )
    qf_criterion = variant['qf_criterion_class']()
    ddpg_tdm_kwargs = copy.deepcopy(variant['ddpg_tdm_kwargs'])
    ddpg_tdm_kwargs['ddpg_kwargs']['qf_criterion'] = qf_criterion
    algorithm = TdmDdpg(
        env,
        qf=qf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        # Pass the copy that has qf_criterion injected, not the original dict.
        **ddpg_tdm_kwargs
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def example(variant):
    env_class = variant['env_class']
    env_params = variant['env_params']
    env = env_class(**env_params)
    obs_space = convert_gym_space(env.observation_space)
    action_space = convert_gym_space(env.action_space)
    es_class = variant['es_class']
    es_params = dict(
        action_space=action_space,
        **variant['es_params']
    )
    use_gpu = variant['use_gpu']
    es = es_class(**es_params)
    policy_class = variant['policy_class']
    policy_params = dict(
        obs_dim=int(obs_space.flat_dim),
        action_dim=int(action_space.flat_dim),
        fc1_size=100,
        fc2_size=100,
    )
    policy = policy_class(**policy_params)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    remote_env = RemoteRolloutEnv(
        env,
        policy,
        exploration_policy,
        variant['max_path_length'],
        variant['normalize_env'],
    )
    qf = FeedForwardQFunction(
        int(remote_env.observation_space.flat_dim),
        int(remote_env.action_space.flat_dim),
        100,
        100,
    )
    algorithm = ParallelDDPG(
        remote_env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params'],
    )
    if use_gpu and ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def experiment(variant): from railrl.core import logger import railrl.torch.pytorch_util as ptu beta = variant["beta"] representation_size = variant["representation_size"] #this has both states and images so can't use generate vae dataset X = np.load( '/home/murtaza/vae_data/sawyer_torque_control_ou_imgs_zoomed_out10000.npy' ) Y = np.load( '/home/murtaza/vae_data/sawyer_torque_control_ou_states_zoomed_out10000.npy' ) Y = np.concatenate((Y[:, :7], Y[:, 14:]), axis=1) X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.1) info = dict() logger.save_extra_data(info) logger.get_snapshot_dir() if 'beta_schedule_kwargs' in variant: beta_schedule = PiecewiseLinearSchedule( **variant['beta_schedule_kwargs']) else: beta_schedule = None m = ConvVAE(representation_size, input_channels=3, state_sim_debug=True, state_size=Y.shape[1], **variant['conv_vae_kwargs']) if ptu.gpu_enabled(): m.cuda() t = ConvVAETrainer((X_train, Y_train), (X_test, Y_test), m, beta=beta, beta_schedule=beta_schedule, state_sim_debug=True, **variant['algo_kwargs']) save_period = variant['save_period'] for epoch in range(variant['num_epochs']): should_save_imgs = (epoch % save_period == 0) t.train_epoch(epoch) t.test_epoch(epoch, save_reconstruction=should_save_imgs, save_scatterplot=should_save_imgs) if should_save_imgs: t.dump_samples(epoch)
def experiment(variant):
    env = NormalizedBoxEnv(MultitaskPoint2DEnv())
    # Try full-state reacher:
    # env = Reacher7DofMultitaskEnv()
    es = OUStrategy(action_space=env.action_space)
    policy = TdmPolicy(
        env=env,
        **variant['policy_kwargs']
    )
    replay_buffer_size = variant['algo_params']['base_kwargs'][
        'replay_buffer_size']
    replay_buffer = TauReplayBuffer(replay_buffer_size, env)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TdmSupervised(
        env,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def __init__(
        self,
        X_train,
        X_test,
        y_train,
        y_test,
        model,
        batch_size=128,
        lr=3e-4,
        weight_decay=0,
        num_batches=128,
):
    self.batch_size = batch_size
    if ptu.gpu_enabled():
        model.to(ptu.device)
    self.model = model
    self.criterion = nn.MSELoss()
    self.optimizer = optim.Adam(
        model.parameters(),
        lr=lr,
        weight_decay=weight_decay,
    )
    self.X_train, self.X_test, self.y_train, self.y_test = (
        X_train, X_test, y_train, y_test
    )
    self.num_batches = num_batches

def experiment(variant): from railrl.core import logger import railrl.torch.pytorch_util as ptu beta = variant["beta"] representation_size = variant["representation_size"] train_data, test_data, info = get_data(**variant['get_data_kwargs']) logger.save_extra_data(info) logger.get_snapshot_dir() beta_schedule = PiecewiseLinearSchedule(**variant['beta_schedule_kwargs']) m = ConvVAE(representation_size, input_channels=3) if ptu.gpu_enabled(): m.to(ptu.device) t = ConvVAETrainer(train_data, test_data, m, beta=beta, beta_schedule=beta_schedule, **variant['algo_kwargs']) for epoch in range(variant['num_epochs']): t.train_epoch(epoch) t.test_epoch(epoch) t.dump_samples(epoch)
def experiment(variant): from railrl.core import logger import railrl.torch.pytorch_util as ptu beta = variant["beta"] representation_size = variant["representation_size"] train_data, test_data, info = generate_vae_dataset( **variant['generate_vae_dataset_kwargs']) logger.save_extra_data(info) logger.get_snapshot_dir() if 'beta_schedule_kwargs' in variant: # kwargs = variant['beta_schedule_kwargs'] # kwargs['y_values'][2] = variant['beta'] # kwargs['x_values'][1] = variant['flat_x'] # kwargs['x_values'][2] = variant['ramp_x'] + variant['flat_x'] variant['beta_schedule_kwargs']['y_values'][-1] = variant['beta'] beta_schedule = PiecewiseLinearSchedule( **variant['beta_schedule_kwargs']) else: beta_schedule = None m = ConvVAE(representation_size, input_channels=3, **variant['conv_vae_kwargs']) if ptu.gpu_enabled(): m.cuda() t = ConvVAETrainer(train_data, test_data, m, beta=beta, beta_schedule=beta_schedule, **variant['algo_kwargs']) save_period = variant['save_period'] for epoch in range(variant['num_epochs']): should_save_imgs = (epoch % save_period == 0) t.train_epoch(epoch) t.test_epoch(epoch, save_reconstruction=should_save_imgs, save_scatterplot=should_save_imgs) if should_save_imgs: t.dump_samples(epoch)
def experiment(variant):
    env_params = variant['env_params']
    env = MultiTaskBaxterEnv(**env_params)
    observation_space = convert_gym_space(env.observation_space)
    action_space = convert_gym_space(env.action_space)
    qf = FlatUniversalQfunction(
        int(observation_space.flat_dim),
        int(action_space.flat_dim),
        env.goal_dim,
        **variant['qf_params'],
    )
    policy = FFUniversalPolicy(
        int(observation_space.flat_dim),
        int(action_space.flat_dim),
        env.goal_dim,
        **variant['policy_params'],
    )
    es = variant['sampler_es_class'](
        action_space=action_space,
        **variant['sampler_es_params'],
    )
    exploration_policy = UniversalPolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    epoch_discount_schedule = variant['epoch_discount_schedule_class'](
        **variant['epoch_discount_schedule_params']
    )
    algo = HorizonFedStateDistanceQLearning(
        env,
        qf,
        policy,
        exploration_policy,
        qf_criterion=HuberLoss(),
        epoch_discount_schedule=epoch_discount_schedule,
        **variant['algo_params'],
    )
    if ptu.gpu_enabled():
        algo.cuda()
    algo.train()

def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class']())
    es = OUStrategy(
        action_space=env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # constant sigma
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    env.set_goal(variant['goal'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()