def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    dynamics = ProbabilisticEnsembleDynamics(use_gpu=device)
    dynamics.fit(train_episodes,
                 eval_episodes=test_episodes,
                 n_steps=100000,
                 scorers={
                     "obs_error": dynamics_observation_prediction_error_scorer,
                     "reward_error": dynamics_reward_prediction_error_scorer,
                 })

    combo = COMBO(q_func_factory=args.q_func,
                  dynamics=dynamics,
                  use_gpu=device)
    combo.fit(train_episodes,
              eval_episodes=test_episodes,
              n_steps=1000000,
              scorers={
                  'environment': evaluate_on_environment(env),
                  'td_error': td_error_scorer,
                  'discounted_advantage': discounted_sum_of_advantage_scorer,
                  'value_scale': average_value_estimation_scorer,
                  'value_std': value_estimation_std_scorer,
                  'action_diff': continuous_action_diff_scorer
              })
def test_evaluate_on_environment(observation_shape, action_size, episode_length,
                                 n_trials):
    shape = (n_trials, episode_length + 1) + observation_shape
    observations = np.random.random(shape)

    class DummyEnv:
        def __init__(self):
            self.episode = 0

        def step(self, action):
            self.t += 1
            observation = observations[self.episode - 1, self.t]
            reward = np.mean(observation) + np.mean(action)
            done = self.t == episode_length
            return observation, reward, done, {}

        def reset(self):
            self.t = 0
            self.episode += 1
            return observations[self.episode - 1, 0]

    # projection matrix for deterministic action
    A = np.random.random(observation_shape + (action_size, ))

    algo = DummyAlgo(A, 0.0)

    ref_rewards = []
    for i in range(n_trials):
        episode_obs = observations[i, :, :]
        actions = algo.predict(episode_obs[:-1])
        rewards = np.mean(episode_obs[1:], axis=1) + np.mean(actions, axis=1)
        ref_rewards.append(np.sum(rewards))

    mean_reward = evaluate_on_environment(DummyEnv(), n_trials)(algo)

    assert np.allclose(mean_reward, np.mean(ref_rewards))
def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    encoder_factory = VectorEncoderFactory(hidden_units=[256, 256, 256, 256])

    awac = AWAC(actor_encoder_factory=encoder_factory,
                critic_encoder_factory=encoder_factory,
                q_func_factory=args.q_func,
                use_gpu=device)

    awac.fit(train_episodes,
             eval_episodes=test_episodes,
             n_epochs=1000,
             scorers={
                 'environment': evaluate_on_environment(env),
                 'td_error': td_error_scorer,
                 'discounted_advantage': discounted_sum_of_advantage_scorer,
                 'value_scale': average_value_estimation_scorer,
                 'value_std': value_estimation_std_scorer,
                 'action_diff': continuous_action_diff_scorer
             })
def main(args):
    dataset, env = get_atari(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    bc = DiscreteBC(
        n_frames=4,  # frame stacking
        scaler='pixel',
        use_gpu=args.gpu)

    bc.fit(train_episodes,
           eval_episodes=test_episodes,
           n_epochs=100,
           scorers={'environment': evaluate_on_environment(env, epsilon=0.05)})
def main(args):
    dataset, env = get_atari(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    bc = DiscreteBC(n_epochs=100,
                    scaler='pixel',
                    use_batch_norm=False,
                    use_gpu=device)

    bc.fit(train_episodes,
           eval_episodes=test_episodes,
           scorers={'environment': evaluate_on_environment(env, epsilon=0.05)})
def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    bc = BC(n_epochs=100, use_gpu=device)

    bc.fit(train_episodes,
           eval_episodes=test_episodes,
           scorers={
               'environment': evaluate_on_environment(env),
               'action_diff': continuous_action_diff_scorer
           })
def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    awr = AWR(n_epochs=100, use_gpu=device)

    awr.fit(train_episodes,
            eval_episodes=test_episodes,
            scorers={
                'environment': evaluate_on_environment(env),
                'td_error': td_error_scorer,
                'value_scale': average_value_estimation_scorer,
                'action_diff': continuous_action_diff_scorer
            })
def test_evaluate_on_environment(
    observation_shape, action_size, episode_length, n_trials
):
    shape = (n_trials, episode_length + 1) + observation_shape
    if len(observation_shape) == 3:
        observations = np.random.randint(0, 255, size=shape, dtype=np.uint8)
    else:
        observations = np.random.random(shape).astype("f4")

    class DummyEnv:
        def __init__(self):
            self.episode = 0
            self.observation_space = spaces.Box(
                low=0, high=255, shape=observation_shape
            )

        def step(self, action):
            self.t += 1
            observation = observations[self.episode - 1, self.t]
            reward = np.mean(observation) + np.mean(action)
            done = self.t == episode_length
            return observation, reward, done, {}

        def reset(self):
            self.t = 0
            self.episode += 1
            return observations[self.episode - 1, 0]

    # projection matrix for deterministic action
    feature_size = reduce(mul, observation_shape)
    A = np.random.random((feature_size, action_size))
    algo = DummyAlgo(A, 0.0)

    ref_rewards = []
    for i in range(n_trials):
        episode_obs = observations[i].reshape((-1, feature_size))
        actions = algo.predict(episode_obs[:-1])
        rewards = np.mean(episode_obs[1:], axis=1) + np.mean(actions, axis=1)
        ref_rewards.append(np.sum(rewards))

    mean_reward = evaluate_on_environment(DummyEnv(), n_trials)(algo)

    assert np.allclose(mean_reward, np.mean(ref_rewards))
def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    sac = SAC(n_epochs=100, q_func_type=args.q_func_type, use_gpu=device)

    sac.fit(train_episodes,
            eval_episodes=test_episodes,
            scorers={
                'environment': evaluate_on_environment(env),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer,
                'value_std': value_estimation_std_scorer,
                'action_diff': continuous_action_diff_scorer
            })
def main(args):
    dataset, env = get_atari(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    dqn = DQN(
        n_frames=4,  # frame stacking
        q_func_type=args.q_func_type,
        scaler='pixel',
        use_gpu=args.gpu)

    dqn.fit(train_episodes,
            eval_episodes=test_episodes,
            n_epochs=100,
            scorers={
                'environment': evaluate_on_environment(env, epsilon=0.05),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer
            })
def main(args):
    dataset, env = get_atari(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    dqn = DQN(n_epochs=100,
              q_func_type=args.q_func_type,
              scaler='pixel',
              use_batch_norm=False,
              use_gpu=device)

    dqn.fit(train_episodes,
            eval_episodes=test_episodes,
            scorers={
                'environment': evaluate_on_environment(env, epsilon=0.05),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer
            })
def train(params):
    # setup algorithm
    if pretrain:
        dqn = DQN(batch_size=params.get("batch_size"),
                  learning_rate=params.get("learning_rate"),
                  target_update_interval=params.get("target_update_interval"),
                  q_func_factory=QRQFunctionFactory(
                      n_quantiles=params.get("n_quantiles")),
                  n_steps=params.get("train_freq"),
                  gamma=params.get("gamma"),
                  n_critics=1,
                  target_reduction_type="min",
                  use_gpu=True)

        # setup replay buffer
        buffer = ReplayBuffer(maxlen=params.get("buffer_size"), env=env)

        # setup explorer
        explorer = LinearDecayEpsilonGreedy(
            start_epsilon=1.0,
            end_epsilon=params.get("exploration_final_eps"),
            duration=100000)

        # start training
        dqn.fit_online(
            env,
            buffer,
            n_steps=params.get("train_steps"),
            explorer=explorer,  # you don't need this with probabilistic policy algorithms
            tensorboard_dir=log_dir,
            eval_env=eval_env)

        print("Saving Model")
        dqn.save_model(exp_name)

        print("convert buffer to dataset")
        dataset = buffer.to_mdp_dataset()

        # save MDPDataset
        dataset.dump('{0}.h5'.format(exp_name))

    print("Loading Dataset for Offline Training")
    dataset = d3rlpy.dataset.MDPDataset.load('{0}.h5'.format(exp_name))
    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    # The dataset can then be used to train a d3rlpy model
    cql = DiscreteCQL(learning_rate=6.25e-05,
                      encoder_factory='default',
                      q_func_factory='mean',
                      batch_size=32,
                      n_frames=1,
                      n_steps=1,
                      gamma=0.99,
                      n_critics=1,
                      bootstrap=False,
                      share_encoder=False,
                      target_reduction_type='min',
                      target_update_interval=8000,
                      use_gpu=True,
                      scaler=None,
                      augmentation=None,
                      generator=None,
                      impl=None)

    cql_exp = params.get("model_name") + "_offline_" + params.get("environment")
    cql_log = '../../../logs/' + cql_exp

    cql.fit(dataset.episodes,
            eval_episodes=test_episodes,
            n_epochs=1000,
            scorers={
                'environment': evaluate_on_environment(env, epsilon=0.05),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer,
            },
            tensorboard_dir=cql_log)

    cql.save_model(cql_exp)
from d3rlpy.algos import CQL
from d3rlpy.datasets import get_pybullet
from d3rlpy.ope import FQE
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import initial_state_value_estimation_scorer
from d3rlpy.metrics.scorer import soft_opc_scorer
from sklearn.model_selection import train_test_split

dataset, env = get_pybullet('hopper-bullet-mixed-v0')

train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

# train algorithm
cql = CQL(n_epochs=100, use_gpu=True)
cql.fit(train_episodes,
        eval_episodes=test_episodes,
        scorers={
            'environment': evaluate_on_environment(env),
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(600)
        })

# or load the trained model
# cql = CQL.from_json('<path-to-json>/params.json')
# cql.load_model('<path-to-model>/model.pt')

# evaluate the trained policy
fqe = FQE(algo=cql,
          n_epochs=200,
          q_func_factory='qr',
          learning_rate=1e-4,
          use_gpu=True,
          encoder_params={'hidden_units': [1024, 1024, 1024, 1024]})
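# A possible continuation, not part of the original snippet: fitting FQE on the
# same episodes and scoring it with the OPE metrics imported above. The choice
# of scorers and the soft_opc threshold of 600 simply mirror the CQL call and
# are assumptions.
fqe.fit(train_episodes,
        eval_episodes=test_episodes,
        scorers={
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(600)
        })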
from d3rlpy.algos import DiscreteCQL
from d3rlpy.datasets import get_atari
from d3rlpy.ope import DiscreteFQE
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import initial_state_value_estimation_scorer
from d3rlpy.metrics.scorer import soft_opc_scorer
from sklearn.model_selection import train_test_split

dataset, env = get_atari('breakout-expert-v0')

train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

# train algorithm
cql = DiscreteCQL(n_epochs=100,
                  scaler='pixel',
                  q_func_factory='qr',
                  n_frames=4,
                  use_gpu=True)
cql.fit(train_episodes,
        eval_episodes=test_episodes,
        scorers={
            'environment': evaluate_on_environment(env, epsilon=0.05),
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(70)
        })

# or load the trained model
# cql = DiscreteCQL.from_json('<path-to-json>/params.json')
# cql.load_model('<path-to-model>/model.pt')

# evaluate the trained policy
fqe = DiscreteFQE(algo=cql,
                  n_epochs=100,
                  q_func_factory='qr',
                  learning_rate=1e-4,
                  scaler='pixel',
                  n_frames=4,
                  use_gpu=True)
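# A possible continuation, not in the original snippet: fitting DiscreteFQE and
# scoring it with the same OPE metrics used above; the soft_opc threshold of 70
# is carried over from the DiscreteCQL call and is an assumption.
fqe.fit(train_episodes,
        eval_episodes=test_episodes,
        scorers={
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(70)
        })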
from d3rlpy.algos import DQN
from d3rlpy.datasets import get_cartpole
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.context import parallel
from sklearn.model_selection import GridSearchCV

# obtain dataset
dataset, env = get_cartpole()

# setup algorithm with GPU enabled
dqn = DQN(use_gpu=True)

# grid search with multiple GPUs assigned to individual processes
with parallel():
    env_score = evaluate_on_environment(env)
    gscv = GridSearchCV(estimator=dqn,
                        param_grid={
                            'learning_rate': [1e-3, 3e-4, 1e-4],
                            'gamma': [0.99, 0.95, 0.9]
                        },
                        scoring={'environment': env_score},
                        refit=False,
                        n_jobs=3)
    gscv.fit(dataset.episodes, n_epochs=1, show_progress=False)

# grid_scores_ was removed from scikit-learn; cv_results_ holds the results
print(gscv.cv_results_)
from d3rlpy.algos import DQN
from d3rlpy.datasets import get_cartpole
from d3rlpy.metrics.scorer import evaluate_on_environment

# obtain dataset
dataset, env = get_cartpole()

# setup algorithm
dqn = DQN(n_epochs=1)

# train
dqn.fit(dataset.episodes)

# evaluate trained algorithm
evaluate_on_environment(env, render=True)(dqn)