import d3rlpy
from d3rlpy.algos import DiscreteCQL
from d3rlpy.datasets import get_atari
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import discounted_sum_of_advantage_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from sklearn.model_selection import train_test_split


def main(args):
    dataset, env = get_atari(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    cql = DiscreteCQL(n_frames=4,  # frame stacking
                      q_func_type=args.q_func_type,
                      scaler='pixel',
                      use_gpu=args.gpu)

    cql.fit(train_episodes,
            eval_episodes=test_episodes,
            n_epochs=100,
            scorers={
                'environment': evaluate_on_environment(env, epsilon=0.05),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer
            })
import d3rlpy
from d3rlpy.algos import DiscreteCQL
from d3rlpy.datasets import get_atari
from d3rlpy.gpu import Device
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import discounted_sum_of_advantage_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from sklearn.model_selection import train_test_split


def main(args):
    dataset, env = get_atari(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    cql = DiscreteCQL(n_epochs=100,
                      q_func_type=args.q_func_type,
                      scaler='pixel',
                      use_batch_norm=False,
                      use_gpu=device)

    cql.fit(train_episodes,
            eval_episodes=test_episodes,
            scorers={
                'environment': evaluate_on_environment(env, epsilon=0.05),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer
            })
def save_policy(self, path, epoch, as_onnx):
    params_path = os.path.join(self.get_log_path(), 'params.json')
    model_path = os.path.join(self.get_log_path(), 'model_%d.pt' % epoch)

    if not os.path.exists(model_path):
        raise ValueError('%s does not exist.' % model_path)

    # initialize algorithm from json file
    if self.project.algorithm == 'cql':
        if self.project.dataset.is_discrete:
            algo = DiscreteCQL.from_json(params_path)
        else:
            algo = CQL.from_json(params_path)
    else:
        raise ValueError('unsupported algorithm.')

    # load model parameters
    algo.load_model(model_path)

    # export the greedy policy as TorchScript (or ONNX when as_onnx is True)
    algo.save_policy(path, as_onnx)
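# For reference, the file written by save_policy above is a self-contained
# TorchScript (or ONNX) module, so it can be loaded for inference without
# d3rlpy. This is a minimal sketch, not part of the original code; the file
# name 'policy.pt' and the 84x84x4 observation shape are assumptions that
# would need to match the actual export and training inputs.
import torch

policy = torch.jit.load('policy.pt')
with torch.no_grad():
    # the exported policy maps a batch of observations to greedy actions
    observation = torch.zeros(1, 4, 84, 84, dtype=torch.float32)
    action = policy(observation)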
import d3rlpy
from d3rlpy.algos import DQN, DiscreteCQL
from d3rlpy.models.q_functions import QRQFunctionFactory
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import discounted_sum_of_advantage_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from sklearn.model_selection import train_test_split


def train(params):
    # `pretrain`, `env`, `eval_env`, `log_dir` and `exp_name` are assumed to be
    # defined elsewhere in the original script
    if pretrain:
        # setup algorithm
        dqn = DQN(batch_size=params.get("batch_size"),
                  learning_rate=params.get("learning_rate"),
                  target_update_interval=params.get("target_update_interval"),
                  q_func_factory=QRQFunctionFactory(
                      n_quantiles=params.get("n_quantiles")),
                  n_steps=params.get("train_freq"),
                  gamma=params.get("gamma"),
                  n_critics=1,
                  target_reduction_type="min",
                  use_gpu=True)

        # setup replay buffer
        buffer = ReplayBuffer(maxlen=params.get("buffer_size"), env=env)

        # setup explorer
        explorer = LinearDecayEpsilonGreedy(
            start_epsilon=1.0,
            end_epsilon=params.get("exploration_final_eps"),
            duration=100000)

        # start online training
        dqn.fit_online(
            env,
            buffer,
            n_steps=params.get("train_steps"),
            explorer=explorer,  # you don't need this with probabilistic policy algorithms
            tensorboard_dir=log_dir,
            eval_env=eval_env)

        print("Saving Model")
        dqn.save_model(exp_name)

        print("convert buffer to dataset")
        dataset = buffer.to_mdp_dataset()

        # save MDPDataset
        dataset.dump('{0}.h5'.format(exp_name))

    print("Loading Dataset for Offline Training")
    dataset = d3rlpy.dataset.MDPDataset.load('{0}.h5'.format(exp_name))
    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    # The dataset can then be used to train a d3rlpy model
    cql = DiscreteCQL(learning_rate=6.25e-05,
                      encoder_factory='default',
                      q_func_factory='mean',
                      batch_size=32,
                      n_frames=1,
                      n_steps=1,
                      gamma=0.99,
                      n_critics=1,
                      bootstrap=False,
                      share_encoder=False,
                      target_reduction_type='min',
                      target_update_interval=8000,
                      use_gpu=True,
                      scaler=None,
                      augmentation=None,
                      generator=None,
                      impl=None)

    cql_exp = params.get("model_name") + "_offline_" + params.get("environment")
    cql_log = '../../../logs/' + cql_exp

    cql.fit(dataset.episodes,
            eval_episodes=test_episodes,
            n_epochs=1000,
            scorers={
                'environment': evaluate_on_environment(env, epsilon=0.05),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer,
            },
            tensorboard_dir=cql_log)

    cql.save_model(cql_exp)
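# Example invocation of train() with an illustrative hyperparameter dictionary.
# The keys mirror the params.get(...) lookups above; the values are assumptions
# for the sketch, not the original configuration.
params = {
    "batch_size": 32,
    "learning_rate": 6.25e-5,
    "target_update_interval": 8000,
    "n_quantiles": 200,
    "train_freq": 1,
    "gamma": 0.99,
    "buffer_size": 100000,
    "exploration_final_eps": 0.01,
    "train_steps": 1000000,
    "model_name": "dqn",
    "environment": "breakout",
}
train(params)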
from sklearn.model_selection import train_test_split
from d3rlpy.datasets import get_atari
from d3rlpy.algos import DiscreteCQL
from d3rlpy.ope import DiscreteFQE
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import initial_state_value_estimation_scorer
from d3rlpy.metrics.scorer import soft_opc_scorer

dataset, env = get_atari('breakout-expert-v0')

train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

# train algorithm
cql = DiscreteCQL(n_epochs=100,
                  scaler='pixel',
                  q_func_factory='qr',
                  n_frames=4,
                  use_gpu=True)
cql.fit(train_episodes,
        eval_episodes=test_episodes,
        scorers={
            'environment': evaluate_on_environment(env, epsilon=0.05),
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(70)
        })

# or load the trained model
# cql = DiscreteCQL.from_json('<path-to-json>/params.json')
# cql.load_model('<path-to-model>/model.pt')

# evaluate the trained policy
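# The snippet ends before the evaluation step. A possible continuation using
# the DiscreteFQE class imported above: fit a fitted-Q-evaluation model on the
# held-out episodes and score the trained CQL policy with the initial-state
# value and soft OPC metrics. The hyperparameters are illustrative, and exact
# constructor/fit signatures vary across d3rlpy 0.x releases.
fqe = DiscreteFQE(algo=cql,
                  scaler='pixel',
                  n_frames=4,
                  use_gpu=True)
fqe.fit(test_episodes,
        eval_episodes=test_episodes,
        n_epochs=10,
        scorers={
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(70)
        })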
from d3rlpy.datasets import get_atari
from d3rlpy.algos import DiscreteCQL
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import discounted_sum_of_advantage_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from sklearn.model_selection import train_test_split

dataset, env = get_atari('breakout-expert-v0')

train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

cql = DiscreteCQL(scaler='pixel',
                  n_frames=4,
                  augmentation=['random_shift', 'intensity'],
                  use_gpu=True)

cql.fit(train_episodes,
        eval_episodes=test_episodes,
        n_epochs=100,
        scorers={
            'environment': evaluate_on_environment(env, epsilon=0.05),
            'td_error': td_error_scorer,
            'discounted_advantage': discounted_sum_of_advantage_scorer,
            'value_scale': average_value_estimation_scorer
        })
from d3rlpy.algos import DiscreteCQL
from d3rlpy.models.optimizers import AdamFactory
from d3rlpy.datasets import get_atari
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from sklearn.model_selection import train_test_split

dataset, env = get_atari('breakout-medium-v0')

_, test_episodes = train_test_split(dataset, test_size=0.2)

cql = DiscreteCQL(optim_factory=AdamFactory(eps=1e-2 / 32),
                  scaler='pixel',
                  n_frames=4,
                  q_func_factory='qr',
                  use_gpu=True)

cql.fit(dataset.episodes,
        eval_episodes=test_episodes,
        n_epochs=2000,
        scorers={
            'environment': evaluate_on_environment(env, epsilon=0.001),
            'value_scale': average_value_estimation_scorer
        })