Example #1
File: train_awac.py Project: wx-b/d3rlpy
def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    encoder_factory = VectorEncoderFactory(hidden_units=[256, 256, 256, 256])

    awac = AWAC(actor_encoder_factory=encoder_factory,
                critic_encoder_factory=encoder_factory,
                q_func_factory=args.q_func,
                use_gpu=device)

    awac.fit(train_episodes,
             eval_episodes=test_episodes,
             n_epochs=1000,
             scorers={
                 'environment': evaluate_on_environment(env),
                 'td_error': td_error_scorer,
                 'discounted_advantage': discounted_sum_of_advantage_scorer,
                 'value_scale': average_value_estimation_scorer,
                 'value_std': value_estimation_std_scorer,
                 'action_diff': continuous_action_diff_scorer
             })
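
The reproduction scripts in these examples omit their imports and the argparse driver that builds args. A minimal sketch for Example #1 is shown below: the import paths are assumptions based on d3rlpy's public API (plus the imports already visible in Examples #6 and #7), the flag names mirror the attributes accessed in main(), and the default values are illustrative rather than taken from the wx-b/d3rlpy repository.

import argparse

import d3rlpy
from d3rlpy.algos import AWAC
from d3rlpy.datasets import get_pybullet
from d3rlpy.gpu import Device
from d3rlpy.models.encoders import VectorEncoderFactory
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import discounted_sum_of_advantage_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from d3rlpy.metrics.scorer import value_estimation_std_scorer
from d3rlpy.metrics.scorer import continuous_action_diff_scorer
from sklearn.model_selection import train_test_split

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # default dataset name is an assumption; any pybullet dataset id works
    parser.add_argument('--dataset', type=str, default='hopper-bullet-mixed-v0')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--q-func', type=str, default='mean')  # read back as args.q_func
    parser.add_argument('--gpu', type=int, default=None)
    args = parser.parse_args()
    main(args)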
Example #2
File: train_combo.py Project: wx-b/d3rlpy
def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    dynamics = ProbabilisticEnsembleDynamics(use_gpu=device)
    dynamics.fit(train_episodes,
                 eval_episodes=test_episodes,
                 n_steps=100000,
                 scorers={
                     "obs_error": dynamics_observation_prediction_error_scorer,
                     "reward_error": dynamics_reward_prediction_error_scorer,
                 })

    combo = COMBO(q_func_factory=args.q_func,
                  dynamics=dynamics,
                  use_gpu=device)

    combo.fit(train_episodes,
              eval_episodes=test_episodes,
              n_steps=1000000,
              scorers={
                  'environment': evaluate_on_environment(env),
                  'td_error': td_error_scorer,
                  'discounted_advantage': discounted_sum_of_advantage_scorer,
                  'value_scale': average_value_estimation_scorer,
                  'value_std': value_estimation_std_scorer,
                  'action_diff': continuous_action_diff_scorer
              })
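
The dynamics ensemble above is trained for 100,000 steps before COMBO even starts, so it can be worth persisting it between runs. Dynamics models in d3rlpy share the save/load interface that Example #6 uses for CQL, so something like the sketch below should work; the file names are placeholders, and the snippet is not part of the original train_combo.py.

# sketch: save the trained dynamics model so later COMBO runs can skip retraining
dynamics.save_model('dynamics_model.pt')

# later, rebuild from the saved config and weights instead of calling fit again
# dynamics = ProbabilisticEnsembleDynamics.from_json('<path-to-json>/params.json')
# dynamics.load_model('dynamics_model.pt')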
Example #3
File: train_bc.py Project: kintatta/d3rl
def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    bc = BC(n_epochs=100, use_gpu=device)

    bc.fit(train_episodes,
           eval_episodes=test_episodes,
           scorers={
               'environment': evaluate_on_environment(env),
               'action_diff': continuous_action_diff_scorer
           })
Example #4
def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    awr = AWR(n_epochs=100, use_gpu=device)

    awr.fit(train_episodes,
            eval_episodes=test_episodes,
            scorers={
                'environment': evaluate_on_environment(env),
                'td_error': td_error_scorer,
                'value_scale': average_value_estimation_scorer,
                'action_diff': continuous_action_diff_scorer
            })
Example #5
File: train_sac.py Project: kintatta/d3rl
def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    sac = SAC(n_epochs=100, q_func_type=args.q_func_type, use_gpu=device)

    sac.fit(train_episodes,
            eval_episodes=test_episodes,
            scorers={
                'environment': evaluate_on_environment(env),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer,
                'value_std': value_estimation_std_scorer,
                'action_diff': continuous_action_diff_scorer
            })
Example #6
from sklearn.model_selection import train_test_split
from d3rlpy.datasets import get_pybullet
from d3rlpy.algos import CQL
from d3rlpy.ope import FQE
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import initial_state_value_estimation_scorer
from d3rlpy.metrics.scorer import soft_opc_scorer

dataset, env = get_pybullet('hopper-bullet-mixed-v0')

train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

# train algorithm
cql = CQL(n_epochs=100, use_gpu=True)
cql.fit(train_episodes,
        eval_episodes=test_episodes,
        scorers={
            'environment': evaluate_on_environment(env),
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(600)
        })

# or load the trained model
# cql = CQL.from_json('<path-to-json>/params.json')
# cql.load_model('<path-to-model>/model.pt')

# evaluate the trained policy
fqe = FQE(algo=cql,
          n_epochs=200,
          q_func_factory='qr',
          learning_rate=1e-4)
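
Example #6 ends inside the FQE constructor. A plausible continuation is to fit the estimator on the held-out episodes and track the off-policy evaluation scorers already imported at the top of the example; the following is a sketch under that assumption, not the original file's code.

# sketch: train the FQE estimator and monitor off-policy evaluation metrics
fqe.fit(test_episodes,
        eval_episodes=test_episodes,
        scorers={
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(600)
        })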
Example #7
from d3rlpy.algos import AWAC
from d3rlpy.datasets import get_pybullet
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import discounted_sum_of_advantage_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from sklearn.model_selection import train_test_split

# prepare dataset and environment
dataset, env = get_pybullet('hopper-bullet-random-v0')
_, eval_env = get_pybullet('hopper-bullet-random-v0')

train_episodes, test_episodes = train_test_split(dataset)

# setup algorithm
awac = AWAC(encoder_params={'hidden_units': [256, 256, 256, 256]},
            use_gpu=True)

# pretrain
awac.fit(train_episodes[:10000],
         eval_episodes=test_episodes,
         n_epochs=30,
         scorers={
             'environment': evaluate_on_environment(env),
             'advantage': discounted_sum_of_advantage_scorer,
             'value_scale': average_value_estimation_scorer
         })

# fine-tuning
awac.fit_online(env,
                ReplayBuffer(1000000, env, train_episodes[:10000]))
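
Seeding the ReplayBuffer with the first 10,000 offline episodes lets online fine-tuning start from the pretrained data distribution rather than an empty buffer, which is the usual offline-to-online AWAC recipe; the eval_env built in the setup block is presumably consumed by keyword arguments that the excerpt cuts off, for periodic evaluation rollouts during the online phase.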