Example #1
# imports assumed for this snippet (d3rlpy v0.x API)
import d3rlpy
from sklearn.model_selection import train_test_split
from d3rlpy.algos import CQL
from d3rlpy.datasets import get_pybullet
from d3rlpy.gpu import Device
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import discounted_sum_of_advantage_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from d3rlpy.metrics.scorer import value_estimation_std_scorer
from d3rlpy.metrics.scorer import continuous_action_diff_scorer


def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    cql = CQL(n_epochs=100, q_func_type=args.q_func_type, use_gpu=device)

    cql.fit(train_episodes,
            eval_episodes=test_episodes,
            scorers={
                'environment': evaluate_on_environment(env),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer,
                'value_std': value_estimation_std_scorer,
                'action_diff': continuous_action_diff_scorer
            })
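
The `args` object above is presumably built by an argparse parser elsewhere in the original script; a minimal sketch of such an entry point (the argument names match the attributes used above, the defaults are assumptions):

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='hopper-bullet-mixed-v0')  # assumed default
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--q-func-type', type=str, default='mean')  # one of 'mean', 'qr', 'iqn', 'fqf'
    parser.add_argument('--gpu', type=int, default=None)
    main(parser.parse_args())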
Example #2
    def save_policy(self, path, epoch, as_onnx):
        params_path = os.path.join(self.get_log_path(), 'params.json')
        model_path = os.path.join(self.get_log_path(), 'model_%d.pt' % epoch)

        if not os.path.exists(model_path):
            raise ValueError('%s does not exist.' % model_path)

        # initialize algorithm from json file
        if self.project.algorithm == 'cql':
            if self.project.dataset.is_discrete:
                algo = DiscreteCQL.from_json(params_path)
            else:
                algo = CQL.from_json(params_path)
        else:
            raise ValueError('unsupported algorithm.')

        # load model parameters
        algo.load_model(model_path)

        # export the policy as TorchScript (or ONNX when as_onnx is True)
        algo.save_policy(path, as_onnx)
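
The exported file can be consumed without d3rlpy at all. A minimal sketch of loading the TorchScript policy (the file name and observation size below are hypothetical):

import torch

policy = torch.jit.load('policy.pt')   # the path passed to save_policy above
observation = torch.rand(1, 11)        # a batch with one observation; the size is hypothetical
action = policy(observation)           # returns the greedy action for the observation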
Example #3
from sklearn.model_selection import train_test_split
from d3rlpy.datasets import get_pybullet
from d3rlpy.algos import CQL
from d3rlpy.ope import FQE
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import initial_state_value_estimation_scorer
from d3rlpy.metrics.scorer import soft_opc_scorer

dataset, env = get_pybullet('hopper-bullet-mixed-v0')

train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

# train algorithm
cql = CQL(n_epochs=100, use_gpu=True)
cql.fit(train_episodes,
        eval_episodes=test_episodes,
        scorers={
            'environment': evaluate_on_environment(env),
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(600)
        })

# or load the trained model
# cql = CQL.from_json('<path-to-json>/params.json')
# cql.load_model('<path-to-model>/model.pt')

# evaluate the trained policy
fqe = FQE(algo=cql,
          n_epochs=200,
          q_func_factory='qr',
          learning_rate=1e-4,
          use_gpu=True)  # the original snippet is truncated here; use_gpu and the call below are an assumed completion

# fit FQE and report off-policy evaluation metrics (assumed completion)
fqe.fit(train_episodes,
        eval_episodes=test_episodes,
        scorers={
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(600)
        })
Example #4
File: vector.py  Project: ritou11/d3rlpy
from d3rlpy.datasets import get_pybullet
from d3rlpy.algos import CQL
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import discounted_sum_of_advantage_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from sklearn.model_selection import train_test_split

dataset, env = get_pybullet('hopper-bullet-mixed-v0')

train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

# 'single_amplitude_scaling' randomly rescales observations (data augmentation for vector inputs)
cql = CQL(augmentation=['single_amplitude_scaling'], use_gpu=True)

cql.fit(train_episodes,
        eval_episodes=test_episodes,
        n_epochs=100,
        scorers={
            'environment': evaluate_on_environment(env),
            'td_error': td_error_scorer,
            'discounted_advantage': discounted_sum_of_advantage_scorer,
            'value_scale': average_value_estimation_scorer
        })
Example #5
from d3rlpy.datasets import get_pybullet
from d3rlpy.algos import CQL
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import discounted_sum_of_advantage_scorer
from sklearn.model_selection import train_test_split

# get data-driven RL dataset
dataset, env = get_pybullet('hopper-bullet-mixed-v0')

# split dataset
train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

# setup CQL with a quantile-regression Q-function, an ensemble of 10 bootstrapped
# critics, and TD3-style delayed actor updates
cql = CQL(actor_learning_rate=1e-3,
          critic_learning_rate=1e-3,
          temp_learning_rate=1e-3,
          alpha_learning_rate=1e-3,
          n_critics=10,
          bootstrap=True,
          update_actor_interval=2,
          q_func_type='qr',
          use_gpu=True)

# start training
cql.fit(train_episodes,
        eval_episodes=test_episodes,
        n_epochs=300,
        scorers={
            'environment': evaluate_on_environment(env),
            'advantage': discounted_sum_of_advantage_scorer
        })
Example #6
File: cql.py  Project: navidmdn/d3rlpy
from d3rlpy.algos import CQL
from d3rlpy.datasets import get_d4rl
from d3rlpy.models.encoders import VectorEncoderFactory
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from sklearn.model_selection import train_test_split

dataset, env = get_d4rl('hopper-medium-v0')

_, test_episodes = train_test_split(dataset, test_size=0.2)

# 3-layer MLP encoder (256 units per layer) shared by the actor and critic
encoder = VectorEncoderFactory(hidden_units=[256, 256, 256])

cql = CQL(actor_encoder_factory=encoder,
          critic_encoder_factory=encoder,
          alpha_learning_rate=0.0,  # keep the conservative weight alpha fixed (no Lagrangian tuning)
          use_gpu=True)

# train on all episodes; the held-out split above is only used by the scorers
cql.fit(dataset.episodes,
        eval_episodes=test_episodes,
        n_epochs=2000,
        scorers={
            'environment': evaluate_on_environment(env),
            'value_scale': average_value_estimation_scorer
        })
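
Once training finishes, the greedy policy can also be rolled out online; a short sketch assuming the classic Gym step/reset API these examples rely on:

# roll out the trained policy for one episode
observation, done, total_reward = env.reset(), False, 0.0
while not done:
    action = cql.predict([observation])[0]   # predict() expects a batch of observations
    observation, reward, done, _ = env.step(action)
    total_reward += reward
print('episode return:', total_reward)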