Example #1
def lstm_launcher(variant):
    """
    Run DDPG with a feedforward policy and critic on an environment.

    :param variant: Dictionary of dictionaries with the following keys:
        - algo_params
        - env_params
        - qf_params
        - policy_params
    :return:
    """
    from railrl.algos.ddpg import DDPG as MyDDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings
    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(name_or_scope="critic",
                           env_spec=env.spec,
                           **variant.get('qf_params', {}))
    policy = FeedForwardPolicy(name_or_scope="actor",
                               env_spec=env.spec,
                               **variant.get('policy_params', {}))
    algorithm = MyDDPG(env, es, policy, qf, **variant['algo_params'])
    algorithm.train()
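
As a usage sketch, the variant passed to lstm_launcher could be assembled as below. The key layout follows the docstring above; the concrete values and the contents of env_params are illustrative assumptions, since get_env_settings is not shown in this snippet.

# Hypothetical variant for lstm_launcher; key names follow the docstring,
# values and the env_params contents are assumptions for illustration only.
variant = dict(
    env_params=dict(env_id='cartpole'),  # forwarded to get_env_settings (assumed keyword)
    algo_params=dict(
        batch_size=128,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
    ),
    qf_params=dict(),      # extra kwargs for FeedForwardCritic
    policy_params=dict(),  # extra kwargs for FeedForwardPolicy
)
lstm_launcher(variant)
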
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=128,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=100,
    )
    exp_prefix = 'ddpg-cartpole-speed-{0}'.format(timestamp())
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **default_ddpg_params,
    )

    run_experiment_lite(
        algorithm.train(),
        n_parallel=1,
        snapshot_mode="last",
        exp_prefix=exp_prefix,
        seed=1,
    )
Example #3
def test_serialize_feedforward_policy(self):
    policy = FeedForwardPolicy(
        name_or_scope="b",
        action_dim=self.action_dim,
        observation_dim=self.observation_dim,
    )
    self.sess.run(tf.global_variables_initializer())
    pickle.dumps(policy)
Example #4
def main():
    stub(globals())
    env = TfEnv(HalfCheetahEnv())
    for seed in range(3):
        ddpg_params = dict(
            batch_size=128,
            n_epochs=100,
            epoch_length=10000,
            eval_samples=10000,
            discount=0.99,
            policy_learning_rate=1e-4,
            qf_learning_rate=1e-3,
            soft_target_tau=0.01,
            replay_pool_size=1000000,
            min_pool_size=256,
            scale_reward=1.0,
            max_path_length=1000,
            qf_weight_decay=0.0,
        )
        vitchyr_es = OUStrategy(env_spec=env.spec)
        vitchyr_qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        vitchyr_policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        vitchyr_ddpg = DDPG(env, vitchyr_es, vitchyr_policy, vitchyr_qf,
                            **ddpg_params)

        shane_es = GaussianStrategy(env.spec)
        shane_policy = DeterministicMLPPolicy(
            name="init_policy",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        shane_qf = ContinuousMLPQFunction(name="qf",
                                          env_spec=env.spec,
                                          hidden_sizes=(100, 100))
        shane_ddpg = ShaneDDPG(env, shane_policy, shane_qf, shane_es,
                               **ddpg_params)

        names_and_algos = [
            ("Vitchyr_DDPG", vitchyr_ddpg),
            ("Shane_DDPG", shane_ddpg),
        ]
        for name, algorithm in names_and_algos:
            env.reset()
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix="ddpg-comparison-cheetah",
                seed=seed,
            )
def run_task(variant):
    import tensorflow as tf
    from railrl.algos.ddpg import DDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from sandbox.rocky.tf.envs.base import TfEnv
    from rllab.envs.box2d.cartpole_env import CartpoleEnv

    env = TfEnv(CartpoleEnv())
    algo_name = variant['Algorithm']
    if algo_name == 'Quadratic-DDPG':
        qf = QuadraticNAF(
            name_or_scope="quadratic_qf",
            env_spec=env.spec,
        )
    elif algo_name == 'DDPG':
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
            embedded_hidden_sizes=(100, ),
            observation_hidden_sizes=(100, ),
            hidden_nonlinearity=tf.nn.relu,
        )
    else:
        raise Exception('Algo name not recognized: {0}'.format(algo_name))

    es = OUStrategy(env_spec=env.spec)
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )

    ddpg_params = dict(
        batch_size=128,
        n_epochs=100,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)
    algorithm.train()
def my_ddpg_launcher(variant):
	"""
	Run DDPG
	:param variant: Dictionary of dictionaries with the following keys:
		- algo_params
		- env_params
		- qf_params
		- policy_params
	:return:
	"""
	from railrl.algos.ddpg import DDPG as MyDDPG
	from railrl.policies.nn_policy import FeedForwardPolicy
	from railrl.qfunctions.nn_qfunction import FeedForwardCritic
	from rllab.exploration_strategies.ou_strategy import OUStrategy
	from railrl.launchers.launcher_util import get_env_settings
	from railrl.core.tf_util import BatchNormConfig
	if ('batch_norm_params' in variant
		and variant['batch_norm_params'] is not None):
		bn_config = BatchNormConfig(**variant['batch_norm_params'])
	else:
		bn_config = None
	env_settings = get_env_settings(**variant['env_params'])
	env = env_settings['env']
	es = OUStrategy(env_spec=env.spec)
	qf = FeedForwardCritic(
		name_or_scope="critic",
		env_spec=env.spec,
		batch_norm_config=bn_config,
		**variant.get('qf_params', {})
	)
	policy = FeedForwardPolicy(
		name_or_scope="actor",
		env_spec=env.spec,
		batch_norm_config=bn_config,
		**variant.get('policy_params', {})
	)

	algorithm = MyDDPG(
		env,
		es,
		policy,
		qf,
		variant['tensorboard'],
		batch_norm_config=bn_config,
		**variant['algo_params'],
	)
	algorithm.train()
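
Beyond the four documented keys, my_ddpg_launcher also reads 'batch_norm_params' (optional) and 'tensorboard' (passed positionally to MyDDPG). A hedged sketch of such a variant follows; all values here are assumptions for illustration, not defaults taken from the code.

# Hypothetical variant for my_ddpg_launcher; note the two extra keys it consumes.
variant = dict(
    env_params=dict(env_id='cartpole'),   # assumed get_env_settings keyword
    algo_params=dict(batch_size=128, n_epochs=50),
    qf_params=dict(),                     # extra kwargs for FeedForwardCritic
    policy_params=dict(),                 # extra kwargs for FeedForwardPolicy
    batch_norm_params=None,               # or a dict of BatchNormConfig kwargs
    tensorboard='/tmp/tfboard/my-ddpg',   # positional tensorboard argument to MyDDPG
)
my_ddpg_launcher(variant)
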
def run_task(_):
    from railrl.algos.ddpg import DDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from sandbox.rocky.tf.envs.base import TfEnv
    from rllab.envs.gym_env import GymEnv

    def gym_env(name):
        return GymEnv(name,
                      record_video=False,
                      log_dir='/tmp/gym-test',  # Ignore gym log.
                      record_log=False)

    env = TfEnv(gym_env('AxeTwoDPoint-v0'))
    ddpg_params = dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="quadratic_qf",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **ddpg_params
    )
    algorithm.train()
def _create_network_internal(self, observation_input, action_input):
    # NAF-style decomposition: Q(s, a) = V(s) + A(s, a).
    observation_input = self._process_layer(
        observation_input, scope_name="observation_input")
    action_input = self._process_layer(
        action_input, scope_name="action_input")
    self._vf = MlpStateNetwork(
        name_or_scope="V_function",
        output_dim=1,
        observation_dim=self.observation_dim,
        observation_input=observation_input,
        observation_hidden_sizes=(100, 100),
        hidden_W_init=None,
        hidden_b_init=None,
        output_W_init=None,
        output_b_init=None,
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.identity,
        batch_norm_config=self._batch_norm_config,
    )
    self._policy = FeedForwardPolicy(
        name_or_scope="implict_policy",
        action_dim=self.action_dim,
        observation_dim=self.observation_dim,
        observation_input=observation_input,
        observation_hidden_sizes=(100, 100),
        hidden_W_init=None,
        hidden_b_init=None,
        output_W_init=None,
        output_b_init=None,
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
        batch_norm_config=self._batch_norm_config,
    )
    self._af = QuadraticQF(
        name_or_scope="advantage_function",
        action_input=action_input,
        observation_input=observation_input,
        action_dim=self.action_dim,
        observation_dim=self.observation_dim,
        policy=self._policy,
        batch_norm_config=self._batch_norm_config,
    )
    vf_out = self._add_subnetwork_and_get_output(self._vf)
    af_out = self._add_subnetwork_and_get_output(self._af)
    return vf_out + af_out
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=32,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=1000,
    )
    sweeper = DeterministicHyperparameterSweeper(
        {'scale_reward': [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]}, )
    exp_prefix = 'ddpg-cart-reward-scale-sweep-{0}'.format(timestamp())
    for ddpg_params in sweeper.iterate_hyperparameters():
        algorithm = DDPG(
            env,
            es,
            policy,
            qf,
            scale_reward=ddpg_params['scale_reward'],
            **default_ddpg_params,
        )

        for seed in range(3):
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix=exp_prefix,
                seed=seed,
                # mode="local",
                # use_cloudpickle=True,
            )
def example(*_):
    env = HalfCheetahEnv()
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        n_epochs=25,
        batch_size=1024,
        replay_pool_size=10000,
    )
    algorithm.train()
def oat_qddpg_launcher(variant):
	"""
	Quadratic optimal action target DDPG
	"""
	from railrl.algos.optimal_action_target_ddpg import OptimalActionTargetDDPG as OAT
	from railrl.policies.nn_policy import FeedForwardPolicy
	from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
	from rllab.exploration_strategies.ou_strategy import OUStrategy
	from railrl.launchers.launcher_util import get_env_settings
	from railrl.core.tf_util import BatchNormConfig
	if ('batch_norm_params' in variant
		and variant['batch_norm_params'] is not None):
		bn_config = BatchNormConfig(**variant['batch_norm_params'])
	else:
		bn_config = None
	env_settings = get_env_settings(**variant['env_params'])
	env = env_settings['env']
	es = OUStrategy(env_spec=env.spec)
	qf = QuadraticNAF(
		name_or_scope="critic",
		env_spec=env.spec,
		batch_norm_config=bn_config,
		**variant['qf_params']
	)
	policy = FeedForwardPolicy(
		name_or_scope="actor",
		env_spec=env.spec,
		batch_norm_config=bn_config,
		**variant['policy_params']
	)
	algorithm = OAT(
		env,
		es,
		policy,
		qf,
		batch_norm_config=bn_config,
		**variant['algo_params']
	)
	algorithm.train()
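
Unlike my_ddpg_launcher, oat_qddpg_launcher indexes 'qf_params', 'policy_params', and 'algo_params' directly rather than via .get(), so all three must be present (possibly empty). A minimal sketch, with assumed values:

# Hypothetical variant for oat_qddpg_launcher; the three *_params keys are required.
variant = dict(
    env_params=dict(env_id='cartpole'),  # assumed get_env_settings keyword
    algo_params=dict(batch_size=128, n_epochs=50),
    qf_params=dict(),       # extra kwargs for QuadraticNAF
    policy_params=dict(),   # extra kwargs for FeedForwardPolicy
    batch_norm_params=None,
)
oat_qddpg_launcher(variant)
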
Example #12
def main():
    stub(globals())
    env = TfEnv(CartpoleEnv())
    ddpg_params = dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="quadratic_qf",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)

    for seed in range(3):
        env.reset()
        run_experiment_lite(
            algorithm.train(),
            n_parallel=1,
            snapshot_mode="last",
            exp_prefix="test-qddpg-cartpole",
            seed=seed,
        )
def run_task(_):
    for seed in range(3):
        env = TfEnv(HalfCheetahEnv())
        es = OUStrategy(env_spec=env.spec)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        ddpg_params = dict(
            batch_size=16,
            n_epochs=100,
            epoch_length=100,
            eval_samples=100,
            max_path_length=10,
            min_pool_size=2,
        )
        algorithm = DDPG(env, es, policy, qf, **ddpg_params)

        algorithm.train()
Example #14
policy_lrs = [1e-4, 1e-3]  # assumed sweep values; not defined in the snippet as shown
qf_lrs = [1e-4, 1e-3]      # assumed sweep values; not defined in the snippet as shown
gammas = [0.9, 0.99, 0.995]
taus = [1e-3, 1e-2]

for policy_lr, qf_lr, gamma, tau in itertools.product(policy_lrs, qf_lrs, gammas, taus):
	env = TfEnv(normalize(env=GymEnv(
		'Box3dReach-v4', record_video=False, log_dir='/tmp/gym_test', record_log=False)))
	
	es = OUStrategy(env_spec=env.spec)
	qf = FeedForwardCritic(
		name_or_scope="critic",
		env_spec=env.spec,
		hidden_nonlinearity=tf.nn.tanh,
	)
	policy = FeedForwardPolicy(
		name_or_scope="actor",
		env_spec=env.spec,
		hidden_nonlinearity=tf.nn.tanh,
	)

	algo = DDPG(
		env,
		es,
		policy,
		qf,
		"/data0/dianchen/box3d/ddpg_box3d_state_v4_tf_policy_{0}_qf_{1}_gamma_{2}_tau_{3}".format(
			policy_lr,
			qf_lr,
			gamma,
			tau,
		),
		qf_learning_rate=qf_lr,
		# The source snippet is cut off here; the remaining arguments below are an
		# assumed completion based on the swept variables, not the original code.
		policy_learning_rate=policy_lr,
		discount=gamma,
		soft_target_tau=tau,
	)
	algo.train()

def icm_launcher(variant):

	if variant["Algorithm"] == "DDPG":
		from railrl.algos.ddpg import DDPG as MyDDPG
		from railrl.policies.nn_policy import FeedForwardPolicy
		from railrl.qfunctions.nn_qfunction import FeedForwardCritic
		from rllab.exploration_strategies.ou_strategy import OUStrategy
		from railrl.exploration_strategies.simple_gaussian_strategy import SimpleGaussianStrategy
		from railrl.launchers.launcher_util import get_env_settings
		from railrl.core.tf_util import BatchNormConfig
		from railrl.algos.icm import ICM

		if ('batch_norm_params' in variant
			and variant['batch_norm_params'] is not None):
			bn_config = BatchNormConfig(**variant['batch_norm_params'])
		else:
			bn_config = None
		env_settings = get_env_settings(**variant['env_params'])
		env = env_settings['env']
		es = OUStrategy(env_spec=env.spec)
		# es = SimpleGaussianStrategy(env_spec=env.spec, sigma=0.5)
		qf = FeedForwardCritic(
			name_or_scope="critic",
			env_spec=env.spec,
			batch_norm_config=bn_config,
			**variant.get('qf_params', {})
		)
		policy = FeedForwardPolicy(
			name_or_scope="actor",
			env_spec=env.spec,
			batch_norm_config=bn_config,
			**variant.get('policy_params', {})
		)

		algo = MyDDPG(
			env,
			es,
			policy,
			qf,
			variant['tensorboard'],
			batch_norm_config=bn_config,
			**variant['algo_params'],
		)
		algorithm = ICM(
			env, 
			algo,
			no_encoder=False,
			feature_dim=env.spec.observation_space.flat_dim, 
			forward_weight=0.9,
			external_reward_weight=0.95,
			inverse_tanh=True,
			init_learning_rate=1e-3
		)
		algorithm.train()
	elif variant["Algorithm"] == "Idle":
		from railrl.algos.idle import IdleAlgo
		from railrl.launchers.launcher_util import get_env_settings
		from railrl.algos.icm import ICM
		env_settings = get_env_settings(**variant['env_params'])
		env = env_settings['env']
		algo = IdleAlgo(env, variant['tensorboard'])
		algorithm = ICM(
			env, 
			algo,
			no_encoder=False,
			feature_dim=env.spec.observation_space.flat_dim,
			forward_weight=0.9,
			external_reward_weight=0.0,
			inverse_tanh=True,
			init_learning_rate=1e-3,
		)
		algorithm.train()
	elif variant["Algorithm"] == "rllab-TRPO":
		from rllab.algos.trpo import TRPO
		from railrl.launchers.launcher_util import get_env_settings
		from railrl.algos.icm_trpo import ICM
		from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
		from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
		import lasagne.nonlinearities as NL

		env_settings = get_env_settings(**variant['env_params'])
		env = env_settings['env']
		policy = GaussianMLPPolicy(
			env_spec=env.spec,
			hidden_sizes=(64, 32),
			output_nonlinearity=NL.tanh,
		)

		baseline = LinearFeatureBaseline(
			env.spec,
		)

		batch_size = 5000
		algo = TRPO(
			env=env,
			policy=policy,
			baseline=baseline,
			batch_size=batch_size,
			whole_paths=True,
			max_path_length=1000,
			n_itr=1000,
			step_size=0.01,
			subsample_factor=1.0,
		)
		algorithm = ICM(
			env, 
			algo,
			variant['tensorboard'],
			no_encoder=False,
			feature_dim=env.spec.observation_space.flat_dim,
			forward_weight=0.2,
			external_reward_weight=0.99,
			inverse_tanh=True,
			init_learning_rate=1e-4,
		)
		algorithm.train()

	elif variant["Algorithm"] == 'tf-TRPO':
		from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
		from sandbox.rocky.tf.baselines.gaussian_conv_baseline import GaussianConvBaseline
		from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
		from sandbox.rocky.tf.policies.gaussian_conv_policy import GaussianConvPolicy
		from sandbox.rocky.tf.algos.trpo import TRPO
		from sandbox.rocky.tf.envs.base import TfEnv

		from railrl.launchers.launcher_util import get_env_settings
		# from railrl.algos.icm_trpo_tf import ICM
		from railrl.algos.icm_trpo_tf_box3d import ICM
		import tensorflow as tf

		env_settings = get_env_settings(**variant['env_params'])
		env = TfEnv(env_settings['env'])
		if len(env.observation_space.shape) == 1:
			policy = GaussianMLPPolicy(
				"mlp_policy",
				env_spec=env.spec,
				hidden_sizes=(64, 32),
				output_nonlinearity=tf.nn.tanh,
			)
			baseline = LinearFeatureBaseline(
				env.spec,
			)
		elif len(env.observation_space.shape) == 2:
			policy = GaussianConvPolicy(
				"conv_policy",
				env_spec=env.spec,
				conv_filters=(32, 32, 32, 32),
				conv_filter_sizes=((3,3),(3,3),(3,3),(3,3)),
				conv_strides=(2, 2, 2, 2),
				conv_pads=('SAME', 'SAME', 'SAME', 'SAME'),
				hidden_sizes=(256,),
			)

			baseline = GaussianConvBaseline(
				env.spec,
				regressor_args={
					'conv_filters':(32, 32, 32, 32),
					'conv_filter_sizes':((3,3),(3,3),(3,3),(3,3)),
					'conv_strides':(2, 2, 2, 2),
					'conv_pads':('SAME', 'SAME', 'SAME', 'SAME'),
					'hidden_sizes':(256,),
				}
			)
		else:
			raise NotImplementedError("Sorry, no support for observatin space: {}".format(env.observation_space.shape))

		batch_size = 5000
		algo = TRPO(
			env=env,
			policy=policy,
			baseline=baseline,
			batch_size=batch_size,
			whole_paths=True,
			max_path_length=500,
			n_itr=1000,
			step_size=0.01,
			subsample_factor=1.0,
		)

		algorithm = ICM(
			env, 
			algo,
			variant['tensorboard'],
			no_encoder=False,
			feature_dim=env.spec.observation_space.flat_dim,
			forward_weight=0.2,
			external_reward_weight=0.99,
			inverse_tanh=True,
			init_learning_rate=1e-4
		)
		algorithm.train()

	else:
		raise NotImplementedError("Currently only supports DDPG!")
def main():

    parser = argparse.ArgumentParser()
    # Hyperparameters
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--policy_initlr', type=float, default=1e-4)
    parser.add_argument('--qf_initlr', type=float, default=1e-3)

    parser.add_argument('--qf_decay', type=float, default=.0)
    parser.add_argument('--qf_soft_tau', type=float, default=1e-3)

    # Exploration hyperparameters
    parser.add_argument('--ou_theta', type=float, default=0.15)
    parser.add_argument('--ou_sigma', type=float, default=0.3)

    parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('--gpu_ratio', type=float, default=1.0)

    args = parser.parse_args()

    env = TfEnv(normalize(env=GymEnv(
        'Box3dReach-v11', record_video=False, log_dir='/tmp/gym_test', record_log=False)))

    name = 'ddpg-state-v11-plr{0}-qlr{1}-tau{2}-qfdecay{3}-ou_theta{4}-ou_sigma{5}'.format(
        args.policy_initlr, args.qf_initlr, args.qf_soft_tau, args.qf_decay,
        args.ou_theta, args.ou_sigma)

    es = OUStrategy(env_spec=env.spec,
                    theta=args.ou_theta,
                    sigma=args.ou_sigma)

    policy = FeedForwardPolicy(
        name_or_scope="actor",
        observation_hidden_sizes=(400, 300),
        env_spec=env.spec,
    )

    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        embedded_hidden_sizes=(100, ),
        observation_hidden_sizes=(100, ),
    )

    algo = DDPG(
        env=env,
        exploration_strategy=es,
        policy=policy,
        qf=qf,
        tensorboard_path=os.path.join(args.tfboard_path, name,
                                      '_%d' % args.seed),
        qf_learning_rate=args.qf_initlr,
        policy_learning_rate=args.policy_initlr,
        soft_target_tau=args.qf_soft_tau,
        gpu_ratio=args.gpu_ratio,
    )

    run_experiment_lite(algo.train(),
                        exp_prefix=name,
                        n_parallel=1,
                        snapshot_mode="last",
                        seed=args.seed,
                        mode="local")