Example #1
def parse_noise_types(noise_type, nb_actions):
    """
    Parse noise types for policies
    """
    action_noise = None
    param_noise = None
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))
    return action_noise, param_noise
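As a usage sketch (the noise specification string and the action count are illustrative assumptions, and the function is assumed to live in a module that already imports numpy and the stable_baselines noise classes), the parser accepts a comma-separated list of "<type>_<stddev>" tokens:

# Combine adaptive parameter noise with Ornstein-Uhlenbeck action noise
# for a hypothetical 4-dimensional action space.
action_noise, param_noise = parse_noise_types('adaptive-param_0.2,ou_0.1', nb_actions=4)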
Example #2
def create_action_noise(env, noise_type):
    action_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
            # action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))
    return action_noise
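A similar sketch for this variant (the environment name and noise string are assumptions; numpy and the noise classes are assumed to be imported in the same module):

import gym

env = gym.make('Pendulum-v0')
# Gaussian exploration noise with stddev 0.1 on the single action dimension.
action_noise = create_action_noise(env, 'normal_0.1')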
Example #3
    def run(self):
        self._init()

        env = self.env
        model = self.model
        objective = self.objective

        if objective == "infogain":
            wenv = InfogainEnv(env, model)
        elif objective == "prederr":
            wenv = PrederrEnv(env, model)
        else:
            raise AttributeError(
                "Objective '{}' is unknown. Needs to be 'infogain' or 'prederr'"
                .format(objective))

        wenv.max_episode_len = self.horizon
        wenv.end_episode_callback = self._end_episode
        dvenv = DummyVecEnv([lambda: wenv])

        if self.rl_algo == "ddpg":
            self.logger.info("Setting up DDPG as model-free RL algorithm.")
            pn = AdaptiveParamNoiseSpec()
            an = NormalActionNoise(np.array([0]), np.array([1]))
            rl_model = DDPG(DDPGMlpPolicy,
                            dvenv,
                            verbose=1,
                            render=False,
                            action_noise=an,
                            param_noise=pn,
                            nb_rollout_steps=self.horizon,
                            nb_train_steps=self.horizon)
        elif self.rl_algo == "sac":
            self.logger.info("Setting up SAC as model-free RL algorithm.")
            rl_model = SAC(SACMlpPolicy,
                           dvenv,
                           verbose=1,
                           learning_starts=self.horizon)
        else:
            raise AttributeError(
                "Model-free RL algorithm '{}' is unknown.".format(
                    self.rl_algo))

        # Train the agent
        max_steps_total = self.horizon * self.n_episodes * 100
        try:
            self.logger.info("Start the agent")
            rl_model.learn(total_timesteps=max_steps_total, seed=self.seed)
        except MaxEpisodesReachedException:
            print("Exploration finished.")
Example #4
def main():

    # Save argument values to yaml file
    args_file_path = os.path.join(args.log_dir, 'args.yaml')
    with open(args_file_path, 'w') as f:
        yaml.dump(vars(args), f, default_flow_style=False)

    # Create and wrap the environment
    env = gym.make(args.env)
    env = Monitor(env, args.log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # Add some param noise for exploration
    if args.model == 'DDPG':
        param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.2,
                                             desired_action_stddev=0.2)
        model = MODEL_CLASS(MlpPolicy,
                            env,
                            param_noise=param_noise,
                            memory_limit=int(1e6),
                            verbose=0)
    elif args.model == 'SAC':
        # TODO: This doesn't work
        model = MODEL_CLASS(MlpPolicy,
                            env,
                            verbose=1,
                            policy_kwargs={
                                'n_env': 1,
                                'n_steps': 64,
                                'n_batch': 64
                            })
    else:
        model = MODEL_CLASS(MlpPolicy, env, verbose=0)

    # Train the agent
    model.learn(total_timesteps=args.n_steps, callback=callback)

    # Save the final model
    if args.save_model:
        model_file_path = os.path.join(args.log_dir, 'model.pkl')
        model.save(model_file_path)
        print("Best and final models saved in ", os.path.abspath(args.log_dir))

    if args.plots:
        raise NotImplementedError
Example #5
def main(_algo_name, _algo_tag, _tag_suffix, _save_freq, _lock_rotation, _eval_num, _eval_freq, hyperparams):
    rotation_tag = "_LOCKED_ROT_" if _lock_rotation else "_ROTATION_"
    full_tag = _algo_name + rotation_tag + _algo_tag + _tag_suffix
    current_dir = _algo_name + "/" + full_tag
    log_dir = current_dir + "/log/"
    eval_log_dir = current_dir + "/log/eval/"
    trained_models_dir = current_dir + "/models/"
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(eval_log_dir, exist_ok=True)
    os.makedirs(trained_models_dir, exist_ok=True)

    is_discrete = (_algo_name == 'DQN')

    panda_env = HERGoalEnvWrapper(CustomMonitor(get_environment(_lock_rotation=_lock_rotation,
                                                                _is_discrete=is_discrete), log_dir))
    eval_env = HERGoalEnvWrapper(CustomMonitor(get_environment(_lock_rotation=_lock_rotation,
                                                               _is_discrete=is_discrete), eval_log_dir))

    callbacks = []
    if _save_freq > 0:
        callbacks.append(CheckpointCallback(_save_freq, trained_models_dir))
    callbacks.append(MeanHundredEpsTensorboardCallback(log_dir))
    callbacks.append(StdHundredEpsTensorboardCallback(log_dir))
    callbacks.append(SuccessRateTensorboardCallback(log_dir))
    if _algo_name == 'DDPG':
        callbacks.append(SaveOnBestTrainingRewardCallback(10000, log_dir))
    else:
        if _eval_freq > 0:
            callbacks.append(EvalCallback(eval_env,
                                          best_model_save_path=trained_models_dir,
                                          log_path=log_dir,
                                          eval_freq=_eval_freq,
                                          deterministic=True,
                                          render=False,
                                          n_eval_episodes=_eval_num))

    time_steps = hyperparams.pop('n_timesteps') if hyperparams.get('n_timesteps') is not None else None

    param_noise = None
    action_noise = None
    if hyperparams.get('noise_type') is not None:
        noise_type = hyperparams.pop('noise_type').strip()
        if 'ornstein-uhlenbeck' in noise_type:
            n_actions = panda_env.action_space.shape[-1]
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                        sigma=float(0.005) * np.ones(n_actions))
        elif 'param_noise' in noise_type:
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)

    # add action noise for DDPG or TD3; for DQN, exploration noise is already configured via a flag in hyperparams
    if _algo_name == 'DDPG' or _algo_name == 'TD3':
        hyperparams['action_noise'] = action_noise

    # add hyperparams specific only for DDPG
    if _algo_name == 'DDPG':
        hyperparams['param_noise'] = param_noise
        hyperparams['eval_env'] = eval_env

    model = ALGOS[_algo_name](env=panda_env,
                              tensorboard_log="tensorboard/",
                              n_cpu_tf_sess=None,
                              **hyperparams)

    model.learn(total_timesteps=time_steps,
                callback=callbacks,
                tb_log_name=full_tag,
                log_interval=10)

    model.save(current_dir + "/" + full_tag + "_final")
Example #6
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """
    run the training of DDPG

    :param env_id: (str) the environment ID
    :param seed: (int) the initial random seed
    :param noise_type: (str) the desired noise types ('adaptive-param', 'normal' or 'ou'); multiple noise types can be
        combined by separating them with commas
    :param layer_norm: (bool) use layer normalization
    :param evaluation: (bool) enable evaluation of DDPG training
    :param kwargs: (dict) extra keyword arguments passed to the DDPG model
    """

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    start_time = 0
    if rank == 0:
        start_time = time.time()
    model = DDPG(policy=MlpPolicy,
                 env=env,
                 memory_policy=Memory,
                 eval_env=eval_env,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 memory_limit=int(1e6),
                 layer_norm=layer_norm,
                 verbose=2,
                 **kwargs)
    model.learn(total_timesteps=10000)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
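A minimal invocation sketch (the environment id, seed, and noise string are illustrative assumptions; MPI and the imports used above are assumed to be available):

run(env_id='MountainCarContinuous-v0', seed=0,
    noise_type='adaptive-param_0.2,ou_0.2',
    layer_norm=True, evaluation=False)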
Example #7
from stable_baselines import DDPG
from stable_baselines.ddpg.policies import LnMlpPolicy
from stable_baselines.ddpg.noise import AdaptiveParamNoiseSpec, OrnsteinUhlenbeckActionNoise
from stable_baselines.common.vec_env import DummyVecEnv
import numpy as np

powerenv = ActiveEnv()
powerenv.set_parameters({
    'state_space': ['sun', 'demand', 'imbalance'],
    'reward_terms': ['voltage', 'current', 'imbalance']
})

powerenv = DummyVecEnv([lambda: powerenv])
action_mean = np.zeros(powerenv.action_space.shape)
action_sigma = 0.3 * np.ones(powerenv.action_space.shape)
action_noise = OrnsteinUhlenbeckActionNoise(mean=action_mean,
                                            sigma=action_sigma)

param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.2,
                                     desired_action_stddev=0.01)

t_steps = 800000
logdir = 'C:\\Users\\vegar\\Dropbox\\Master\\logs'
powermodel = DDPG(
    LnMlpPolicy,
    powerenv,
    verbose=2,
    action_noise=action_noise,
    gamma=0.99,
    #param_noise=param_noise,
    tensorboard_log=logdir,
    memory_limit=int(800000),
    nb_train_steps=50,
    nb_rollout_steps=100,
    critic_lr=0.001)
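The original snippet is cut off at this point; a plausible continuation (an assumption, not part of the source) would train for the configured number of steps and save the result:

powermodel.learn(total_timesteps=t_steps)
powermodel.save('ddpg_active_power_env')  # hypothetical file name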
Example #8
File: ddpg.py  Project: s206283/gcrl
    def train(self, args, callback, env_kwargs=None, train_kwargs=None):
        env = self.makeEnv(args, env_kwargs=env_kwargs)

        if train_kwargs is None:
            train_kwargs = {}

        # Parse noise_type
        action_noise = None
        param_noise = None
        n_actions = env.action_space.shape[-1]
        if args.noise_param:
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=args.noise_param_sigma,
                                                 desired_action_stddev=args.noise_param_sigma)

        if train_kwargs.get("noise_action", args.noise_action) == 'normal':
            action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                             sigma=args.noise_action_sigma * np.ones(n_actions))
        elif train_kwargs.get("noise_action", args.noise_action) == 'ou':
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                        sigma=args.noise_action_sigma * np.ones(n_actions))

        # remove the noise-related keys from the hyperparameters; default values are set below in param_kwargs
        train_kwargs = {k: v for k, v in train_kwargs.items() if k not in ["noise_action_sigma", "noise_action"]}

        # get the associated policy for the architecture requested
        if args.srl_model == "raw_pixels":
            args.policy = "cnn"
        else:
            args.policy = "mlp"

        self.policy = args.policy
        self.ob_space = env.observation_space
        self.ac_space = env.action_space

        policy_fn = {'cnn': CnnPolicy,
                     'mlp': MlpPolicy}[args.policy]

        param_kwargs = {
            "verbose": 1,
            "render_eval": False,
            "render": False,
            "reward_scale": 1.,
            "param_noise": param_noise,
            "normalize_returns": False,
            "normalize_observations": (args.srl_model == "raw_pixels"),
            "critic_l2_reg": 1e-2,
            "actor_lr": 1e-4,
            "critic_lr": 1e-3,
            "action_noise": action_noise,
            "enable_popart": False,
            "gamma": 0.99,
            "clip_norm": None,
            "nb_train_steps": 100,
            "nb_rollout_steps": 100,
            "nb_eval_steps": 50,
            "batch_size": args.batch_size
        }

        self.model = self.model_class(policy_fn, env, **{**param_kwargs, **train_kwargs})
        self.model.learn(total_timesteps=args.num_timesteps, seed=args.seed, callback=callback)
        env.close()
Example #9
import gym
import numpy as np
from matplotlib import pyplot as plt
from stable_baselines import TD3, DDPG
from stable_baselines.ddpg.policies import MlpPolicy, LnMlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec

env = gym.make('gym_squeeze:squeeze-v0')

# the noise objects for DDPG
n_actions = env.action_space.shape[-1]
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1,
                                     desired_action_stddev=0.1,
                                     adoption_coefficient=1.01)
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=float(0.1) *
                                            np.ones(n_actions))

model = DDPG(LnMlpPolicy,
             env,
             verbose=1,
             param_noise=param_noise,
             action_noise=action_noise,
             tensorboard_log="./ppo1_squeeze_tensorboard_1/",
             full_tensorboard_log=True)
model.learn(total_timesteps=10000000)
model.save("ddpg_squeeze")

del model  # remove to demonstrate saving and loading
Example #10
def train_HER(env, out_dir, seed=None, **kwargs):
    # Logs will be saved in log_dir/monitor.csv
    global output_dir, log_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    policy = kwargs['policy']
    algo_name = kwargs['algo_name']
    n_timesteps = kwargs['n_timesteps']
    noise_type = None
    if 'noise_type' in kwargs:
        noise_type = kwargs['noise_type']
        del kwargs['noise_type']

    # HER Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = kwargs['goal_selection_strategy']
    n_sampled_goal = kwargs['n_sampled_goal']

    del kwargs['policy']
    del kwargs['algo_name']
    del kwargs['n_timesteps']
    del kwargs['goal_selection_strategy']
    del kwargs['n_sampled_goal']

    # Set agent algorithm
    agent = set_agent(algo_name)
    if not agent:
        print("invalid algorithm for HER")
        return

    # the noise objects
    nb_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = None

    if noise_type:

        for current_noise_type in noise_type.split(','):

            current_noise_type = current_noise_type.strip()

            if 'adaptive-param' in current_noise_type and algo_name == 'ddpg':
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))

            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))

            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))

            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    # Create learning rate schedule
    for key in ['learning_rate', 'learning_rate_pi', 'cliprange']:
        if key in kwargs:
            if isinstance(kwargs[key], str):
                schedule, initial_value = kwargs[key].split('_')
                initial_value = float(initial_value)
                kwargs[key] = linear_schedule(initial_value)
            elif isinstance(kwargs[key], float):
                kwargs[key] = constfn(kwargs[key])
            else:
                raise ValueError('Invalid value for {}: {}'.format(
                    key, kwargs[key]))

    kwargs['tensorboard_log'] = os.path.join(log_dir, 'tb')
    kwargs['full_tensorboard_log'] = False
    kwargs['seed'] = seed
    kwargs['action_noise'] = action_noise
    if algo_name == 'ddpg':
        kwargs['param_noise'] = param_noise

    if 'continue' in kwargs and kwargs['continue'] is True:
        # Continue training
        print("Loading pretrained agent")
        # Policy should not be changed
        for key in ['policy', 'policy_kwargs']:
            if key in kwargs:
                del kwargs[key]

        model = HER.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         verbose=1,
                         **kwargs)
    else:
        if 'continue' in kwargs:
            del kwargs['continue']
        model = HER(policy,
                    env,
                    agent,
                    goal_selection_strategy=goal_selection_strategy,
                    n_sampled_goal=n_sampled_goal,
                    verbose=1,
                    **kwargs)

    model.learn(total_timesteps=n_timesteps, callback=log_callback)

    return model
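A minimal sketch of how this helper might be invoked (the environment, algorithm choice, and hyperparameter values are illustrative assumptions):

import gym

env = gym.make('FetchReach-v1')
model = train_HER(env, out_dir='./runs/her_fetchreach', seed=0,
                  policy='MlpPolicy',
                  algo_name='sac',
                  n_timesteps=50000,
                  goal_selection_strategy='future',
                  n_sampled_goal=4,
                  noise_type='normal_0.1')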
Example #11
def train(env_id, num_timesteps, seed, model_path=None, images=False):
    """
    Train PPO2 model for Mujoco environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    """
    def make_env():
        if images:
            env_out = GymWrapper(
                suite.make(
                    "SawyerLift",
                    use_object_obs=False,
                    use_camera_obs=True,  # use pixel observations
                    has_offscreen_renderer=True,  # needed for pixel observations
                    has_renderer=False,  # no on-screen rendering
                    camera_depth=True,
                    reward_shaping=True,  # use dense rewards
                    control_freq=10,  # control should happen fast enough so that simulation looks smooth
                    render_visual_mesh=False,
                ),
                keys=["image", "depth"],
                images=True,
            )
        else:
            env_out = GymWrapper(
                suite.make(
                    "SawyerLift",
                    use_object_obs=True,
                    use_camera_obs=False,  # do not use pixel observations
                    has_offscreen_renderer=False,  # not needed since not using pixel obs
                    has_renderer=False,  # no on-screen rendering
                    camera_depth=False,
                    reward_shaping=True,  # use dense rewards
                    control_freq=10,  # control should happen fast enough so that simulation looks smooth
                    render_visual_mesh=False,
                )  #, keys=["image", "depth"], images=True,
            )
        env_out.reward_range = None
        env_out.metadata = None
        env_out.spec = None
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    #env = make_env()

    if images:
        env = DummyVecEnv([make_env])
        env = VecNormalize(env)

        set_global_seeds(seed)
        policy = CnnPolicy
        tblog = "/cvgl2/u/surajn/workspace/tb_logs/sawyerlift_all/"
    else:
        env = DummyVecEnv([make_env])
        env = VecNormalize(env)

        set_global_seeds(seed)
        policy = MlpPolicy
        tblog = "/cvgl2/u/surajn/workspace/tb_logs/sawyerlift_all/"
    nb_actions = env.action_space.shape[-1]
    #model = PPO2(policy=policy, env=env, n_steps=2048, nminibatches=32, lam=0.95, gamma=0.99, noptepochs=10,
    #             ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, verbose=1, tensorboard_log=tblog)
    #model = TRPO(policy=policy, env, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1, entcoeff=0.0,
    #                 gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3, tensorboard_log=tblog, verbose=1)
    model = DDPG(policy=ddpgMlpPolicy,
                 env=env,
                 memory_policy=Memory,
                 eval_env=None,
                 param_noise=AdaptiveParamNoiseSpec(initial_stddev=0.2,
                                                    desired_action_stddev=0.2),
                 action_noise=OrnsteinUhlenbeckActionNoise(
                     mean=np.zeros(nb_actions),
                     sigma=float(0.2) * np.ones(nb_actions)),
                 memory_limit=int(1e6),
                 verbose=2,
                 tensorboard_log=tblog)

    model.learn(total_timesteps=num_timesteps)
    env.close()

    if model_path:
        model.save(model_path)
        #tf_util.save_state(model_path)

    return model, env
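A possible invocation (the timestep budget and save path are assumptions):

model, env = train(env_id='SawyerLift', num_timesteps=100000, seed=0,
                   model_path='./sawyer_lift_ddpg.pkl', images=False)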
Example #12
def train_DDPG(env, out_dir, seed=None, **kwargs):

    # Logs will be saved in log_dir/monitor.csv
    global output_dir, log_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    policy = kwargs['policy']
    n_timesteps = kwargs['n_timesteps']
    noise_type = kwargs['noise_type']
    del kwargs['policy']
    del kwargs['n_timesteps']
    del kwargs['noise_type']
    ''' Parameter space noise:
    injects randomness directly into the parameters of the agent, altering the types of decisions it makes
    such that they always fully depend on what the agent currently senses. '''

    # the noise objects for DDPG
    nb_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = None
    if noise_type is not None:

        for current_noise_type in noise_type.split(','):

            current_noise_type = current_noise_type.strip()

            if 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))

            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))

            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))

            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    if 'continue' in kwargs and kwargs['continue'] is True:
        # Continue training
        print("Loading pretrained agent")
        # Policy should not be changed
        del kwargs['policy']
        model = DDPG.load(os.path.join(out_dir, 'final_model.pkl'),
                          env=env,
                          tensorboard_log=os.path.join(log_dir, 'tb'),
                          verbose=1,
                          **kwargs)
    else:
        if 'continue' in kwargs:
            del kwargs['continue']
        model = DDPG(policy,
                     env,
                     param_noise=param_noise,
                     action_noise=action_noise,
                     seed=seed,
                     verbose=1,
                     tensorboard_log=os.path.join(log_dir, 'tb'),
                     full_tensorboard_log=False,
                     **kwargs)

    model.learn(total_timesteps=n_timesteps, callback=log_callback)

    return model
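A minimal sketch of a call to this trainer (the environment, policy name, and hyperparameter values are assumptions):

import gym

env = gym.make('Pendulum-v0')
model = train_DDPG(env, out_dir='./runs/ddpg_pendulum', seed=0,
                   policy='LnMlpPolicy',
                   n_timesteps=100000,
                   noise_type='adaptive-param_0.2,ou_0.1')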
Example #13
import numpy as np
from stable_baselines.ddpg.noise import AdaptiveParamNoiseSpec, NormalActionNoise
from stable_baselines.common.vec_env import DummyVecEnv

# environment
# env = OsmoEnv()
# env = Monitor(env, log_dir, allow_early_resets=True)
env = DummyVecEnv([lambda: OsmoEnv()])

# parameters(for training)
tau = 0.1  # update rate for target model
gamma = 0.95  # discount rate for q value.
# batch_size = NUMCONC*5+3    # size of batch
batch_size = 10
alr = 0.003  # actor learning rate
clr = 0.003  # critic learning rate

# noise(to better exploration)
n_actions = env.action_space.shape[-1]
param_noise = AdaptiveParamNoiseSpec()
# action_noise = None
# param_noise = None
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=float(0.5) *
                                 np.ones(n_actions))  # A gaussian action noise

# model (DDPG)
#        Deep Deterministic Policy Gradient.
#        DDPG combines ideas from Nature DQN, Actor-Critic and DPG; it is designed to tackle continuous action spaces.
#        Policy learning:
#        The policy function (actor) takes the state as input and is updated with the policy gradient.
#        Q-learning:
#        The value function (critic) takes state and action as input and is adjusted to minimize the loss.
#        Q-learning with a function approximator is largely based on minimizing the mean-squared Bellman error (MSBE) loss,
#        with two main tricks: a replay buffer and a target network.
#        The replay buffer stores experience, since DDPG is an off-policy algorithm.
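The snippet stops before the model itself is built; a plausible continuation using the parameters defined above (the policy choice and training length are assumptions, not part of the source) would be:

from stable_baselines import DDPG
from stable_baselines.ddpg.policies import MlpPolicy

model = DDPG(MlpPolicy, env,
             gamma=gamma, tau=tau, batch_size=batch_size,
             actor_lr=alr, critic_lr=clr,
             param_noise=param_noise, action_noise=action_noise,
             verbose=1)
model.learn(total_timesteps=100000)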