def _init_noise(self, noise_type):
    action_noise = None
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(
                    mu=np.zeros(self.action_shape[-1]),
                    sigma=float(stddev) * np.ones(self.action_shape[-1]))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(self.action_shape[-1]),
                    sigma=float(stddev) * np.ones(self.action_shape[-1]))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))
    return action_noise, param_noise
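# --- Illustration (not part of the original code) ---
# The noise_type string used throughout these snippets is a comma-separated
# list of '<kind>_<stddev>' entries, split on '_'. A minimal sketch of how the
# helper above would be called, assuming `self.action_shape == (4,)`:
#
#     action_noise, param_noise = agent._init_noise('adaptive-param_0.2,ou_0.1')
#     # param_noise  -> AdaptiveParamNoiseSpec(initial_stddev=0.2, desired_action_stddev=0.2)
#     # action_noise -> OrnsteinUhlenbeckActionNoise(mu=zeros(4), sigma=0.1 * ones(4))
#     # passing 'none' (or noise_type=None) leaves both set to None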
def run(cfg, seed, noise_type, layer_norm, evaluation, architecture, **kwargs):
    if MPI.COMM_WORLD.Get_rank() == 0:
        dir_path = os.path.dirname(os.path.realpath(__file__))
        logger.configure(dir_path, ['stdout'])

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = GRLEnv(cfg)
    gym.logger.setLevel(logging.WARN)
    env = MyMonitor(env, os.path.join(logger.get_dir(), kwargs['output']))

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev, theta = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        dt=0.03,
                                                        sigma=float(stddev) * np.ones(nb_actions),
                                                        theta=float(theta) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = MyCritic(layer_norm=layer_norm, architecture=architecture)
    actor = MyActor(nb_actions, layer_norm=layer_norm, architecture=architecture)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()

    training.train(env=env, param_noise=param_noise, action_noise=action_noise,
                   actor=actor, critic=critic, memory=memory, **kwargs)

    env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def learn( network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=20, nb_rollout_steps=100, reward_scale=1.0, render=False, render_eval=False, noise_type='adaptive-param_0.2', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=50, # per epoch cycle and MPI worker, nb_eval_steps=100, batch_size=64, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=50, **network_kwargs): set_global_seeds(seed) if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 rank = MPI.COMM_WORLD.Get_rank() nb_actions = env.action_space.shape[-1] assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] episode_reward = np.zeros(nenvs, dtype=np.float32) #vector episode_step = np.zeros(nenvs, dtype=int) # vector episodes = 0 #scalar t = 0 # scalar epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. if nenvs > 1: # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each # of the environments, so resetting here instead agent.reset() for t_rollout in range(nb_rollout_steps): # Predict next action. 
action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True) # Execute next action. if rank == 0 and render: env.render() # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) # note these outputs are batched from vecenv t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition( obs, action, r, new_obs, done ) #the batched data will be unrolled in memory.py's append. obs = new_obs for d in range(len(done)): if done[d]: # Episode done. epoch_episode_rewards.append(episode_reward[d]) episode_rewards_history.append(episode_reward[d]) epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 if nenvs == 1: agent.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append( eval_episode_reward[d]) eval_episode_reward[d] = 0.0 mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. 
if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array( [np.array(x).flatten()[0] for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) return agent
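# --- Illustration (not part of the original code) ---
# The stats block above sums each scalar across MPI workers with allreduce and
# then divides by the world size to obtain a per-worker average. A minimal,
# self-contained sketch of that pattern with mpi4py:
from mpi4py import MPI
import numpy as np

def mpi_average(values):
    """Return the element-wise mean of `values` across all MPI ranks."""
    comm = MPI.COMM_WORLD
    local = np.asarray(values, dtype=np.float64)
    summed = comm.allreduce(local)   # default op is SUM, applied element-wise
    return summed / comm.Get_size()
# Note that the original code keeps only the first element of each flattened
# stat (`np.array(x).flatten()[0]`), so list-valued entries such as
# 'eval/return' contribute a single value to the average.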
def __init__(self, network, env, gamma=1, tau=0.01, total_timesteps=1e6, normalize_observations=True, normalize_returns=False, enable_popart=False, noise_type='adaptive-param_0.2', clip_norm=None, reward_scale=1., batch_size=128, l2_reg_coef=0.2, actor_lr=1e-4, critic_lr=1e-3, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), **network_kwargs): # logger.info('Using agent with the following configuration:') # logger.info(str(self.__dict__.items())) observation_shape = env.observation_space.shape action_shape = env.action_space.shape # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.env = env self.gamma = gamma self.tau = tau self.total_timesteps = total_timesteps self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.enable_popart = enable_popart self.clip_norm = clip_norm self.reward_scale = reward_scale self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.batch_size = batch_size self.actor_lr = actor_lr self.critic_lr = critic_lr self.l2_reg_coef = l2_reg_coef self.stats_sample = None self.action_noise = None self.param_noise = None nb_actions = self.env.action_space.shape[-1] if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') self.param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') self.action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') self.action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. self.memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) self.critic = Critic(network=network, **network_kwargs) self.actor = Actor(nb_actions, network=network, **network_kwargs) # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. 
if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(self.actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(self.critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. self.actor_tf = self.actor(normalized_obs0) self.normalized_critic_tf = self.critic(normalized_obs0, self.actions) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = self.critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.initial_state = None # recurrent architectures not supported yet self.def_path_pre = os.path.dirname(os.path.abspath(__file__)) + '/tmp/'
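# --- Illustration (not part of the original code) ---
# setup_target_network_updates() above is expected to build the standard DDPG
# soft target updates, theta_target <- tau * theta + (1 - tau) * theta_target.
# A minimal TF1-style sketch (the helper and argument names here are
# hypothetical, not from this codebase):
import tensorflow as tf

def make_soft_updates(online_vars, target_vars, tau):
    """Group of ops nudging each target variable towards its online counterpart."""
    updates = [tf.assign(t, (1. - tau) * t + tau * o)
               for o, t in zip(online_vars, target_vars)]
    return tf.group(*updates)
# The critic's Bellman target is the `target_Q` tensor built above:
#     target_Q = r + (1 - terminal) * gamma * Q_target(s', pi_target(s')).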
def __init__( self, env, gamma, total_timesteps, network='mlp', nb_rollout_steps=100, reward_scale=1.0, noise_type='adaptive-param_0.2', normalize_returns=False, normalize_observations=False, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, popart=False, clip_norm=None, nb_train_steps=50, # per epoch cycle and MPI worker, <- HERE! nb_eval_steps=100, buffer_size=1000000, batch_size=64, # per MPI worker tau=0.01, param_noise_adaption_interval=50, **network_kwargs): # Adjusting hyper-parameters by considering the number of options policies to learn num_options = env.get_number_of_options() buffer_size = num_options * buffer_size batch_size = num_options * batch_size observation_space = env.option_observation_space action_space = env.option_action_space nb_actions = action_space.shape[-1] assert (np.abs(action_space.low) == action_space.high ).all() # we assume symmetric actions. memory = Memory(limit=buffer_size, action_shape=action_space.shape, observation_shape=observation_space.shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) max_action = action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, observation_space.shape, action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) sess = U.get_session() # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() # Variables that are used during learning self.agent = agent self.memory = memory self.max_action = max_action self.batch_size = batch_size self.nb_train_steps = nb_train_steps self.nb_rollout_steps = nb_rollout_steps self.param_noise_adaption_interval = param_noise_adaption_interval
def train(self, env_fn, num_timesteps, noise_type, layer_norm, folder, load_policy, video_width, video_height, plot_rewards, save_every=50, seed=1234, episode_length=1000, pi_hid_size=150, pi_num_hid_layers=3, render_frames=_render_frames, **kwargs): num_cpu = self.workers if sys.platform == 'darwin': num_cpu //= 2 config = tf.ConfigProto( allow_soft_placement=True, intra_op_parallelism_threads=num_cpu, inter_op_parallelism_threads=num_cpu) if self.gpu_usage is None or self.gpu_usage <= 0.: os.environ["CUDA_VISIBLE_DEVICES"] = "-1" else: config.gpu_options.allow_growth = True # pylint: disable=E1101 config.gpu_options.per_process_gpu_memory_fraction = self.gpu_usage / self.workers tf.Session(config=config).__enter__() worker_seed = seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(worker_seed) tf.set_random_seed(worker_seed) np.random.seed(worker_seed) save_every = max(1, save_every) env = env_fn() env.seed(worker_seed) rank = MPI.COMM_WORLD.Get_rank() logger.info('rank {}: seed={}, logdir={}'.format(rank, worker_seed, logger.get_dir())) def policy_fn(name, ob_space, ac_space): return mlp_policy.MlpPolicy( name=name, ob_space=ob_space, ac_space=ac_space, hid_size=pi_hid_size, num_hid_layers=pi_num_hid_layers) env = bench.Monitor( env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)), allow_early_resets=True) gym.logger.setLevel(logging.INFO) that = self iter_name = 'iters_so_far' if self.method == 'sql': iter_name = 'epoch' # TODO replace with utils.create_callback(...) def callback(locals, globals): if that.method != "ddpg": if load_policy is not None and locals[iter_name] == 0: # noinspection PyBroadException try: utils.load_state(load_policy) if MPI.COMM_WORLD.Get_rank() == 0: logger.info("Loaded policy network weights from %s." % load_policy) # save TensorFlow summary (contains at least the graph definition) except: logger.error("Failed to load policy network weights from %s." % load_policy) if MPI.COMM_WORLD.Get_rank() == 0 and locals[iter_name] == 0: _ = tf.summary.FileWriter(folder, tf.get_default_graph()) if MPI.COMM_WORLD.Get_rank() == 0 and locals[iter_name] % save_every == 0: print('Saving video and checkpoint for policy at iteration %i...' % locals[iter_name]) ob = env.reset() images = [] rewards = [] max_reward = 1. # if any reward > 1, we have to rescale lower_part = video_height // 5 for i in range(episode_length): if that.method == "ddpg": ac, _ = locals['agent'].pi(ob, apply_noise=False, compute_Q=False) elif that.method == "sql": ac, _ = locals['policy'].get_action(ob) elif isinstance(locals['pi'], GaussianMlpPolicy): ac, _, _ = locals['pi'].act(np.concatenate((ob, ob))) else: ac, _ = locals['pi'].act(False, ob) ob, rew, new, _ = env.step(ac) images.append(render_frames(env)) if plot_rewards: rewards.append(rew) max_reward = max(rew, max_reward) if new: break orange = np.array([255, 163, 0]) red = np.array([255, 0, 0]) video = [] width_factor = 1. 
/ episode_length * video_width for i, imgs in enumerate(images): for img in imgs: img[-lower_part, :10] = orange img[-lower_part, -10:] = orange if episode_length < video_width: p_rew_x = 0 for j, r in enumerate(rewards[:i]): rew_x = int(j * width_factor) if r < 0: img[-1:, p_rew_x:rew_x] = red img[-1:, p_rew_x:rew_x] = red else: rew_y = int(r / max_reward * lower_part) img[-rew_y - 1:, p_rew_x:rew_x] = orange img[-rew_y - 1:, p_rew_x:rew_x] = orange p_rew_x = rew_x else: for j, r in enumerate(rewards[:i]): rew_x = int(j * width_factor) if r < 0: img[-1:, rew_x] = red img[-1:, rew_x] = red else: rew_y = int(r / max_reward * lower_part) img[-rew_y - 1:, rew_x] = orange img[-rew_y - 1:, rew_x] = orange video.append(np.hstack(imgs)) imageio.mimsave( os.path.join(folder, "videos", "%s_%s_iteration_%i.mp4" % (that.environment, that.method, locals[iter_name])), video, fps=60) env.reset() if that.method != "ddpg": utils.save_state(os.path.join(that.folder, "checkpoints", "%s_%i" % (that.environment, locals[iter_name]))) if self.method == "ppo": pposgd_simple.learn( env, policy_fn, max_timesteps=int(num_timesteps), timesteps_per_actorbatch=1024, # 256 clip_param=0.2, entcoeff=0.01, optim_epochs=4, optim_stepsize=1e-3, # 1e-3 optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', # 'linear' callback=callback) elif self.method == "trpo": trpo_mpi.learn( env, policy_fn, max_timesteps=int(num_timesteps), timesteps_per_batch=1024, max_kl=0.1, # 0.01 cg_iters=10, cg_damping=0.1, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3, callback=callback) elif self.method == "acktr": from algos.acktr import acktr with tf.Session(config=tf.ConfigProto()): ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] with tf.variable_scope("vf"): vf = NeuralNetValueFunction(ob_dim, ac_dim) with tf.variable_scope("pi"): policy = GaussianMlpPolicy(ob_dim, ac_dim) acktr.learn( env, pi=policy, vf=vf, gamma=0.99, lam=0.97, timesteps_per_batch=1024, desired_kl=0.01, # 0.002 num_timesteps=num_timesteps, animate=False, callback=callback) elif self.method == "ddpg": from algos.ddpg import ddpg # Parse noise_type action_noise = None param_noise = None nb_actions = env.action_space.shape[-1] for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') from baselines.ddpg.noise import AdaptiveParamNoiseSpec param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') from baselines.ddpg.noise import NormalActionNoise action_noise = NormalActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: from baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) # Configure components. 
memory = Memory( limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) critic = Critic(layer_norm=layer_norm) actor = Actor(nb_actions, layer_norm=layer_norm) ddpg.train( env=env, eval_env=None, param_noise=param_noise, render=False, render_eval=False, action_noise=action_noise, actor=actor, critic=critic, memory=memory, callback=callback, **kwargs) elif self.method == "sql": from softqlearning.algorithms import SQL from softqlearning.misc.kernel import adaptive_isotropic_gaussian_kernel from softqlearning.misc.utils import timestamp from softqlearning.replay_buffers import SimpleReplayBuffer from softqlearning.value_functions import NNQFunction from softqlearning.policies import StochasticNNPolicy from rllab.envs.gym_env import GymEnv env = GymEnv(env) variant = { 'seed': [1, 2, 3], 'policy_lr': 3E-4, 'qf_lr': 3E-4, 'discount': 0.99, 'layer_size': 128, 'batch_size': 128, 'max_pool_size': 1E6, 'n_train_repeat': 1, 'epoch_length': 1000, 'snapshot_mode': 'last', 'snapshot_gap': 100, } pool = SimpleReplayBuffer( env_spec=env.spec, max_replay_buffer_size=variant['max_pool_size'], ) base_kwargs = dict( min_pool_size=episode_length, epoch_length=episode_length, n_epochs=num_timesteps, max_path_length=episode_length, batch_size=variant['batch_size'], n_train_repeat=variant['n_train_repeat'], eval_render=False, eval_n_episodes=1, iter_callback=callback ) qf = NNQFunction( env_spec=env.spec, hidden_layer_sizes=tuple([pi_hid_size] * pi_num_hid_layers), ) pi_layers = tuple([pi_hid_size] * pi_num_hid_layers) policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=pi_layers) algorithm = SQL( base_kwargs=base_kwargs, env=env, pool=pool, qf=qf, policy=policy, kernel_fn=adaptive_isotropic_gaussian_kernel, kernel_n_particles=32, kernel_update_ratio=0.5, value_n_particles=16, td_target_update_interval=1000, qf_lr=variant['qf_lr'], policy_lr=variant['policy_lr'], discount=variant['discount'], reward_scale=1, save_full_state=False, ) algorithm.train() else: print('ERROR: Invalid "method" argument provided.', file=sys.stderr) env.close()
def learn( network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=20, nb_rollout_steps=100, reward_scale=1.0, render=False, render_eval=False, noise_type='adaptive-param_0.2', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=50, # per epoch cycle and MPI worker, nb_eval_steps=100, batch_size=64, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=50, shared_critic=False, save_rate=1, save_model=True, save_actions=True, restore=False, saver=None, **network_kwargs): if save_model or save_actions: assert saver is not None set_global_seeds(seed) if shared_critic: raise NotImplementedError() if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 if MPI is not None: rank = MPI.COMM_WORLD.Get_rank() else: rank = 0 nb_actions_n = [ action_space.shape[-1] for action_space in env.action_space ] assert np.array([(np.abs(action_space.low) == action_space.high) for action_space in env.action_space ]).all() # we assume symmetric actions. action_noise_n = [] param_noise_n = [] if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise_n = [ AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) for _ in range(env.n) ] action_noise_n = [None for _ in range(env.n)] elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise_n = [None for _ in range(env.n)] action_noise_n = [ NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) for nb_actions in nb_actions_n ] elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise_n = [None for _ in range(env.n)] action_noise_n = [ OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) for nb_actions in nb_actions_n ] else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) max_action_n = [action_space.high for action_space in env.action_space] logger.info( 'scaling actions by {} before executing in env'.format(max_action_n)) agent = MADDPG(env, network, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise_n=action_noise_n, param_noise_n=param_noise_n, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale, shared_critic=shared_critic) if saver is not None and restore: saver.load_model() logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = [deque(maxlen=100) for _ in range(env.n)] # sess = U.get_session() # Prepare everything. 
agent.initialize() # sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nveh = obs.shape[0] episode_reward = np.zeros((nveh, 1), dtype=np.float32) # vector episode_step = np.zeros(nveh, dtype=int) # vector episodes = 0 # scalar t = 0 # scalar epoch = 0 start_time = time.time() epoch_episode_rewards = [[] for _ in range(env.n)] epoch_episode_steps = [[] for _ in range(env.n)] epoch_actions = [[] for _ in range(env.n)] epoch_qs = [[] for _ in range(env.n)] for epoch in range(nb_epochs): epoch_episodes = 0 for cycle in range(nb_epoch_cycles): from glog import info info("epoch %d, cycle %d" % (epoch, cycle)) # Perform rollouts. # if nveh > 1: # # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each # # of the environments, so resetting here instead agent.reset() env.reset() for t_rollout in range(nb_rollout_steps): # Predict next action. # todo no compute Q for now action_n, q_n, _, _ = agent.step(obs, apply_noise=True, compute_Q=True) if cycle == 0 and save_actions: saver.add_action(action_n) # Execute next action. if cycle == 0 and rank == 0 and render: env.render() # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch # todo max_action not scale yet new_obs, r, done, info = env.step(action_n) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) # note these outputs are batched from vecenv # set epoch and cycle for video recorder if isinstance(env, VecVideoRecorder): env.set_stample(epoch, cycle) t += 1 episode_reward += r episode_step += 1 # Book-keeping. for i in range(env.n): epoch_actions[i].append(action_n[i]) epoch_qs[i].append(q_n[i]) agent.store_transition( obs, action_n, r, new_obs, done ) # the batched data will be unrolled in memory.py's append. obs = new_obs terminal = (t_rollout == (nb_rollout_steps - 1)) if any(done) or terminal: agent.reset() env.reset() for d in range(len(done)): if done[d] or terminal: # Episode done. epoch_episode_rewards[d].append( episode_reward[d][0]) episode_rewards_history[d].append( episode_reward[d][0]) epoch_episode_steps[d].append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 if save_actions and cycle == 0: saver.save_actions(epoch, cycle) # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if agent.memory_nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. 
eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action_n * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append( eval_episode_reward[d]) eval_episode_reward[d] = 0.0 if MPI is not None: mpi_size = MPI.COMM_WORLD.Get_size() else: mpi_size = 1 if save_model and save_rate is not None and epoch % save_rate == 0: saver.save_model() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time # todo not record agent stats here # stats = agent.get_stats() # combined_stats = stats.copy() # todo simplified log combined_stats = {} for i in range(env.n): combined_stats['rollout/return_%d' % i] = np.mean( epoch_episode_rewards[i]) combined_stats['rollout/return_std_%d' % i] = np.std( epoch_episode_rewards[i]) combined_stats['rollout/return_history_%d' % i] = np.mean( episode_rewards_history[i]) combined_stats['rollout/return_history_std_%d' % i] = np.std( episode_rewards_history[i]) combined_stats['rollout/episode_steps_%d' % i] = np.mean( epoch_episode_steps[i]) combined_stats['rollout/Q_mean_%d' % i] = np.mean(epoch_qs[i]) combined_stats['rollout/actions_mean_%d' % i] = np.mean( epoch_actions[i]) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) # todo now only log mean reward combined_stats_sums = np.array( [np.array(x).flatten().mean() for x in combined_stats.values()]) if MPI is not None: combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) return agent
def learn( network, env, data_path='', model_path='./model/', model_name='ddpg_none_fuzzy_150', file_name='test', model_based=False, memory_extend=False, model_type='linear', restore=False, dyna_learning=False, seed=None, nb_epochs=5, # with default settings, perform 1M steps total nb_sample_cycle=5, nb_epoch_cycles=150, nb_rollout_steps=400, nb_model_learning=10, nb_sample_steps=50, nb_samples_extend=5, reward_scale=1.0, noise_type='normal_0.2', #'adaptive-param_0.2', ou_0.2, normal_0.2 normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=50, # per epoch cycle and MPI worker, batch_size=32, # per MPI worker tau=0.01, param_noise_adaption_interval=50, **network_kwargs): nb_actions = env.action_space.shape[0] memory = Memory(limit=int(1e5), action_shape=env.action_space.shape[0], observation_shape=env.observation_space.shape) if model_based: """ store fake_data""" fake_memory = Memory(limit=int(1e5), action_shape=env.action_space.shape[0], observation_shape=env.observation_space.shape) """ select model or not """ if model_type == 'gp': kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2)) dynamic_model = GaussianProcessRegressor(kernel=kernel) reward_model = GaussianProcessRegressor(kernel=kernel) elif model_type == 'linear': dynamic_model = LinearRegression() reward_model = LinearRegression() elif model_type == 'mlp': dynamic_model = MLPRegressor(hidden_layer_sizes=(100, ), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08) reward_model = MLPRegressor(hidden_layer_sizes=(100, ), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08) else: logger.info( "You need to give the model_type to fit the dynamic and reward!!!" 
) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) """ set noise """ action_noise = None param_noise = None if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) """action scale""" max_action = env.action_high_bound logger.info( 'scaling actions by {} before executing in env'.format(max_action)) """ agent ddpg """ agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape[0], gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) sess = U.get_session() if restore: agent.restore(sess, model_path, model_name) else: agent.initialize(sess) sess.graph.finalize() agent.reset() episodes = 0 epochs_rewards = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32) epochs_times = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32) epochs_steps = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32) epochs_states = [] for epoch in range(nb_epochs): logger.info( "======================== The {} epoch start !!! =========================" .format(epoch)) epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_times = [] epoch_actions = [] epoch_episode_states = [] epoch_qs = [] epoch_episodes = 0 for cycle in range(nb_epoch_cycles): start_time = time.time() obs, state, done = env.reset() obs_reset = cp.deepcopy(obs) episode_reward = 0. episode_step = 0 episode_states = [] logger.info( "================== The {} episode start !!! ===================" .format(cycle)) for t_rollout in range(nb_rollout_steps): logger.info( "================== The {} steps finish !!! 
===================" .format(t_rollout)) """ Predict next action """ action, q, _, _ = agent.step(obs, stddev, apply_noise=True, compute_Q=True) new_obs, next_state, r, done, safe_or_not, final_action = env.step( max_action * action, t_rollout) if safe_or_not is False: break episode_reward += r episode_step += 1 episode_states.append([ cp.deepcopy(state), cp.deepcopy(final_action), np.array(cp.deepcopy(r)), cp.deepcopy(next_state) ]) epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs state = next_state if done: break """ extend the memory """ if model_based and cycle > (nb_model_learning + 1) and memory_extend: pred_x = np.zeros((1, 18), dtype=np.float32) for j in range(nb_samples_extend): m_action, _, _, _ = agent.step(obs, stddev, apply_noise=True, compute_Q=False) pred_x[:, :12] = obs pred_x[:, 12:] = m_action m_new_obs = dynamic_model.predict(pred_x)[0] """ get real reward """ # state = env.inverse_state(m_new_obs) # m_reward = env.get_reward(state, m_action) m_reward = reward_model.predict(pred_x)[0] agent.store_transition(obs, m_action, m_reward, m_new_obs, done) """ generate new data and fit model""" if model_based and cycle > nb_model_learning: logger.info( "============================== Model Fit !!! ===============================" ) input_x = np.concatenate( (memory.observations0.data[:memory.nb_entries], memory.actions.data[:memory.nb_entries]), axis=1) input_y_obs = memory.observations1.data[:memory.nb_entries] input_y_reward = memory.rewards.data[:memory.nb_entries] dynamic_model.fit(input_x, input_y_obs) reward_model.fit(input_x, input_y_reward) if dyna_learning: logger.info( "========================= Collect data !!! =================================" ) pred_obs = np.zeros((1, 18), dtype=np.float32) for sample_index in range(nb_sample_cycle): fake_obs = obs_reset for t_episode in range(nb_sample_steps): fake_action, _, _, _ = agent.step(fake_obs, stddev, apply_noise=True, compute_Q=False) pred_obs[:, :12] = fake_obs pred_obs[:, 12:] = fake_action next_fake_obs = dynamic_model.predict(pred_obs)[0] fake_reward = reward_model.predict(pred_obs)[0] # next_fake_obs = dynamic_model.predict(np.concatenate((fake_obs, fake_action)))[0] # fake_reward = reward_model.predict(np.concatenate((fake_obs, fake_action)))[0] fake_obs = next_fake_obs fake_terminals = False fake_memory.append(fake_obs, fake_action, fake_reward, next_fake_obs, fake_terminals) """ noise decay """ stddev = float(stddev) * 0.95 duration = time.time() - start_time epoch_episode_rewards.append(episode_reward) epoch_episode_steps.append(episode_step) epoch_episode_times.append(cp.deepcopy(duration)) epoch_episode_states.append(cp.deepcopy(episode_states)) epochs_rewards[epoch, cycle] = episode_reward epochs_steps[epoch, cycle] = episode_step epochs_times[epoch, cycle] = cp.deepcopy(duration) logger.info( "============================= The Episode_Times:: {}!!! ============================" .format(epoch_episode_rewards)) logger.info( "============================= The Episode_Times:: {}!!! ============================" .format(epoch_episode_times)) epoch_episodes += 1 episodes += 1 """ Training process """ epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): logger.info("") # Adapt param noise, if necessary. 
if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() """ planning training """ if model_based and cycle > (nb_model_learning + 1) and dyna_learning: for t_train in range(nb_train_steps): # setting for adapt param noise, if necessary. if fake_memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) batch = fake_memory.sample(batch_size=batch_size) fake_cl, fake_al = agent.train_fake_data(batch) epoch_critic_losses.append(fake_cl) epoch_actor_losses.append(fake_al) agent.update_target_net() epochs_states.append(cp.deepcopy(epoch_episode_states)) # # save data np.save( data_path + 'train_reward_' + algorithm_name + '_' + noise_type + file_name, epochs_rewards) np.save( data_path + 'train_step_' + algorithm_name + '_' + noise_type + file_name, epochs_steps) np.save( data_path + 'train_states_' + algorithm_name + '_' + noise_type + file_name, epochs_states) np.save( data_path + 'train_times_' + algorithm_name + '_' + noise_type + file_name, epochs_times) # # agent save agent.store(model_path + 'train_model_' + algorithm_name + '_' + noise_type + file_name)
def run(args, seed, noise_type, layer_norm, evaluation, **kwargs):
    import time
    import os
    import baselines.ddpg.training as training
    from baselines.ddpg.models import Actor, Critic
    from baselines.ddpg.memory import Memory
    from baselines.ddpg.noise import AdaptiveParamNoiseSpec, NormalActionNoise, OrnsteinUhlenbeckActionNoise
    import tensorflow as tf

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = common.make_env(args)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = common.make_env(args)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()

    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)

    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def __init__(self, env, agent_index, sess,
             action_range=(-1., 1.),
             reward_scale=0.1,
             critic_l2_reg=1e-2,
             actor_lr=1e-4,
             critic_lr=1e-3,
             popart=False,
             gamma=0.975,
             clip_norm=10,
             batch_size=64,
             memory_size=1e6,
             tau=0.01,
             normalize_returns=False,
             normalize_observations=False,
             noise_type="adaptive-param_0.1",
             layer_norm=True,
             nb_layers=2,
             nb_neurons=64,
             activation='tanh',
             **network_kwargs):
    super(DDPGAgent, self).__init__(agent_index)
    # self.sess = sess
    self.nb_actions = env.action_space[agent_index].n
    print('agent action_space ' + str(env.action_space[agent_index].n))
    self.state_size = env.observation_space[agent_index].shape
    self.action_range = action_range

    with tf.variable_scope('ddpg_' + str(agent_index)):
        critic = Critic(name='critic_' + str(agent_index), layer_norm=layer_norm,
                        nb_layers=nb_layers, nb_neurons=nb_neurons)
        actor = Actor(self.nb_actions, name='actor_' + str(agent_index),
                      layer_norm=layer_norm, nb_neurons=nb_neurons,
                      activation=activation)
        memory = Memory(limit=int(memory_size), action_shape=(self.nb_actions, ),
                        observation_shape=self.state_size)

        action_noise = None
        param_noise = None
        if noise_type is not None:
            for current_noise_type in noise_type.split(','):
                current_noise_type = current_noise_type.strip()
                if current_noise_type == 'none':
                    pass
                elif 'adaptive-param' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    param_noise = AdaptiveParamNoiseSpec(
                        initial_stddev=float(stddev),
                        desired_action_stddev=float(stddev))
                elif 'normal' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    action_noise = NormalActionNoise(
                        mu=np.zeros(self.nb_actions),
                        sigma=float(stddev) * np.ones(self.nb_actions))
                elif 'ou' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    action_noise = OrnsteinUhlenbeckActionNoise(
                        mu=np.zeros(self.nb_actions),
                        sigma=float(stddev) * np.ones(self.nb_actions),
                        dt=env.world.dt,
                        theta=0.1)
                else:
                    raise RuntimeError('unknown noise type "{}"'.format(
                        current_noise_type))

        self.agent = DDPG(actor, critic, memory, self.state_size,
                          (self.nb_actions, ),
                          action_range=self.action_range,
                          gamma=gamma, tau=tau,
                          normalize_returns=normalize_returns,
                          normalize_observations=normalize_observations,
                          batch_size=batch_size,
                          action_noise=action_noise,
                          param_noise=param_noise,
                          critic_l2_reg=critic_l2_reg,
                          actor_lr=actor_lr,
                          critic_lr=critic_lr,
                          enable_popart=popart,
                          clip_norm=clip_norm,
                          reward_scale=reward_scale)

    logger.info('Using agent with the following configuration:')
    logger.info(str(self.agent.__dict__.items()))

    self.agent.initialize(sess)
    self.agent.reset()
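# --- Illustration (not part of the original code) ---
# The dt=env.world.dt and theta=0.1 arguments passed to
# OrnsteinUhlenbeckActionNoise above control the discretized OU process, which
# in the standard baselines implementation evolves as
#     x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, I),
# so tying dt to the simulation step keeps the noise correlation time
# consistent with the environment's physics step.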
def __init__(self, env_in):
    EnvLearner.__init__(self, env_in)
    # from baselines.ddpg.models import Actor, Critic

    # Parse noise_type
    action_noise = None
    param_noise = None
    noise_type = 'adaptive-param_0.2'
    layer_norm = True
    nb_actions = self.state_dim
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    self.buff_len = 10
    self.buffer = deque(self.buff_init * self.buff_len, maxlen=self.buff_len)
    obs_space = (self.buff_init[0].size * self.buff_len, )
    self.memory = Memory(limit=int(1e6),
                         action_shape=env_in.observation_space.shape,
                         observation_shape=obs_space)
    self.critic = models.Critic(layer_norm=layer_norm)
    self.actor = models.Actor(nb_actions, layer_norm=layer_norm)
    self.agent = DDPG(self.actor, self.critic, self.memory, obs_space,
                      env_in.observation_space.shape,
                      gamma=0.99, tau=0.01,
                      normalize_returns=False, normalize_observations=True,
                      batch_size=64,
                      action_noise=action_noise, param_noise=param_noise,
                      critic_l2_reg=1e-2,
                      actor_lr=1e-5, critic_lr=1e-5,
                      enable_popart=False, clip_norm=None,
                      reward_scale=1.)
def learn_setup( network, env, seed=None, total_timesteps=None, iterations=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=None, nb_rollout_steps=100, n_episodes=None, logspace=True, n_steps_per_episode=None, reward_threshold=0, reward_scale=1.0, render=False, render_eval=False, noise_type='adaptive-param_0.2', noise_level="0.2", normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, exp_name="test", actor_lr=1e-4, critic_lr=1e-3, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=50, # per epoch cycle and MPI worker, nb_eval_steps=100, batch_size=64, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=50, **network_kwargs): if logspace: actor_lr = 10**-actor_lr critic_lr = 10**-critic_lr batch_size = 2**int(batch_size) if seed is None: seed = 17 seed = int(seed) tau = 10**-tau set_global_seeds(seed) if nb_epoch_cycles is None: nb_epoch_cycles = n_episodes nb_rollout_steps = n_steps_per_episode else: input("Not using automated interface? ") if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) if MPI is not None: rank = MPI.COMM_WORLD.Get_rank() else: rank = 0 nb_actions = env.action_space.shape[-1] assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. memory = Memory(limit=int(1e5), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') stddev = noise_level param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) max_action = env.action_space.high #print("actual max action", max_action) max_action = 1 logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() # Prepare everything. 
agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] episode_reward = np.zeros(nenvs, dtype=np.float32) #vector episode_step = np.zeros(nenvs, dtype=int) # vector episodes = 0 #scalar t = 0 # scalar epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 local_variables = { "epoch_episode_rewards": epoch_episode_rewards, "epoch_episode_steps": epoch_episode_steps, "batch_size": batch_size, "eval_env": eval_env, "reward_threshold": reward_threshold, "epoch_actions": epoch_actions, "nb_train_steps": nb_train_steps, "epoch_qs": epoch_qs, "start_time": start_time, "epoch_episodes": [epoch_episodes], "nb_epoch_cycles": nb_epoch_cycles, "nb_rollout_steps": nb_rollout_steps, "agent": agent, "memory": memory, "max_action": max_action, "env": env, "nenvs": nenvs, "obs": [obs], #Forgive me 6.031 "t": [t], "episode_reward": episode_reward, "episode_rewards_history": episode_rewards_history, "episode_step": episode_step, "episodes": [episodes], "rank": rank, "param_noise_adaption_interval": param_noise_adaption_interval, "noise_type": noise_type, "render": render } return local_variables
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """
    run the training of DDPG

    :param env_id: (str) the environment ID
    :param seed: (int) the initial random seed
    :param noise_type: (str) the wanted noises ('adaptive-param', 'normal' or 'ou'), can use multiple noise types
        by separating them with commas
    :param layer_norm: (bool) use layer normalization
    :param evaluation: (bool) enable evaluation of DDPG training
    :param kwargs: (dict) extra keywords for the training.train function
    """
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
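# Hedged sketch: run() above and the other entry points in this file all share
# the same noise_type convention, a comma-separated list of tokens of the form
# <name>_<stddev>, e.g. "adaptive-param_0.2" or "ou_0.2,adaptive-param_0.1".
# The standalone parser below only mirrors that branching without the baselines
# noise classes, to make the accepted grammar explicit; it is illustrative, not
# the library API.

def describe_noise_spec(noise_type):
    parsed = []
    for token in noise_type.split(','):
        token = token.strip()
        if token == 'none':
            continue
        name, stddev = token.split('_')  # same split the real code relies on
        if 'adaptive-param' in name:
            parsed.append(('param_noise', float(stddev)))
        elif 'normal' in name or 'ou' in name:
            parsed.append(('action_noise', name, float(stddev)))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(token))
    return parsed

print(describe_noise_spec('ou_0.2,adaptive-param_0.1'))
# [('action_noise', 'ou', 0.2), ('param_noise', 0.1)]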
def learn(
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=20,
        nb_rollout_steps=100,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        actor_l2_reg=0.0,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker
        nb_eval_steps=1000,
        batch_size=64,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=50,
        rb_size=1e6,
        save_interval=1,
        pretrain_epochs=0,
        load_path=None,
        demos_path=None,
        bc_teacher_lambda=0.0,
        use_qfilter=False,
        **network_kwargs):
    """Learns policy using DDPG, with vectorized environments.

    If we pass other arguments that aren't specified here, they are considered
    as network_kwargs.

    Parameters
    ----------
    noise_type: for noise to be added to the behavior policy. They are NOT
        using the noise type from the paper but 'AdaptiveParamNoiseSpec'. I
        _think_ that if one does the OU process, we get action noise, but not
        parameter noise. Also, be sure to use the `name_stddev` convention
        (e.g., `ou_0.2`), as the code will split the argument at the
        underscores.
    actor_lr: 1e-4 (matches paper)
    critic_lr: 1e-3 (matches paper)
    critic_l2: 1e-2 (matches paper)
    gamma: 0.99 (matches paper)
    batch_size: 64 (matches paper for lower-dim env obs/states)
    tau: 0.01 for soft target updates of actor and critic nets. Paper used
        0.001.
    nb_epoch_cycles: number of times we go through this cycle of: (1) get
        rollouts with noise added to policy and apply to replay buffer, (2)
        gradient updates for actor/critic, (3) evaluation rollouts (if any).
        AFTER all of these cycles happen, THEN we log statistics.
    nb_rollout_steps: number of steps in each parallel env we take with
        exploration policy without training, so this is just to populate the
        replay buffer. More parallel envs *should* mean that we get more
        samples in the buffer between each gradient update of the network, so
        this might need to be environment *and* machine (# of CPUs) specific.
    nb_train_steps: after doing `nb_rollout_steps` in each parallel env, we do
        this many updates; each involves sampling from the replay buffer and
        updating the actor and critic (via lagged target updates).
    nb_eval_steps: 1000, I changed from the 100 as default. Using 1000 ensures
        that fixed length envs like Ant-v2 can get one full episode (assuming
        one parallel env) during evaluation stages.
    eval_env: A separate environment for evaluation only, where no noise is
        applied, similar to how rlkit does it.
    save_interval: Frequency between saving.
    """
    set_global_seeds(seed)

    # Daniel: this helps to maintain compatibility with PPO2 code. For now
    # we're ignoring it, but we should check that we're always clipping. I
    # changed the nb_epochs to match with PPO2 in that we divide by nenvs.
    if 'limit_act_range' in network_kwargs:
        network_kwargs.pop('limit_act_range')
    nenvs = env.num_envs

    nbatchsize = nenvs * nb_epoch_cycles * nb_rollout_steps
    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // nbatchsize
    else:
        nb_epochs = 500

    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0

    # we assume symmetric actions.
    nb_actions = env.action_space.shape[-1]
    assert (np.abs(env.action_space.low) == env.action_space.high).all()

    # Form XP (1M steps, same as in paper), and critic/actor networks.
    # Daniel: force dtype here so we can use uint8 type images.
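# Hedged sketch: with vectorized envs, the docstring above implies that one
# "epoch" of learn() consumes nenvs * nb_epoch_cycles * nb_rollout_steps
# environment steps before statistics are logged. A quick check of that
# bookkeeping with made-up numbers (these particular values are illustrative,
# not defaults taken from the paper):

nenvs = 4
nb_epoch_cycles = 20
nb_rollout_steps = 100
total_timesteps = 1_000_000

nbatchsize = nenvs * nb_epoch_cycles * nb_rollout_steps  # 8000 env steps per epoch
nb_epochs = int(total_timesteps) // nbatchsize           # 125 logging epochs
print(nbatchsize, nb_epochs)  # 8000 125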
assert env.observation_space.low.dtype == env.observation_space.high.dtype memory = Memory(limit=int(rb_size), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape, dtype=env.observation_space.low.dtype) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') #action_noise = NormalActionNoise(mu=np.zeros(nb_actions), # sigma=float(stddev)*np.ones(nb_actions)) #if nenvs > 1: # Daniel: adding this to replace the former. action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions), shape=(nenvs, nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) # The `learn` defaults above have priority over defaults in DDPG class. agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, actor_l2_reg=actor_l2_reg, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale, bc_teacher_lambda=bc_teacher_lambda, use_qfilter=use_qfilter) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Prepare everything. sess = U.get_session() agent.initialize(sess) # -------------------------------------------------------------------------- # Daniel: similar as PPO2 code as `agent` is similar to `model` but has to # be initialized explicitly above. Must call after `agent.load` gets # created. Not sure if this works with parameter space noise or with # normalization, but I don't plan to resume training (for now). It also has # to be *before* the `graph.finalize()` because otherwise we get an error. # -------------------------------------------------------------------------- if load_path is not None: logger.info("\nInside ddpg, loading model from: {}".format(load_path)) agent.load(load_path) # -------------------------------------------------------------------------- sess.graph.finalize() agent.reset() # -------------------------------------------------------------------------- # Daniel: populate replay buffer, followed by pretraining stage. # But if load_path is not None, then doesn't make sense -- we want to load. # We also don't need to do this if timesteps is 0 (e.g., for playing policy). 
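# Hedged sketch: stock baselines NormalActionNoise draws a single action-sized
# sample, but the call above passes shape=(nenvs, nb_actions), which implies a
# modified class that returns one noise row per parallel env. A minimal version
# of that idea might look like the following; this is a guess at the intent,
# not the project's actual implementation.

import numpy as np

class BatchedNormalActionNoise:
    def __init__(self, mu, sigma, shape):
        self.mu = mu          # (nb_actions,)
        self.sigma = sigma    # (nb_actions,)
        self.shape = shape    # (nenvs, nb_actions)

    def __call__(self):
        # Broadcast mu/sigma across the env dimension.
        return self.mu + self.sigma * np.random.standard_normal(self.shape)

    def reset(self):
        pass  # Gaussian noise is memoryless; OU noise would reset its state here.

noise = BatchedNormalActionNoise(mu=np.zeros(3), sigma=0.1 * np.ones(3), shape=(2, 3))
print(noise().shape)  # (2, 3): one noise vector per parallel env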
# -------------------------------------------------------------------------- if total_timesteps == 0: return agent assert seed == 1500, 'We normally want seed 1500, yet: {}'.format(seed) if (demos_path is not None and load_path is None): _ddpg_demos(demos_path, agent, memory) assert memory.nb_entries == memory.nb_teach_entries, memory.nb_entries checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) # Pretrain, based on their training code for some # of minibatches. pt_actor_losses = [] pt_critic_losses = [] batches_per_ep = int(memory.nb_entries / batch_size) logger.info( 'Running pre-training for {} epochs'.format(pretrain_epochs)) logger.info(' data size in memory: {}'.format(memory.nb_entries)) logger.info(' each batch: {}, epoch mbs: {}'.format( batch_size, batches_per_ep)) pt_start = time.time() for epoch in range(1, pretrain_epochs + 1): c_losses = [] a_losses = [] for _ in range(batches_per_ep): cl, al = agent.train(during_pretrain=True) agent.update_target_net() c_losses.append(cl) a_losses.append(al) pt_critic_losses.append(np.mean(c_losses)) pt_actor_losses.append(np.mean(a_losses)) # Check and save model occasionally. if epoch == 1 or epoch % 5 == 0: pt_time = (time.time() - pt_start) / 60. logger.info( ' epoch done: {}, loss over past epoch: {:.4f}'.format( str(epoch).zfill(4), pt_actor_losses[-1])) logger.info(' critic loss over past epoch: {:.4f}'.format( pt_critic_losses[-1])) logger.info(' elapsed time: {:.1f}m'.format(pt_time)) savepath = osp.join( checkdir, 'pretrain_epoch_{}'.format(str(epoch).zfill(4))) logger.info('Saving model checkpoint to: ', savepath) agent.save(savepath) pt_time = (time.time() - pt_start) / 60. logger.info('losses a: {}'.format(np.array(pt_actor_losses))) logger.info('losses c: {}'.format(np.array(pt_critic_losses))) logger.info('Finished loading teacher samples + pre-training.') logger.info('Pre-training took {:.1f}m.\n'.format(pt_time)) # -------------------------------------------------------------------------- # Back to their code. For cloth, `env.reset()` takes a while so we put it here. obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] # Daniel: Debugging/sanity checks. _variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) U.display_var_info(_variables) logger.info("\nInside DDPG, about to start epochs") logger.info( "nbatchsize: {}, get this in buffer before DDPG updates".format( nbatchsize)) logger.info(" i.e.: (nenv {}) * (cycles {}) * (nsteps {})".format( nenvs, nb_epoch_cycles, nb_rollout_steps)) logger.info("nb_epochs: {}, number of cycles to use".format(nb_epochs)) logger.info("eval_env None? {}".format(eval_env is None)) logger.info("(end of debugging messages)\n") # File paths. checkdir = osp.join(logger.get_dir(), 'checkpoints') action_dir = osp.join(logger.get_dir(), 'actions') episode_dir = osp.join(logger.get_dir(), 'ep_all_infos') os.makedirs(checkdir, exist_ok=True) os.makedirs(action_dir, exist_ok=True) os.makedirs(episode_dir, exist_ok=True) # Daniel: use these two to store past 100 episode history. Report these stats! eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) all_eval_episode_rewards = [] # reward/step: cumulative quantities for each episode in vecenv. # epoch_{actions,qs} will grow without bound, fyi. 
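# Hedged sketch: the pretraining block above runs batches_per_ep minibatches
# per epoch, where batches_per_ep = int(memory.nb_entries / batch_size), so
# each "pretrain epoch" is roughly one pass over the demonstration data. With
# illustrative numbers (not taken from the original experiments):

demo_transitions = 50_000
batch_size = 64
pretrain_epochs = 50

batches_per_ep = demo_transitions // batch_size            # 781 minibatches per epoch
total_pretrain_updates = batches_per_ep * pretrain_epochs  # 39050 gradient updates
print(batches_per_ep, total_pretrain_updates)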
episode_reward = np.zeros(nenvs, dtype=np.float32) #vector episode_step = np.zeros(nenvs, dtype=int) # vector episodes = 0 #scalar t = 0 # scalar epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): mb_actions = [] mb_epinfos = [] for cycle in range(nb_epoch_cycles): # Perform rollouts. if nenvs > 1: # if simulating multiple envs in parallel, impossible to reset # agent at the end of the episode in each of the environments, # so resetting here instead agent.reset() # Daniel: pure data collection (noise added) to populate replay buffer. # No training until after this, and note the parallel stepping (VecEnv). for t_rollout in range(nb_rollout_steps): # Predict next action. # action: (#_parallel_envs, ac_dim), q: (#_parallel_envs, 1) action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True) # Execute next action. if rank == 0 and render: env.render() # max_action is of dimension A, whereas action is dimension # (nenvs, A) - the multiplication gets broadcasted to the batch # scale for execution in env (as far as DDPG is concerned, # every action is in [-1, 1]) new_obs, r, done, info = env.step(max_action * action) r = r.astype(np.float32) # note these outputs are batched from vecenv (first dim = batch). t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. (Daniel: agent.train() doesn't require these two lists) epoch_actions.append(action) epoch_qs.append(q) # Daniel: Same as PPO2 code. mb_actions.append(action) for inf in info: maybeepinfo = inf.get('episode') if maybeepinfo: mb_epinfos.append(inf) #the batched data will be unrolled in memory.py's append. agent.store_transition(obs, action, r, new_obs, done) obs = new_obs for d in range(len(done)): if done[d]: # Episode done. epoch_episode_rewards.append( episode_reward[d]) # Entire history episode_rewards_history.append( episode_reward[d]) # Last 100 only epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 if nenvs == 1: agent.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. (Daniel: note that no noise is applied here.) # Also it seems like episodes do not naturally reset before this starts? # Also, unlike epoch_episode_reward, here we create eval_episode_reward here ... 
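# Hedged sketch of the broadcasting noted in the rollout comments above:
# max_action has shape (A,) while the batched policy output has shape
# (nenvs, A), so max_action * action rescales every row. Tiny numpy check
# (values are illustrative):

import numpy as np

max_action = np.array([2.0, 0.5])        # per-dimension action bound, shape (A,)
action = np.array([[1.0, -1.0],
                   [0.5,  0.2]])         # policy output in [-1, 1], shape (nenvs, A)
print(max_action * action)
# [[ 2.  -0.5]
#  [ 1.   0.1]]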
eval_episode_rewards = [] eval_qs = [] if eval_env is not None: logger.info('Now on the eval_env for {} steps...'.format( nb_eval_steps)) nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) # scale for execution in env (for DDPG, every action is in [-1, 1]) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append( eval_episode_reward[d]) all_eval_episode_rewards.append( eval_episode_reward[d]) eval_episode_reward[ d] = 0.0 # Daniel: reset for next episode. if MPI is not None: mpi_size = MPI.COMM_WORLD.Get_size() else: mpi_size = 1 # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['memory/nb_entries'] = memory.nb_entries combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. (Daniel: use eval/return_history for plots) if eval_env is not None: combined_stats['eval/return'] = np.mean(eval_episode_rewards) combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = np.array( [np.array(x).flatten()[0] for x in combined_stats.values()]) if MPI is not None: combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps_per_env'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(osp.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(osp.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) # Daniel: arguable, we can save all episodes but hard if we don't know the steps. 
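# Hedged sketch of the MPI averaging pattern used in the logging above: each
# worker flattens its stats to a vector, the vectors are summed across ranks
# with allreduce, and the result is divided by the number of workers. The
# MPI=None fallback mirrors the rank/mpi_size handling in the surrounding
# code; this is a simplified illustration, not the exact logging code.

import numpy as np
try:
    from mpi4py import MPI
except ImportError:
    MPI = None

def average_stats_across_workers(stats):
    """stats: dict mapping name -> scalar on this worker."""
    keys = sorted(stats.keys())
    local = np.array([float(stats[k]) for k in keys])
    if MPI is not None:
        total = MPI.COMM_WORLD.allreduce(local)  # elementwise sum over ranks
        size = MPI.COMM_WORLD.Get_size()
    else:
        total, size = local, 1
    return {k: v / size for k, v in zip(keys, total)}

print(average_stats_across_workers({'rollout/return': 10.0, 'train/loss_actor': 0.5}))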
#if eval_env: # with open(os.path.join(logdir, 'all_eval_episode_rewards.pkl'), 'wb') as f: # pickle.dump(all_eval_episode_rewards, f) # Daniel: we can use cycle or epoch for this if condition ... kind of annoying but w/e. if cycle % save_interval == 0: logger.info('We are now saving stuff!!') savepath = osp.join(checkdir, '%.5i' % epoch) logger.info('Saving model checkpoint to: ', savepath) agent.save(savepath) # ------------------------------------------------------------------ # Daniel: extra stuff for debugging PPO on cloth, actions and infos for each episode. mb_actions = _sf01(np.asarray(mb_actions)) act_savepath = osp.join(action_dir, 'actions_%.5i.pkl' % epoch) epi_savepath = osp.join(episode_dir, 'infos_%.5i.pkl' % epoch) with open(act_savepath, 'wb') as fh: pickle.dump(mb_actions, fh) with open(epi_savepath, 'wb') as fh: pickle.dump(mb_epinfos, fh) # Daniel: we were not resetting earlier. Actually there are other # epoch_stats which we might consider resetting here? epoch_episodes = 0 return agent
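# Hedged sketch: _sf01 is used above to reshape mb_actions before pickling,
# but its body is not shown in this file. In the PPO2-style code this mirrors,
# the helper swaps the (time, env) axes and flattens them, turning
# (nsteps, nenvs, act_dim) into (nsteps * nenvs, act_dim). A plausible
# stand-in, for illustration only:

import numpy as np

def sf01(arr):
    """Swap axes 0 and 1 and flatten them into one batch axis."""
    s = arr.shape
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])

mb_actions = np.zeros((100, 4, 6))  # (nsteps, nenvs, act_dim), illustrative sizes
print(sf01(mb_actions).shape)       # (400, 6)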