def __init__(self, config, action_bound, obs_bound):
    super().__init__(config=config)
    self.obs_dim = self.config.config_dict['STATE_SPACE']
    self.obs_dim = self.obs_dim[0] + 1
    self.act_dim = self.config.config_dict['ACTION_SPACE'][0]
    with tf.variable_scope(name_or_scope=self.config.config_dict['NAME']):
        self.scaler = Scaler(self.obs_dim)
        self.val_func = NNValueFunction(
            self.obs_dim,
            hid1_mult=self.config.config_dict['HIDDEN_MULTIPLE'],
            name_scope=self.config.config_dict['NAME'])
        self.policy = Policy(
            self.obs_dim,
            self.act_dim,
            kl_targ=self.config.config_dict['KL_TARG'],
            hid1_mult=self.config.config_dict['HIDDEN_MULTIPLE'],
            policy_logvar=self.config.config_dict['POLICY_LOGVAR'],
            name_scope=self.config.config_dict['NAME'])
    self._real_trajectories = {
        'observes': [],
        'actions': [],
        'rewards': [],
        'unscaled_obs': []
    }
    self._cyber_trajectories = {
        'observes': [],
        'actions': [],
        'rewards': [],
        'unscaled_obs': []
    }
    self._real_trajectories_memory = deque(
        maxlen=self.config.config_dict['EPISODE_REAL_MEMORY_SIZE'])
    self._cyber_trajectories_memory = deque(
        maxlen=self.config.config_dict['EPISODE_CYBER_MEMORY_SIZE'])
    self._real_step_count = 0.0
    self._cyber_step_count = 0.0
    self.action_low = action_bound[0]
    self.action_high = action_bound[1]
    self._env_status = None
    self.real_data_memory = Memory(
        limit=10000,
        action_shape=self.config.config_dict['ACTION_SPACE'],
        observation_shape=self.config.config_dict['STATE_SPACE'])
    self.simulation_data_memory = Memory(
        limit=10000,
        action_shape=self.config.config_dict['ACTION_SPACE'],
        observation_shape=self.config.config_dict['STATE_SPACE'])
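
# The Scaler created above normalizes observations with running statistics.
# A minimal sketch of that idea (a hypothetical stand-in, not the project's
# actual Scaler class): it tracks a running mean/variance over batches and
# returns (scale, offset) pairs for normalizing new observations.
import numpy as np

class RunningScaler:
    def __init__(self, obs_dim):
        self.means = np.zeros(obs_dim)
        self.vars = np.zeros(obs_dim)
        self.count = 0

    def update(self, x):
        """Update running statistics with a batch of observations, shape (n, obs_dim)."""
        n = x.shape[0]
        if self.count == 0:
            self.means = x.mean(axis=0)
            self.vars = x.var(axis=0)
        else:
            total = self.count + n
            new_mean = (self.means * self.count + x.sum(axis=0)) / total
            # Combine the variances of the two groups around the new mean.
            self.vars = (self.count * (self.vars + np.square(self.means - new_mean)) +
                         n * (x.var(axis=0) + np.square(x.mean(axis=0) - new_mean))) / total
            self.means = new_mean
        self.count += n

    def get(self):
        # scale = 1/(3*std) keeps roughly 99.7% of scaled values within [-1, 1].
        return 1.0 / (np.sqrt(self.vars) + 0.1) / 3.0, self.means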
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    # TODO: Change back to 1e6
    memory = Memory(limit=int(1e2),
                    state_shape=env.state_space.shape,
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    kwargs.pop('state_shape')
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
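
# The noise_type parsing block above recurs in almost every script in this
# collection. A minimal standalone sketch of the same convention
# ("kind_stddev" specs, comma-separated), returning plain dicts instead of
# the baselines noise objects (parse_noise_spec is an illustrative helper,
# not part of baselines):
import numpy as np

def parse_noise_spec(noise_type, nb_actions):
    action_noise, param_noise = None, None
    for spec in noise_type.split(','):
        spec = spec.strip()
        if spec == 'none':
            continue
        kind, stddev = spec.split('_')
        stddev = float(stddev)
        if kind == 'adaptive-param':
            param_noise = {'initial_stddev': stddev, 'desired_action_stddev': stddev}
        elif kind in ('normal', 'ou'):
            action_noise = {'kind': kind, 'mu': np.zeros(nb_actions),
                            'sigma': stddev * np.ones(nb_actions)}
        else:
            raise RuntimeError('unknown noise type "{}"'.format(spec))
    return action_noise, param_noise

# e.g. parse_noise_spec('adaptive-param_0.2,ou_0.3', nb_actions=4)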
def train(self, env, nb_steps):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.2,
                                         desired_action_stddev=0.2)

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=True)
    actor = Actor(nb_actions, layer_norm=True)

    # Seed everything to make things reproducible.
    seed = self.seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()

    # load_state("D:\\project\\osim-rl-helper\\ddpg.pkl")
    training.train(env=env, param_noise=param_noise, restore=True,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, nb_epochs=1, nb_epoch_cycles=1,
                   render_eval=False, reward_scale=1.0, render=False,
                   normalize_returns=False, normalize_observations=True,
                   critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3,
                   popart=False, gamma=0.99, clip_norm=None,
                   nb_train_steps=nb_steps, nb_rollout_steps=5,
                   nb_eval_steps=5, batch_size=64)
    # save_state("D:\\project\\osim-rl-helper\\ddpg.pkl")
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def __init__(self, config, action_bound):
    super(DQNModel, self).__init__(config=config)
    self.proposed_action_list = []
    self.action_bound = action_bound
    action_list = []
    for i in range(len(action_bound[0])):
        low = action_bound[0][i]
        high = action_bound[1][i]
        action_list.append(
            np.arange(start=low, stop=high,
                      step=(high - low) / self.config.config_dict['ACTION_SPLIT_COUNT']))
    action_iterator = itertools.product(*action_list)
    self.action_selection_list = []
    for action_sample in action_iterator:
        # Cast to float32 so the constants can be concatenated with the
        # float32 state placeholder below.
        self.action_selection_list.append(tf.constant(action_sample, dtype=tf.float32))
    self.reward_input = tf.placeholder(shape=[None, 1], dtype=tf.float32)
    self.state_input = tf.placeholder(
        shape=[None] + list(self.config.config_dict['STATE_SPACE']), dtype=tf.float32)
    self.next_state_input = tf.placeholder(
        shape=[None] + list(self.config.config_dict['STATE_SPACE']), dtype=tf.float32)
    self.action_input = tf.placeholder(
        shape=[None] + list(self.config.config_dict['ACTION_SPACE']), dtype=tf.float32)
    self.done_input = tf.placeholder(shape=[None, 1], dtype=tf.bool)
    # tf.concat needs a values list and an explicit axis.
    self.input = tf.concat([self.state_input, self.action_input], axis=1)
    self.done = tf.cast(self.done_input, dtype=tf.float32)

    def _with_action(state, action_sample):
        # Tile the constant action across the batch so it can be
        # concatenated with the (batch, state_dim) state tensor.
        batch = tf.shape(state)[0]
        tiled = tf.tile(tf.reshape(action_sample, [1, -1]), [batch, 1])
        return tf.concat([state, tiled], axis=1)

    self.q_value_list = []
    var_list = None
    for action_sample in self.action_selection_list:
        q_net, q_output, var_list = NetworkCreator.create_network(
            input=_with_action(self.state_input, action_sample),
            network_config=self.config.config_dict['NET_CONFIG'],
            net_name=self.config.config_dict['NAME'])
        self.q_value_list.append(q_output)
    self.var_list = var_list

    self.target_q_value_list = []
    for action_sample in self.action_selection_list:
        q_net, q_output, var_list = NetworkCreator.create_network(
            input=_with_action(self.next_state_input, action_sample),
            network_config=self.config.config_dict['NET_CONFIG'],
            net_name='TARGET' + self.config.config_dict['NAME'])
        # The original appended the Q output to target_var_list by mistake.
        self.target_q_value_list.append(q_output)
    self.target_var_list = var_list

    self.loss, self.optimizer, self.optimize = self.create_training_method()
    self.update_target_q_op = self.create_target_q_update()
    # Memory expects an integer limit; the original passed the float 1e100,
    # which cannot back a preallocated buffer.
    self.memory = Memory(limit=int(1e6),
                         action_shape=self.config.config_dict['ACTION_SPACE'],
                         observation_shape=self.config.config_dict['STATE_SPACE'])
    self.sess = tf.get_default_session()
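
# What the constructor above does with itertools.product, in isolation:
# split each continuous action dimension into a grid and enumerate every
# joint action, turning a continuous control problem into a discrete one
# for DQN. A minimal numpy sketch (discretize_action_space is an
# illustrative helper; split_count mirrors ACTION_SPLIT_COUNT above):
import itertools
import numpy as np

def discretize_action_space(low, high, split_count):
    per_dim = [np.arange(start=lo, stop=hi, step=(hi - lo) / split_count)
               for lo, hi in zip(low, high)]
    return [np.array(a) for a in itertools.product(*per_dim)]

actions = discretize_action_space(low=[-1.0, 0.0], high=[1.0, 2.0], split_count=4)
print(len(actions))  # 4 ** 2 = 16 joint actions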
def run_baselines(env, seed, log_dir):
    '''Create baselines model and training.

    Replace the ddpg and its training with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path to the progress csv file.
    '''
    rank = MPI.COMM_WORLD.Get_rank()
    seed = seed + 1000000 * rank
    set_global_seeds(seed)
    env.seed(seed)

    # Set up logger for baselines
    configure(dir=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    baselines_logger.info('rank {}: seed={}, logdir={}'.format(
        rank, seed, baselines_logger.get_dir()))

    # Set up params for baselines ddpg
    nb_actions = env.action_space.shape[-1]
    layer_norm = False
    action_noise = OrnsteinUhlenbeckActionNoise(
        mu=np.zeros(nb_actions),
        sigma=float(params['sigma']) * np.ones(nb_actions))
    memory = Memory(limit=params['replay_buffer_size'],
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    training.train(env=env, eval_env=None, param_noise=None,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, nb_epochs=params['n_epochs'],
                   nb_epoch_cycles=params['n_epoch_cycles'], render_eval=False,
                   reward_scale=1., render=False, normalize_returns=False,
                   normalize_observations=False, critic_l2_reg=0,
                   actor_lr=params['policy_lr'], critic_lr=params['qf_lr'],
                   popart=False, gamma=params['discount'], clip_norm=None,
                   nb_train_steps=params['n_train_steps'],
                   nb_rollout_steps=params['n_rollout_steps'],
                   nb_eval_steps=100, batch_size=64)

    return osp.join(log_dir, 'progress.csv')
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    logger.configure(dir='/home/vaisakhs_shaj/Desktop/DeepReinforcementLearning/5_Deep_Deterministic_Policy_Gradients/LOGS/OSIM')

    # Create envs.
    env = ProstheticsEnv(visualize=True)
    env.change_model(model='2D', difficulty=0, prosthetic=True, seed=seed)
    # env.seed(seed)
    # env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(2e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 2000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def setup(self, obs_shape, nb_actions, action_spec, noise_type,
          gamma=1., tau=0.01, layer_norm=True):
    super(DDPGAgent, self).setup(obs_shape, nb_actions, action_spec,
                                 noise_type, gamma, tau, layer_norm)
    self.action_spec_internal = action_spec
    self.obs_dim = obs_shape
    action_noise = None
    param_noise = None

    # Parse noise_type
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    self.memory = Memory(limit=int(500), action_shape=(nb_actions, ),
                         observation_shape=obs_shape)
    self.critic = Critic(layer_norm=layer_norm, hidden_size=128)
    self.actor = Actor(nb_actions, layer_norm=layer_norm, hidden_size=128)

    tf.reset_default_graph()
    # max_action = env.action_space.high
    self.ddpg = DDPG(actor=self.actor, critic=self.critic, memory=self.memory,
                     observation_shape=obs_shape, action_shape=(nb_actions, ),
                     gamma=gamma, tau=tau, action_noise=action_noise,
                     param_noise=param_noise)
def __init__(self, limit, env):
    self.limit = limit
    self.env = env
    self.memory = Memory(limit=self.limit,
                         action_shape=self.env.action_space.shape,
                         observation_shape=self.env.observation_space.shape)
    self.file_dir = None
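
# The baselines Memory used throughout these snippets is essentially a
# fixed-capacity ring buffer over (obs0, action, reward, obs1, terminal)
# tuples. A minimal self-contained sketch of that idea (an illustration,
# not the baselines implementation):
import numpy as np

class RingReplayBuffer:
    def __init__(self, limit, observation_shape, action_shape):
        self.limit = limit
        self.next_idx = 0
        self.size = 0
        self.obs0 = np.zeros((limit,) + observation_shape, dtype=np.float32)
        self.actions = np.zeros((limit,) + action_shape, dtype=np.float32)
        self.rewards = np.zeros((limit, 1), dtype=np.float32)
        self.obs1 = np.zeros((limit,) + observation_shape, dtype=np.float32)
        self.terminals = np.zeros((limit, 1), dtype=np.float32)

    def append(self, obs0, action, reward, obs1, terminal):
        i = self.next_idx
        self.obs0[i], self.actions[i], self.rewards[i] = obs0, action, reward
        self.obs1[i], self.terminals[i] = obs1, float(terminal)
        self.next_idx = (i + 1) % self.limit  # overwrite the oldest entry when full
        self.size = min(self.size + 1, self.limit)

    def sample(self, batch_size):
        idx = np.random.randint(0, self.size, size=batch_size)
        return {'obs0': self.obs0[idx], 'actions': self.actions[idx],
                'rewards': self.rewards[idx], 'obs1': self.obs1[idx],
                'terminals1': self.terminals[idx]}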
def run(cfg, seed, noise_type, layer_norm, evaluation, architecture, **kwargs):
    if MPI.COMM_WORLD.Get_rank() == 0:
        dir_path = os.path.dirname(os.path.realpath(__file__))
        logger.configure(dir_path, ['stdout'])

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = GRLEnv(cfg)
    gym.logger.setLevel(logging.WARN)
    env = MyMonitor(env, os.path.join(logger.get_dir(), kwargs['output']))

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            # This variant expects "ou_<stddev>_<theta>".
            _, stddev, theta = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        dt=0.03,
                                                        sigma=float(stddev) * np.ones(nb_actions),
                                                        theta=float(theta) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = MyCritic(layer_norm=layer_norm, architecture=architecture)
    actor = MyActor(nb_actions, layer_norm=layer_norm, architecture=architecture)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, param_noise=param_noise, action_noise=action_noise,
                   actor=actor, critic=critic, memory=memory, **kwargs)
    env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def train_ddpg(env, N_episodes):
    param_noise = None
    stddev = 0.2       # assumed default; `stddev` was undefined in the original
    layer_norm = True  # assumed default; `layer_norm` was undefined in the original
    nb_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                sigma=float(stddev) * np.ones(nb_actions))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)
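
# OrnsteinUhlenbeckActionNoise, as used above, integrates the SDE
# dx = theta * (mu - x) dt + sigma dW, which yields temporally correlated
# exploration noise. A minimal sketch of the same recurrence (OUNoise is an
# illustrative class, written to match the behavior of the baselines one):
import numpy as np

class OUNoise:
    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.reset()

    def reset(self):
        self.x_prev = np.zeros_like(self.mu)

    def __call__(self):
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

noise = OUNoise(mu=np.zeros(2), sigma=0.2 * np.ones(2))
sample = noise()  # added to the deterministic action before env.step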
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    param_noise = None
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    # env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    nb_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                sigma=np.ones(nb_actions))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    actor = Actor(nb_actions, layer_norm=layer_norm)
    critic = Critic(layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, action_noise=action_noise, actor=actor,
                   critic=critic, memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def main():
    args = parse_args()

    # create the environment
    env = gym.make("kuka-v0")  # <-- this we need to create
    env.init_bullet(render=True)

    # create the learning agent
    # model = deepq.models.mlp([16, 16])
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n  # note: only valid for discrete action spaces;
                                 # used solely by the commented-out acktr code below

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    noise_type = 'adaptive-param_0.2'
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # policy = GaussianMlpPolicy(ob_dim, ac_dim)
    # vf = NeuralNetValueFunction(ob_dim, ac_dim)
    # learn(env, policy=policy, vf=vf,
    #       gamma=0.99, lam=0.97, timesteps_per_batch=2500,
    #       desired_kl=0.002,
    #       num_timesteps=1000, animate=False)

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=False)
    actor = Actor(nb_actions, layer_norm=False)

    training.train(env=env, param_noise=param_noise, action_noise=action_noise,
                   actor=actor, critic=critic, memory=memory,
                   **args)  # assumes parse_args() returns a dict; use **vars(args)
                            # if it returns an argparse.Namespace
    env.close()
def __init__(self, hyperparams, dX, dU):
    """Initializes the policy.

    Args:
        hyperparams: Dictionary of hyperparameters.
        dX: Dimension of state space.
        dU: Dimension of action space.
    """
    PolicyOpt.__init__(self, hyperparams, dX, dU)
    self.dX = dX
    self.dU = dU
    self.epochs = hyperparams['epochs']
    self.param_noise_adaption_interval = hyperparams['param_noise_adaption_interval']
    set_global_seeds(hyperparams['seed'])

    # Initialize DDPG policy
    self.pol = DDPG(Actor(dU, network=hyperparams['network'],
                          **hyperparams['network_kwargs']),
                    Critic(network=hyperparams['network'],
                           **hyperparams['network_kwargs']),
                    Memory(limit=hyperparams['memory_limit'],
                           action_shape=(dU, ),
                           observation_shape=(dX, )),
                    observation_shape=(dX, ),
                    action_shape=(dU, ),
                    param_noise=AdaptiveParamNoiseSpec(
                        initial_stddev=0.2, desired_action_stddev=0.2),
                    **hyperparams['ddpg_kwargs'])
    sess = get_session()
    self.pol.initialize(sess)
    sess.graph.finalize()

    self.policy = self  # act method is contained in this class
dataPrimary = pd.read_csv("data_p/monitorChange.csv")
dataProgress = pd.read_csv("data_p/progress.csv")

action_shape = (1, )
nb_action = 1
observation_shape = (3, )
t_train_time = 10000
t_test_time = 10000
network = 'mlp'
action_noise = None
param_noise = None
popart = False  # the trailing comma in the original made this a one-element tuple
# load_path = 'ddpg_model'
load_path = None

memory = Memory(limit=int(1e6), action_shape=action_shape,
                observation_shape=observation_shape)
critic = Critic(network=network)
actor = Actor(nb_action, network=network)

agent = DDPG(actor, critic, memory, observation_shape, action_shape,
             gamma=0.99, tau=0.01, normalize_returns=False,
             normalize_observations=True, batch_size=32,
             action_noise=action_noise,
             param_noise=param_noise)  # original snippet was truncated here;
                                       # closing the call with the param_noise
                                       # defined above
def run(env_id, seed, noise_type, layer_norm, evaluation, perform,
        use_expert, expert_dir, use_trpo_expert, expert_limit, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    gym.logger.setLevel(logging.WARN)

    if evaluation and perform:
        perform = False
    if evaluation and rank == 0 or perform:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        # env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    if use_expert:
        expert = Expert(limit=expert_limit, env=env)
        if expert_dir is None:
            expert_dir = os.path.join('./expert', env.env.spec.id) + '/expert.pkl'
        expert.load_file(expert_dir)
    elif use_trpo_expert:
        assert expert_dir is not None
        expert = Expert(limit=expert_limit, env=env)
        expert.load_file_trpo(expert_dir)
    else:
        expert = None

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, perform=perform, expert=expert, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def run(env_id, seed, noise_type, num_cpu, layer_norm, logdir, gym_monitor,
        evaluation, bind_to_core, **kwargs):
    kwargs['logdir'] = logdir
    whoami = mpi_fork(num_cpu, bind_to_core=bind_to_core)
    if whoami == 'parent':
        sys.exit(0)

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Write to temp directory for all non-master workers.
        actual_dir = None
        Logger.CURRENT.close()
        Logger.CURRENT = Logger(dir=mkdtemp(), output_formats=[])
        logger.set_level(logger.DISABLED)

    # Create envs.
    if rank == 0:
        env = gym.make(env_id)
        if gym_monitor and logdir:
            env = gym.wrappers.Monitor(env, os.path.join(logdir, 'gym_train'), force=True)
        env = SimpleMonitor(env)

        if evaluation:
            eval_env = gym.make(env_id)
            if gym_monitor and logdir:
                eval_env = gym.wrappers.Monitor(eval_env, os.path.join(logdir, 'gym_eval'), force=True)
            eval_env = SimpleMonitor(eval_env)
        else:
            eval_env = None
    else:
        env = gym.make(env_id)
        if evaluation:
            eval_env = gym.make(env_id)
        else:
            eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    Logger.CURRENT.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    if env_id == 'navigate':
        env = NavigateEnv(use_camera=False, continuous_actions=True,
                          neg_reward=False, max_steps=500)
    elif env_id == 'toy':
        # env = continuous_gridworld.ContinuousGridworld('', max_steps=1000,
        #     obstacle_mode=continuous_gridworld.NO_OBJECTS)
        from toy_environment import room_obstacle_list
        env = gridworld.Gridworld(room_obstacle_list.obstacle_list, step_size=0.2)
    elif env_id == 'arm2pos':
        env = Arm2PosEnv(continuous=True, max_steps=500, neg_reward=False)
    elif env_id == 'pick-and-place':
        env = PickAndPlaceEnv(max_steps=500)
    else:
        env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    # env = gym.wrappers.Monitor(env, '/tmp/ddpg/', force=True)
    gym.logger.setLevel(logging.WARN)

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    del kwargs['tb_dir']
    del kwargs['save_path']
    hindsight_mode = kwargs['hindsight_mode']
    del kwargs['hindsight_mode']
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, hindsight_mode=hindsight_mode, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def run(self):
    """Override Process.run()"""
    # Create environment
    env = create_environment(
        action_repeat=self.action_repeat,
        full=self.full,
        exclude_centering_frame=self.exclude_centering_frame,
        visualize=self.visualize,
        fail_reward=self.fail_reward,
        integrator_accuracy=self.integrator_accuracy)
    nb_actions = env.action_space.shape[-1]

    # keep track of the number of trajectories done
    num_traj = 0

    env.seed(os.getpid())
    set_global_seeds(os.getpid())

    # Create OU Noise
    action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                sigma=0.2, theta=0.1)

    # Allocate ReplayBuffer
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)

    # Create DDPG agent
    agent = DDPG(self.actor, self.critic, memory,
                 env.observation_space.shape, env.action_space.shape,
                 gamma=self.gamma, tau=self.tau,
                 normalize_returns=self.normalize_returns,
                 normalize_observations=self.normalize_observations,
                 batch_size=self.batch_size, action_noise=action_noise,
                 param_noise=self.param_noise, critic_l2_reg=self.critic_l2_reg,
                 enable_popart=self.popart, clip_norm=self.clip_norm,
                 reward_scale=self.reward_scale)

    # Build the sampling logic fn
    sampling_fn = make_sampling_fn(agent, env, self.episode_length,
                                   self.action_repeat, self.max_action,
                                   self.nb_episodes, self.action_noise_prob)

    # Start TF session
    with U.single_threaded_session() as sess:
        agent.initialize(sess)
        set_parameters = U.SetFromFlat(self.actor.trainable_vars)
        # Start sampling-worker loop.
        while True:
            # self.event.wait()   # Wait for a new message
            # self.event.clear()  # Upon message receipt, mark as read
            message, actor_ws = self.inputQ.get()  # Pop message
            if message == 'sample':
                # Set weights
                set_parameters(actor_ws)
                # Do sampling
                transitions = sampling_fn()
                self.outputQ.put((self.process_index, transitions))
                # update number of trajectories
                num_traj += self.nb_episodes
                # restore environment if needed
                if num_traj >= self.max_env_traj:
                    env.restore()
                    num_traj = 0
            elif message == 'exit':
                print('[Worker {}] Exiting...'.format(os.getpid()))
                env.close()
                break
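
# The worker loop above follows a simple command-queue protocol: the parent
# pushes ('sample', weights) or ('exit', None), and the worker replies with
# (process_index, transitions). A minimal runnable sketch of just that
# protocol with multiprocessing (all names here are illustrative, not the
# project's actual classes):
import multiprocessing as mp

def worker(process_index, input_q, output_q):
    while True:
        message, payload = input_q.get()  # block until a command arrives
        if message == 'sample':
            transitions = [payload * 2]   # stand-in for a real rollout
            output_q.put((process_index, transitions))
        elif message == 'exit':
            break

if __name__ == '__main__':
    input_q, output_q = mp.Queue(), mp.Queue()
    p = mp.Process(target=worker, args=(0, input_q, output_q))
    p.start()
    input_q.put(('sample', 21))
    print(output_q.get())                 # (0, [42])
    input_q.put(('exit', None))
    p.join()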
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    logging.basicConfig(filename='noGazebo_ddpg.log', level=logging.DEBUG, filemode="w")
    logging.getLogger().addHandler(logging.StreamHandler())

    # Configure logger for the process with rank 0 (the main process).
    # MPI = Message Passing Interface, for parallel computing;
    # rank = process identifier within a group of processes.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Disable logging for rank != 0 to avoid noise.
        logging.debug("I'm MPI worker {} and I guess I just log nothing".format(rank))
        logger.set_level(logger.DISABLED)
        logging.disable(logging.CRITICAL)

    logging.info("***** Starting RL algorithm *****")
    now = datetime.datetime.now()
    logging.info(now.isoformat())

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(env,
                        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                        allow_early_resets=True)
    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[0]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components (initialize memory, critic & actor objects).
    logging.info("action space of env: {}".format(env.action_space))  # e.g. Box(2,)
    logging.info("observation space of env: {}".format(env.observation_space))  # e.g. Box(51200,)
    memory = Memory(limit=int(1e4),
                    action_shape=(env.action_space.shape[0], ),
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Train the RL algorithm
    start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)

    # Training is done
    env.close()
    if eval_env is not None:
        eval_env.close()
    logger.info('total runtime: {}s'.format(time.time() - start_time))
    now = datetime.datetime.now()
    logging.info(now.isoformat())
    logging.info("***** End of RL algorithm *****")
    return True
def __init__(self,
             env,
             gamma,
             total_timesteps,
             network='mlp',
             nb_rollout_steps=100,
             reward_scale=1.0,
             noise_type='adaptive-param_0.2',
             normalize_returns=False,
             normalize_observations=False,
             critic_l2_reg=1e-2,
             actor_lr=1e-4,
             critic_lr=1e-3,
             popart=False,
             clip_norm=None,
             nb_train_steps=50,  # per epoch cycle and MPI worker
             nb_eval_steps=100,
             buffer_size=1000000,
             batch_size=64,  # per MPI worker
             tau=0.01,
             param_noise_adaption_interval=50,
             **network_kwargs):
    # Adjust hyper-parameters for the number of option policies to learn.
    num_options = env.get_number_of_options()
    buffer_size = num_options * buffer_size
    batch_size = num_options * batch_size

    observation_space = env.option_observation_space
    action_space = env.option_action_space
    nb_actions = action_space.shape[-1]
    assert (np.abs(action_space.low) == action_space.high).all()  # we assume symmetric actions.

    memory = Memory(limit=buffer_size, action_shape=action_space.shape,
                    observation_shape=observation_space.shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    max_action = action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(actor, critic, memory, observation_space.shape, action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    sess = U.get_session()
    # Prepare everything.
    agent.initialize(sess)
    sess.graph.finalize()
    agent.reset()

    # Variables that are used during learning
    self.agent = agent
    self.memory = memory
    self.max_action = max_action
    self.batch_size = batch_size
    self.nb_train_steps = nb_train_steps
    self.nb_rollout_steps = nb_rollout_steps
    self.param_noise_adaption_interval = param_noise_adaption_interval
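
# AdaptiveParamNoiseSpec, configured above, grows or shrinks the parameter
# noise so that the perturbation it induces in action space tracks a target.
# A minimal sketch of the adaptation rule (written to match the behavior of
# the baselines class; AdaptiveParamNoise is an illustrative name):
class AdaptiveParamNoise:
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1,
                 adoption_coefficient=1.01):
        self.current_stddev = initial_stddev
        self.desired_action_stddev = desired_action_stddev
        self.adoption_coefficient = adoption_coefficient

    def adapt(self, distance):
        # 'distance' is the measured std of (perturbed - unperturbed) actions.
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adoption_coefficient  # too noisy: shrink
        else:
            self.current_stddev *= self.adoption_coefficient  # too quiet: grow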
def train(self, env_fn, num_timesteps, noise_type, layer_norm, folder,
          load_policy, video_width, video_height, plot_rewards,
          save_every=50, seed=1234, episode_length=1000,
          pi_hid_size=150, pi_num_hid_layers=3,
          render_frames=_render_frames, **kwargs):
    num_cpu = self.workers
    if sys.platform == 'darwin':
        num_cpu //= 2
    config = tf.ConfigProto(
        allow_soft_placement=True,
        intra_op_parallelism_threads=num_cpu,
        inter_op_parallelism_threads=num_cpu)
    if self.gpu_usage is None or self.gpu_usage <= 0.:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    else:
        config.gpu_options.allow_growth = True  # pylint: disable=E1101
        config.gpu_options.per_process_gpu_memory_fraction = self.gpu_usage / self.workers
    tf.Session(config=config).__enter__()

    worker_seed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(worker_seed)
    tf.set_random_seed(worker_seed)
    np.random.seed(worker_seed)
    save_every = max(1, save_every)

    env = env_fn()
    env.seed(worker_seed)

    rank = MPI.COMM_WORLD.Get_rank()
    logger.info('rank {}: seed={}, logdir={}'.format(rank, worker_seed, logger.get_dir()))

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(
            name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=pi_hid_size, num_hid_layers=pi_num_hid_layers)

    env = bench.Monitor(
        env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)),
        allow_early_resets=True)
    gym.logger.setLevel(logging.INFO)

    that = self
    iter_name = 'iters_so_far'
    if self.method == 'sql':
        iter_name = 'epoch'

    # TODO replace with utils.create_callback(...)
    def callback(locals, globals):
        if that.method != "ddpg":
            if load_policy is not None and locals[iter_name] == 0:
                # noinspection PyBroadException
                try:
                    utils.load_state(load_policy)
                    if MPI.COMM_WORLD.Get_rank() == 0:
                        logger.info("Loaded policy network weights from %s." % load_policy)
                        # save TensorFlow summary (contains at least the graph definition)
                except:
                    logger.error("Failed to load policy network weights from %s." % load_policy)
            if MPI.COMM_WORLD.Get_rank() == 0 and locals[iter_name] == 0:
                _ = tf.summary.FileWriter(folder, tf.get_default_graph())
        if MPI.COMM_WORLD.Get_rank() == 0 and locals[iter_name] % save_every == 0:
            print('Saving video and checkpoint for policy at iteration %i...' %
                  locals[iter_name])
            ob = env.reset()
            images = []
            rewards = []
            max_reward = 1.  # if any reward > 1, we have to rescale
            lower_part = video_height // 5
            for i in range(episode_length):
                if that.method == "ddpg":
                    ac, _ = locals['agent'].pi(ob, apply_noise=False, compute_Q=False)
                elif that.method == "sql":
                    ac, _ = locals['policy'].get_action(ob)
                elif isinstance(locals['pi'], GaussianMlpPolicy):
                    ac, _, _ = locals['pi'].act(np.concatenate((ob, ob)))
                else:
                    ac, _ = locals['pi'].act(False, ob)
                ob, rew, new, _ = env.step(ac)
                images.append(render_frames(env))
                if plot_rewards:
                    rewards.append(rew)
                    max_reward = max(rew, max_reward)
                if new:
                    break

            orange = np.array([255, 163, 0])
            red = np.array([255, 0, 0])
            video = []
            width_factor = 1. / episode_length * video_width
            for i, imgs in enumerate(images):
                for img in imgs:
                    img[-lower_part, :10] = orange
                    img[-lower_part, -10:] = orange
                    if episode_length < video_width:
                        p_rew_x = 0
                        for j, r in enumerate(rewards[:i]):
                            rew_x = int(j * width_factor)
                            if r < 0:
                                img[-1:, p_rew_x:rew_x] = red
                            else:
                                rew_y = int(r / max_reward * lower_part)
                                img[-rew_y - 1:, p_rew_x:rew_x] = orange
                            p_rew_x = rew_x
                    else:
                        for j, r in enumerate(rewards[:i]):
                            rew_x = int(j * width_factor)
                            if r < 0:
                                img[-1:, rew_x] = red
                            else:
                                rew_y = int(r / max_reward * lower_part)
                                img[-rew_y - 1:, rew_x] = orange
                video.append(np.hstack(imgs))

            imageio.mimsave(
                os.path.join(folder, "videos", "%s_%s_iteration_%i.mp4" %
                             (that.environment, that.method, locals[iter_name])),
                video, fps=60)
            env.reset()
            if that.method != "ddpg":
                utils.save_state(os.path.join(that.folder, "checkpoints", "%s_%i" %
                                              (that.environment, locals[iter_name])))

    if self.method == "ppo":
        pposgd_simple.learn(
            env, policy_fn,
            max_timesteps=int(num_timesteps),
            timesteps_per_actorbatch=1024,  # 256
            clip_param=0.2,
            entcoeff=0.01,
            optim_epochs=4,
            optim_stepsize=1e-3,  # 1e-3
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',  # 'linear'
            callback=callback)
    elif self.method == "trpo":
        trpo_mpi.learn(
            env, policy_fn,
            max_timesteps=int(num_timesteps),
            timesteps_per_batch=1024,
            max_kl=0.1,  # 0.01
            cg_iters=10,
            cg_damping=0.1,
            gamma=0.99,
            lam=0.98,
            vf_iters=5,
            vf_stepsize=1e-3,
            callback=callback)
    elif self.method == "acktr":
        from algos.acktr import acktr
        with tf.Session(config=tf.ConfigProto()):
            ob_dim = env.observation_space.shape[0]
            ac_dim = env.action_space.shape[0]
            with tf.variable_scope("vf"):
                vf = NeuralNetValueFunction(ob_dim, ac_dim)
            with tf.variable_scope("pi"):
                policy = GaussianMlpPolicy(ob_dim, ac_dim)
            acktr.learn(
                env, pi=policy, vf=vf,
                gamma=0.99, lam=0.97,
                timesteps_per_batch=1024,
                desired_kl=0.01,  # 0.002
                num_timesteps=num_timesteps,
                animate=False,
                callback=callback)
    elif self.method == "ddpg":
        from algos.ddpg import ddpg

        # Parse noise_type
        action_noise = None
        param_noise = None
        nb_actions = env.action_space.shape[-1]
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                from baselines.ddpg.noise import AdaptiveParamNoiseSpec
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                from baselines.ddpg.noise import NormalActionNoise
                action_noise = NormalActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                from baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

        # Configure components.
        memory = Memory(
            limit=int(1e6),
            action_shape=env.action_space.shape,
            observation_shape=env.observation_space.shape)
        critic = Critic(layer_norm=layer_norm)
        actor = Actor(nb_actions, layer_norm=layer_norm)

        ddpg.train(
            env=env, eval_env=None, param_noise=param_noise,
            render=False, render_eval=False, action_noise=action_noise,
            actor=actor, critic=critic, memory=memory,
            callback=callback, **kwargs)
    elif self.method == "sql":
        from softqlearning.algorithms import SQL
        from softqlearning.misc.kernel import adaptive_isotropic_gaussian_kernel
        from softqlearning.misc.utils import timestamp
        from softqlearning.replay_buffers import SimpleReplayBuffer
        from softqlearning.value_functions import NNQFunction
        from softqlearning.policies import StochasticNNPolicy
        from rllab.envs.gym_env import GymEnv

        env = GymEnv(env)
        variant = {
            'seed': [1, 2, 3],
            'policy_lr': 3E-4,
            'qf_lr': 3E-4,
            'discount': 0.99,
            'layer_size': 128,
            'batch_size': 128,
            'max_pool_size': 1E6,
            'n_train_repeat': 1,
            'epoch_length': 1000,
            'snapshot_mode': 'last',
            'snapshot_gap': 100,
        }
        pool = SimpleReplayBuffer(
            env_spec=env.spec,
            max_replay_buffer_size=variant['max_pool_size'],
        )
        base_kwargs = dict(
            min_pool_size=episode_length,
            epoch_length=episode_length,
            n_epochs=num_timesteps,
            max_path_length=episode_length,
            batch_size=variant['batch_size'],
            n_train_repeat=variant['n_train_repeat'],
            eval_render=False,
            eval_n_episodes=1,
            iter_callback=callback,
        )
        qf = NNQFunction(
            env_spec=env.spec,
            hidden_layer_sizes=tuple([pi_hid_size] * pi_num_hid_layers),
        )
        pi_layers = tuple([pi_hid_size] * pi_num_hid_layers)
        policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=pi_layers)
        algorithm = SQL(
            base_kwargs=base_kwargs,
            env=env,
            pool=pool,
            qf=qf,
            policy=policy,
            kernel_fn=adaptive_isotropic_gaussian_kernel,
            kernel_n_particles=32,
            kernel_update_ratio=0.5,
            value_n_particles=16,
            td_target_update_interval=1000,
            qf_lr=variant['qf_lr'],
            policy_lr=variant['policy_lr'],
            discount=variant['discount'],
            reward_scale=1,
            save_full_state=False,
        )
        algorithm.train()
    else:
        print('ERROR: Invalid "method" argument provided.', file=sys.stderr)
    env.close()
def run(env_id, seed, noise_type, layer_norm, evaluation, outdir, no_hyp, **kwargs):
    params = locals()

    # Configure things.
    # rank = MPI.COMM_WORLD.Get_rank()
    # if rank != 0:
    #     logger.set_level(logger.DISABLED)
    rank = 0

    # Create envs.
    env = make_env(env_id)
    weight_file = kwargs.pop('weight_file')
    if not weight_file:
        outdir = exp_utils.prepare_exp_dirs(params, outdir, env_id)
    else:
        outdir = exp_utils.prepare_exp_dirs(params, outdir, env_id, 'eval')
    logger.configure(outdir)
    os.makedirs(outdir, exist_ok=True)
    env = bench.Monitor(env, os.path.join(outdir, "%i.monitor.json" % rank))
    gym.logger.setLevel(logging.WARN)
    logger.info('Output directory:{}, env:{}, no_hyp:{}'.format(outdir, env_id, no_hyp))

    if evaluation:
        eval_env = make_env(env_id)
        eval_env.seed(42)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'),
                                 allow_early_resets=True)
        # env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e5), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    # critic = models.ConvCritic(layer_norm=layer_norm)
    # actor = models.ConvActor(nb_actions, layer_norm=layer_norm, no_hyp=no_hyp)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm, no_hyp=no_hyp)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    # set_global_seeds(seed)
    # env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    if weight_file:
        evaluate(env,
                 nb_episodes=kwargs.get('nb_epochs', 100),
                 reward_scale=kwargs.get('reward_scale'),
                 render=kwargs.get('render'),
                 param_noise=None,
                 action_noise=None,
                 actor=actor,
                 critic=critic,
                 critic_l2_reg=kwargs.get('critic_l2_reg'),
                 memory=memory,
                 weight_file=weight_file)
    else:
        training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                       action_noise=action_noise, actor=actor, critic=critic,
                       memory=memory, outdir=outdir, no_hyp=no_hyp, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def run(env_id, seed, noise_type, layer_norm, evaluation, actor_lr, critic_lr,
        classifier_lr, dropout, rho_W=-4, rho_b=-4, entropy_coeff=1.0,
        g_step=20, timesteps_per_batch=1024, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    # env = gym.make(env_id)
    env = make_env(env_id)
    env = bench.Monitor(env,
                        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                        allow_early_resets=True)
    gym.logger.setLevel(logging.WARN)

    if evaluation and rank == 0:
        # eval_env = gym.make(env_id)
        eval_env = make_env(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    fifomemory = FIFOMemory(limit=int(64))  # TODO: customize choosing of limit
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    if 0 < dropout < 1:
        actor = NoiseDropoutActor(nb_actions, rho_W=rho_W, rho_b=rho_b,
                                  layer_norm=layer_norm, p=dropout)
    else:
        actor = NoiseActor(nb_actions, rho_W=rho_W, rho_b=rho_b, layer_norm=layer_norm)
    classifier = Classifier(layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed_old = seed
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Build callback
    arg = {
        'seed': seed_old,
        'env_id': env_id,
        'noise_type': noise_type,
        'rhoW': rho_W,
        'rhob': rho_b,
        'entropy_coeff': entropy_coeff,
        'actor_lr': actor_lr,
        'critic_lr': critic_lr,
        'classifier_lr': classifier_lr,
        'dropout': dropout,
        'gstep': g_step,
        'timesteps_per_batch': timesteps_per_batch,
    }
    callback = CALLBACK(arg)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    setup_and_learn(env=env, eval_env=eval_env, action_noise=action_noise,
                    actor=actor, critic=critic, classifier=classifier,
                    memory=memory, fifomemory=fifomemory,
                    actor_lr=actor_lr, critic_lr=critic_lr,
                    classifier_lr=classifier_lr, callback=callback,
                    entropy_coeff=entropy_coeff, g_step=g_step,
                    timesteps_per_batch=timesteps_per_batch, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    # env = CartpoleSwingupEnvX()
    env = experiments[env_id]['env_call']()
    if experiments[env_id]['normalize_env']:
        env = normalize(env)
    env = bench.Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    # env = gym.wrappers.Monitor(env, log_dir, video_callable=False, force=True)
    gym.logger.setLevel(logging.WARN)

    if evaluation and rank == 0:
        eval_env = experiments[env_id]['env_call']()
        if experiments[env_id]['normalize_env']:
            eval_env = normalize(eval_env)
        eval_env = bench.Monitor(
            eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(
        limit=int(1e6),
        action_shape=env.action_space.shape,
        observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info(
        'rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    # env.seed(seed)
    # if eval_env is not None:
    #     eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
            # (fragment: this snippet begins inside the 'normal' branch of a
            # noise_type parsing loop)
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    nb_actions = env.action_space.shape[-1]
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    tf.reset_default_graph()
    agent = DDPG(actor, critic, memory, env.observation_space.shape,
                 env.action_space.shape, gamma=gamma, tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
def __init__(self, network, env, gamma=1, tau=0.01, total_timesteps=1e6,
             normalize_observations=True, normalize_returns=False,
             enable_popart=False, noise_type='adaptive-param_0.2',
             clip_norm=None, reward_scale=1., batch_size=128,
             l2_reg_coef=0.2, actor_lr=1e-4, critic_lr=1e-3,
             observation_range=(-5., 5.), action_range=(-1., 1.),
             return_range=(-np.inf, np.inf), **network_kwargs):
    # logger.info('Using agent with the following configuration:')
    # logger.info(str(self.__dict__.items()))

    observation_shape = env.observation_space.shape
    action_shape = env.action_space.shape

    # Inputs.
    self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
    self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
    self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
    self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
    self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
    self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
    self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

    # Parameters.
    self.env = env
    self.gamma = gamma
    self.tau = tau
    self.total_timesteps = total_timesteps
    self.normalize_observations = normalize_observations
    self.normalize_returns = normalize_returns
    self.enable_popart = enable_popart
    self.clip_norm = clip_norm
    self.reward_scale = reward_scale
    self.action_range = action_range
    self.return_range = return_range
    self.observation_range = observation_range
    self.batch_size = batch_size
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr
    self.l2_reg_coef = l2_reg_coef
    self.stats_sample = None

    self.action_noise = None
    self.param_noise = None
    nb_actions = self.env.action_space.shape[-1]
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                self.param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                self.action_noise = NormalActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                self.action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    self.memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                         observation_shape=env.observation_space.shape)
    self.critic = Critic(network=network, **network_kwargs)
    self.actor = Actor(nb_actions, network=network, **network_kwargs)

    # Observation normalization.
    if self.normalize_observations:
        with tf.variable_scope('obs_rms'):
            self.obs_rms = RunningMeanStd(shape=observation_shape)
    else:
        self.obs_rms = None
    normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                       self.observation_range[0],
                                       self.observation_range[1])
    normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                       self.observation_range[0],
                                       self.observation_range[1])

    # Return normalization.
    if self.normalize_returns:
        with tf.variable_scope('ret_rms'):
            self.ret_rms = RunningMeanStd()
    else:
        self.ret_rms = None

    # Create target networks.
    target_actor = copy(self.actor)
    target_actor.name = 'target_actor'
    self.target_actor = target_actor
    target_critic = copy(self.critic)
    target_critic.name = 'target_critic'
    self.target_critic = target_critic

    # Create networks and core TF parts that are shared across setup parts.
    self.actor_tf = self.actor(normalized_obs0)
    self.normalized_critic_tf = self.critic(normalized_obs0, self.actions)
    self.critic_tf = denormalize(
        tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]),
        self.ret_rms)
    self.normalized_critic_with_actor_tf = self.critic(normalized_obs0, self.actor_tf, reuse=True)
    self.critic_with_actor_tf = denormalize(
        tf.clip_by_value(self.normalized_critic_with_actor_tf,
                         self.return_range[0], self.return_range[1]),
        self.ret_rms)
    Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)),
                         self.ret_rms)
    self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

    # Set up parts.
    if self.param_noise is not None:
        self.setup_param_noise(normalized_obs0)
    self.setup_actor_optimizer()
    self.setup_critic_optimizer()
    if self.normalize_returns and self.enable_popart:
        self.setup_popart()
    self.setup_stats()
    self.setup_target_network_updates()

    self.initial_state = None  # recurrent architectures not supported yet
    self.def_path_pre = os.path.dirname(os.path.abspath(__file__)) + '/tmp/'
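# The constructor's final graph node, target_Q = r + (1 - terminal) * gamma * Q(s', mu'(s')),
# is the standard one-step Bellman backup. A small numeric sketch (values are
# illustrative, not from the code above) shows the broadcasting over a batch:

import numpy as np

gamma = 0.99
rewards    = np.array([[1.0], [0.5], [0.0]])   # shape (batch, 1), like self.rewards
terminals1 = np.array([[0.0], [0.0], [1.0]])   # 1.0 flags an episode end
Q_obs1     = np.array([[10.0], [8.0], [5.0]])  # target critic's value at the next state

target_Q = rewards + (1. - terminals1) * gamma * Q_obs1
# -> [[10.9], [8.42], [0.0]]: terminal transitions bootstrap nothing.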
def learn(network, env,
          seed=None,
          total_timesteps=None,
          nb_epochs=None,  # with default settings, perform 1M steps total
          nb_epoch_cycles=20,
          nb_rollout_steps=100,
          reward_scale=1.0,
          render=False,
          render_eval=False,
          noise_type='adaptive-param_0.2',
          normalize_returns=False,
          normalize_observations=True,
          critic_l2_reg=1e-2,
          actor_lr=1e-4,
          critic_lr=1e-3,
          popart=False,
          gamma=0.99,
          clip_norm=None,
          nb_train_steps=50,  # per epoch cycle and MPI worker
          nb_eval_steps=100,
          batch_size=64,  # per MPI worker
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          **network_kwargs):
    set_global_seeds(seed)

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps)
    else:
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()
    nb_actions = env.action_space.shape[-1]
    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.

    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                     desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                            sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise, param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr,
                 enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()

    # Prepare everything.
    agent.initialize(sess)
    sess.graph.finalize()
    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  # scalar
    t = 0  # scalar
    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset agent at the end of
                # the episode in each of the environments, so resetting here instead
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True)

                # Execute next action.
                if rank == 0 and render:
                    env.render()

                # max_action is of dimension A, whereas action is dimension (nenvs, A) -
                # the multiplication gets broadcasted to the batch
                # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                # note these outputs are batched from vecenv
                new_obs, r, done, info = env.step(max_action * action)
                t += 1
                if rank == 0 and render:
                    env.render()
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                # the batched data will be unrolled in memory.py's append.
                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs

                for d in range(len(done)):
                    if done[d]:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward[d])
                        episode_rewards_history.append(episode_reward[d])
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1
                        if nenvs == 1:
                            agent.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)

                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True)
                    # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        mpi_size = MPI.COMM_WORLD.Get_size()

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array([np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                    pickle.dump(eval_env.get_state(), f)

    return agent
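# A minimal usage sketch for learn() (not from the code above). The rollout
# loop reads obs.shape[0] as the number of parallel envs, so a single gym env
# must be wrapped in a vecenv such as baselines' DummyVecEnv:

import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

env = DummyVecEnv([lambda: gym.make('Pendulum-v0')])
agent = learn(network='mlp', env=env, total_timesteps=100000, noise_type='ou_0.2')
env.close()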
def main():
    with U.single_threaded_session() as sess:
        batch_size = 64
        current_noise_type = 'adaptive-param_0.2'
        _, stddev = current_noise_type.split('_')
        param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                             desired_action_stddev=float(stddev))
        param_noise_adaption_interval = 2
        env = gym.make("Pendulum-v0")
        nb_actions = env.action_space.shape[-1]
        layer_norm = True

        # Configure components.
        memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                        observation_shape=env.observation_space.shape)
        critic = Critic(layer_norm=layer_norm)
        actor = Actor(nb_actions, layer_norm=layer_norm)

        # Seed everything to make things reproducible.
        seed = int(1000000 * np.random.rand())
        logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
        tf.set_random_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        env.seed(seed)

        max_action = env.action_space.high
        logger.info('scaling actions by {} before executing in env'.format(max_action))
        agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                     batch_size=batch_size, param_noise=param_noise)
        logger.info('Using agent with the following configuration:')
        logger.info(str(agent.__dict__.items()))

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()
        agent.reset()
        obs = env.reset()

        # Note: each iteration of this loop is one full episode, so t counts
        # episodes here, and episode_rewards holds the per-step rewards of
        # episode t (the "steps"/"episodes" tabular labels below are swapped
        # relative to what they actually record).
        for t in itertools.count():
            episode_rewards = []
            done = False
            while not done:
                env.render()
                # Take action and update exploration to the newest value.
                action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                new_obs, rew, done, _ = env.step(max_action * action)
                # Book-keeping.
                agent.store_transition(obs, action, rew, new_obs, done)
                obs = new_obs
                episode_rewards.append(rew)
                if done:
                    agent.reset()
                    obs = env.reset()

            nb_train_steps = 100
            epoch_adaptive_distances = []
            epoch_critic_losses = []
            epoch_actor_losses = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            if t % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", round(np.mean(episode_rewards), 1))
                logger.record_tabular('train/loss_actor', round(np.mean(epoch_actor_losses)))
                logger.record_tabular('train/loss_critic', round(np.mean(epoch_critic_losses)))
                logger.record_tabular('train/param_noise_distance',
                                      round(np.mean(epoch_adaptive_distances)))
                logger.dump_tabular()
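# For reference, the rule behind agent.adapt_param_noise() and the
# AdaptiveParamNoiseSpec used above: the spec multiplicatively shrinks or grows
# the parameter-perturbation stddev depending on whether the measured
# action-space distance overshoots the target. A sketch of that rule (restated
# from baselines' noise spec, not from the code above; the class name and
# coefficient default are ours):

class AdaptiveParamNoiseSpecSketch:
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.2, coefficient=1.01):
        self.desired_action_stddev = desired_action_stddev
        self.coefficient = coefficient
        self.current_stddev = initial_stddev

    def adapt(self, distance):
        # distance: stddev between perturbed and unperturbed policy actions.
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.coefficient   # actions too noisy: shrink
        else:
            self.current_stddev *= self.coefficient   # actions too tame: grow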
def __init__(self, config, action_bound, obs_bound):
    super(DDPGModelNew, self).__init__(config=config)
    self.action_noise = None
    self.para_noise = None
    if self.config.config_dict['NOISE_FLAG']:
        nb_actions = self.config.config_dict['ACTION_SPACE']
        noise_type = self.config.config_dict['NOISE_TYPE']
        action_noise = None
        param_noise = None
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                     desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                            sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))
        self.action_noise = action_noise
        self.para_noise = param_noise

    actor = Actor(nb_actions=self.config.config_dict['ACTION_SPACE'][0],
                  layer_norm=self.config.config_dict['LAYER_NORM_FLAG'],
                  net_config=self.config.config_dict['ACTOR_LAYER_CONFIG'],
                  action_low=action_bound[0],
                  action_high=action_bound[1])
    critic = Critic(net_config=self.config.config_dict['CRITIC_LAYER_CONFIG'])
    self.real_data_memory = Memory(limit=int(1e5),
                                   action_shape=self.config.config_dict['ACTION_SPACE'],
                                   observation_shape=self.config.config_dict['STATE_SPACE'])
    self.simulation_data_memory = Memory(limit=int(1e5),
                                         action_shape=self.config.config_dict['ACTION_SPACE'],
                                         observation_shape=self.config.config_dict['STATE_SPACE'])
    # TODO deal with obs range
    self.ddpg_model = baseline_ddpg(actor=actor,
                                    critic=critic,
                                    memory=self.real_data_memory,
                                    observation_shape=self.config.config_dict['STATE_SPACE'],
                                    action_shape=self.config.config_dict['ACTION_SPACE'],
                                    param_noise=self.para_noise,
                                    action_noise=self.action_noise,
                                    gamma=self.config.config_dict['GAMMA'],
                                    tau=self.config.config_dict['TAU'],
                                    action_range=action_bound,
                                    return_range=(-np.inf, np.inf),
                                    normalize_observations=False,
                                    actor_lr=self.config.config_dict['ACTOR_LEARNING_RATE'],
                                    critic_lr=self.config.config_dict['CRITIC_LEARNING_RATE'],
                                    critic_l2_reg=self.config.config_dict['CRITIC_L2_REG'],
                                    batch_size=self.config.config_dict['BATCH_SIZE'],
                                    observation_range=(-np.inf, np.inf))
    self.ddpg_model.sess = tf.get_default_session()

    var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    self.var_list = []
    for var in var_list:
        # TODO THIS MAY LEAD TO SOME BUGS IN THE FUTURE
        if 'actor' in var.name or 'critic' in var.name or 'obs' in var.name:
            self.var_list.append(var)
    self.variables_initializer = tf.variables_initializer(var_list=self.var_list)
    self._env_status = None
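# The substring filter above ('actor' in var.name or ...) is flagged by its own
# TODO as fragile: any unrelated variable whose name happens to contain 'obs'
# would be swept up. A sketch of a stricter alternative under TF1 semantics,
# collecting by variable-scope prefix instead (the scope names here are
# assumptions based on the scopes used earlier in this code):

import tensorflow as tf

def vars_in_scopes(scopes):
    collected = []
    for scope in scopes:
        collected.extend(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope))
    return collected

var_list = vars_in_scopes(['actor', 'critic', 'obs_rms'])
init_op = tf.variables_initializer(var_list=var_list)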
def run(self):
    """Override Process.run()"""
    # Create environment
    env = create_environment(action_repeat=self.action_repeat,
                             full=self.full,
                             exclude_centering_frame=self.exclude_centering_frame,
                             visualize=self.visualize,
                             fail_reward=self.fail_reward,
                             integrator_accuracy=self.integrator_accuracy)
    nb_actions = env.action_space.shape[-1]
    env.seed(os.getpid())
    set_global_seeds(os.getpid())
    num_traj = 0

    # Allocate ReplayBuffer
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)

    # Create DDPG agent
    agent = DDPG(self.actor, self.critic, memory,
                 env.observation_space.shape, env.action_space.shape,
                 gamma=self.gamma, tau=self.tau,
                 normalize_returns=self.normalize_returns,
                 normalize_observations=self.normalize_observations,
                 batch_size=self.batch_size,
                 action_noise=None, param_noise=None,
                 critic_l2_reg=self.critic_l2_reg,
                 enable_popart=self.popart,
                 clip_norm=self.clip_norm,
                 reward_scale=self.reward_scale)

    # Build the testing logic fn
    testing_fn = make_testing_fn(agent, env, self.episode_length, self.action_repeat,
                                 self.max_action, self.nb_episodes)

    # Start TF session
    with U.single_threaded_session() as sess:
        agent.initialize(sess)
        set_parameters = U.SetFromFlat(self.actor.trainable_vars)
        # Start sampling-worker loop.
        while True:
            # Pop message
            message, actor_ws, global_step = self.inputQ.get()
            if message == 'test':
                # Set weights
                set_parameters(actor_ws)
                # Do testing
                rewards, step_times, distances, episode_lengths = testing_fn()
                self.outputQ.put((rewards, step_times, distances, episode_lengths, global_step))
                # update number of trajectories
                num_traj += self.nb_episodes
                # restore environment if needed
                if num_traj >= self.max_env_traj:
                    env.restore()
                    num_traj = 0
            elif message == 'exit':
                print('[Worker {}] Exiting...'.format(os.getpid()))
                env.close()
                break
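# The worker above consumes (message, actor_weights, global_step) tuples from
# inputQ and emits (rewards, step_times, distances, episode_lengths, global_step)
# on outputQ. A hypothetical driver sketch ('TestingWorker' stands in for the
# Process subclass whose run() is shown; its real constructor arguments beyond
# the two queues are omitted here, and the weight vector is illustrative):

from multiprocessing import Queue
import numpy as np

inputQ, outputQ = Queue(), Queue()
worker = TestingWorker(inputQ=inputQ, outputQ=outputQ)
worker.start()

actor_ws = np.zeros(1234, dtype=np.float32)   # flattened actor weights; size is illustrative
inputQ.put(('test', actor_ws, 0))             # request one evaluation round at global step 0
rewards, step_times, distances, episode_lengths, step = outputQ.get()

inputQ.put(('exit', None, None))              # ask the worker to shut down
worker.join()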