def _build_agent(self, id, agent_file):
    """Build the agent described by ``agent_file`` for agent slot ``id``.

    Args:
        id: Integer agent index (it is formatted with ``{:d}``).
        agent_file: Agent description file, or the literal string
            ``'none'`` to leave the slot without an agent.

    Returns:
        The object returned by ``AgentBuilder.build_agent``, or ``None``
        when ``agent_file`` is ``'none'``.

    Raises:
        AssertionError: If the builder returns ``None``.
    """
    Logger.print2('Agent {:d}: {}'.format(id, agent_file))
    if agent_file == 'none':
        return None

    agent = AgentBuilder.build_agent(self, id, agent_file)
    # Identity test: the result does not depend on how the agent type
    # implements `!=`.
    assert agent is not None, \
        'Failed to build agent {:d} from: {}'.format(id, agent_file)
    return agent
def save_model(self, out_path):
    """Save the session's variables to ``out_path`` (best effort).

    The save runs with ``self.sess`` and ``self.graph`` installed as the
    defaults. A failure is logged rather than propagated.

    Args:
        out_path: Destination path handed to ``self.saver.save``.
    """
    with self.sess.as_default(), self.graph.as_default():
        try:
            save_path = self.saver.save(self.sess,
                                        out_path,
                                        write_meta_graph=False,
                                        write_state=False)
        except Exception:
            # `save_path` is never bound when save() raises, so report the
            # requested path instead. `Exception` (not a bare except) lets
            # KeyboardInterrupt / SystemExit propagate.
            Logger.print2("Failed to save model to: " + out_path)
        else:
            Logger.print2('Model saved to: ' + save_path)
    return
def _build_nets(self, json_data):
    """Create the input placeholders, actor/critic nets and sampling ops.

    Args:
        json_data: Mapping that must contain ``self.ACTOR_NET_KEY`` and
            ``self.CRITIC_NET_KEY``; ``self.ACTOR_INIT_OUTPUT_SCALE_KEY``
            is optional and defaults to 1.

    Raises:
        AssertionError: If either net key is missing from ``json_data``.
    """
    assert self.ACTOR_NET_KEY in json_data
    assert self.CRITIC_NET_KEY in json_data

    actor_net_name = json_data[self.ACTOR_NET_KEY]
    critic_net_name = json_data[self.CRITIC_NET_KEY]
    if self.ACTOR_INIT_OUTPUT_SCALE_KEY in json_data:
        actor_init_output_scale = json_data[self.ACTOR_INIT_OUTPUT_SCALE_KEY]
    else:
        actor_init_output_scale = 1

    s_size = self.get_state_size()
    g_size = self.get_goal_size()
    a_size = self.get_action_size()

    # setup input tensors
    self.s_tf = tf.placeholder(tf.float32, shape=[None, s_size], name="s")
    self.a_tf = tf.placeholder(tf.float32, shape=[None, a_size], name="a")
    self.tar_val_tf = tf.placeholder(tf.float32, shape=[None], name="tar_val")
    self.adv_tf = tf.placeholder(tf.float32, shape=[None], name="adv")
    # The goal placeholder is left without a static shape when the agent
    # has no goal.
    self.g_tf = tf.placeholder(
        tf.float32,
        shape=([None, g_size] if self.has_goal() else None),
        name="g")
    self.old_logp_tf = tf.placeholder(tf.float32, shape=[None], name="old_logp")
    self.exp_mask_tf = tf.placeholder(tf.float32, shape=[None], name="exp_mask")

    with tf.variable_scope('main'):
        with tf.variable_scope('actor'):
            self.a_mean_tf = self._build_net_actor(actor_net_name,
                                                   actor_init_output_scale)
        with tf.variable_scope('critic'):
            self.critic_tf = self._build_net_critic(critic_net_name)

    # Identity checks against None instead of `!=`, so the test does not
    # go through the tensor type's comparison operators.
    if self.a_mean_tf is not None:
        Logger.print2('Built actor net: ' + actor_net_name)

    if self.critic_tf is not None:
        Logger.print2('Built critic net: ' + critic_net_name)

    # Sampling: noise with std `exp_params_curr.noise` per action
    # dimension, multiplied by the per-sample exploration mask, then scaled
    # by the action normalizer's std and added to the actor mean.
    self.norm_a_std_tf = self.exp_params_curr.noise * tf.ones(a_size)
    norm_a_noise_tf = self.norm_a_std_tf * tf.random_normal(
        shape=tf.shape(self.a_mean_tf))
    norm_a_noise_tf *= tf.expand_dims(self.exp_mask_tf, axis=-1)
    self.sample_a_tf = self.a_mean_tf + norm_a_noise_tf * self.a_norm.std_tf
    self.sample_a_logp_tf = TFUtil.calc_logp_gaussian(
        x_tf=norm_a_noise_tf, mean_tf=None, std_tf=self.norm_a_std_tf)

    return
def build_arg_parser(args):
    """Create an ``ArgParser`` from ``args`` and an optional argument file.

    If the parsed ``arg_file`` option is non-empty, the file is loaded from
    ``<pybullet_data_local data path>/args/<arg_file>``.

    Args:
        args: Argument list passed to ``ArgParser.load_args``.

    Returns:
        The populated ``ArgParser``.

    Raises:
        AssertionError: If the argument file could not be loaded.
    """
    arg_parser = ArgParser()
    arg_parser.load_args(args)

    arg_file = arg_parser.parse_string('arg_file', '')
    if arg_file != '':
        path = pybullet_data_local.getDataPath() + "/args/" + arg_file
        succ = arg_parser.load_file(path)
        Logger.print2(arg_file)
        if not succ:
            # Log and raise explicitly: `assert succ, Logger.print2(...)`
            # used the logger's return value as the assertion message and
            # disappeared entirely under `python -O`.
            msg = 'Failed to load args from: ' + arg_file
            Logger.print2(msg)
            raise AssertionError(msg)

    return arg_parser
def __init__(self, world, id, json_data):
    """Set default hyper-parameters, load overrides and build components.

    Args:
        world: Owning world object; the agent queries ``world.env``.
        id: Index of this agent within the environment.
        json_data: Parameter mapping passed to ``_load_params``.

    Raises:
        AssertionError: If ``_check_action_space()`` is falsy.
    """
    self.world = world
    self.id = id
    self.logger = Logger()
    self._mode = self.Mode.TRAIN

    if not self._check_action_space():
        # Log and raise explicitly so the exception carries the message
        # and the check is not stripped under `python -O`.
        msg = "Invalid action space, got {:s}".format(
            str(self.get_action_space()))
        Logger.print2(msg)
        raise AssertionError(msg)

    self._enable_training = True
    self.path = Path()
    self.iter = 0
    self.start_time = time.time()
    self._update_counter = 0

    # Training defaults; `_load_params` below may override them.
    self.update_period = 1.0  # simulated time (seconds) before each training update
    self.iters_per_update = 1
    self.discount = 0.95
    self.mini_batch_size = 32
    self.replay_buffer_size = 50000
    self.init_samples = 1000
    self.normalizer_samples = np.inf
    self._local_mini_batch_size = self.mini_batch_size  # batch size for each work for multiprocessing
    self._need_normalizer_update = True
    self._total_sample_count = 0

    # Output locations and how often (in iterations) output is produced.
    self._output_dir = ""
    self._int_output_dir = ""
    self.output_iters = 100
    self.int_output_iters = 100

    # Return bookkeeping.
    self.train_return = 0.0
    self.test_episodes = 0
    self.test_episode_count = 0
    self.test_return = 0.0
    self.avg_test_return = 0.0

    # Exploration parameters (begin, end, current).
    self.exp_anneal_samples = 320000
    self.exp_params_beg = ExpParams()
    self.exp_params_end = ExpParams()
    self.exp_params_curr = ExpParams()

    self._load_params(json_data)
    self._build_replay_buffer(self.replay_buffer_size)
    self._build_normalizers()
    self._build_bounds()
    self.reset()

    return
def main():
    """Launch ``DeepMimic_Optimizer.py`` under ``mpiexec``.

    Reads ``num_workers`` (default 1) from the command line and forwards
    every command-line argument unchanged to the optimizer script.

    Raises:
        AssertionError: If ``num_workers`` is not positive.
    """
    # Command line arguments
    args = sys.argv[1:]
    arg_parser = ArgParser()
    arg_parser.load_args(args)

    num_workers = arg_parser.parse_int('num_workers', 1)
    assert (num_workers > 0)

    Logger.print2('Running with {:d} workers'.format(num_workers))

    # Human-readable form, kept only for the log line.
    cmd = 'mpiexec -n {:d} python DeepMimic_Optimizer.py '.format(num_workers)
    cmd += ' '.join(args)
    Logger.print2('cmd: ' + cmd)

    # Run with an argv list and no shell, so arguments containing spaces or
    # shell metacharacters reach the child process exactly as received
    # instead of being re-split or interpreted by a shell.
    argv = ['mpiexec', '-n', str(num_workers), 'python',
            'DeepMimic_Optimizer.py'] + args
    subprocess.call(argv)
    return
def update(self):
    """Fold the samples recorded since the last update into the statistics.

    The pending count, sum and sum of squares are first combined with
    ``MPIUtil.reduce_sum``. Whenever the running count crosses a multiple
    of ``CHECK_SYNC_COUNT``, ``check_synced()`` is verified. If there are
    new samples, ``mean`` and ``mean_sq`` are replaced by the count-weighted
    average of the old and new values, ``std`` is recomputed, and the
    pending accumulators are reset.

    Raises:
        AssertionError: If ``check_synced()`` is falsy.
    """
    new_count = MPIUtil.reduce_sum(self.new_count)
    new_sum = MPIUtil.reduce_sum(self.new_sum)
    new_sum_sq = MPIUtil.reduce_sum(self.new_sum_sq)

    new_total = self.count + new_count
    crossed_checkpoint = (self.count // self.CHECK_SYNC_COUNT !=
                          new_total // self.CHECK_SYNC_COUNT)
    if crossed_checkpoint and not self.check_synced():
        # Log and raise explicitly so the exception carries the message
        # and the check is not stripped under `python -O`.
        msg = 'Normalizer parameters desynchronized'
        Logger.print2(msg)
        raise AssertionError(msg)

    if new_count > 0:
        new_mean = self._process_group_data(new_sum / new_count, self.mean)
        new_mean_sq = self._process_group_data(new_sum_sq / new_count,
                                               self.mean_sq)
        w_old = float(self.count) / new_total
        w_new = float(new_count) / new_total

        self.mean = w_old * self.mean + w_new * new_mean
        self.mean_sq = w_old * self.mean_sq + w_new * new_mean_sq
        self.count = new_total
        self.std = self.calc_std(self.mean, self.mean_sq)

        # Reset the pending accumulators.
        self.new_count = 0
        self.new_sum.fill(0)
        self.new_sum_sq.fill(0)

    return
def _build_nets(self, json_data):
    """Create the input placeholders and the actor and critic networks.

    Args:
        json_data: Mapping that must contain ``self.ACTOR_NET_KEY`` and
            ``self.CRITIC_NET_KEY``; ``self.ACTOR_INIT_OUTPUT_SCALE_KEY``
            is optional and defaults to 1.

    Raises:
        AssertionError: If either net key is missing from ``json_data``.
    """
    assert self.ACTOR_NET_KEY in json_data
    assert self.CRITIC_NET_KEY in json_data

    actor_net_name = json_data[self.ACTOR_NET_KEY]
    critic_net_name = json_data[self.CRITIC_NET_KEY]
    if self.ACTOR_INIT_OUTPUT_SCALE_KEY in json_data:
        actor_init_output_scale = json_data[self.ACTOR_INIT_OUTPUT_SCALE_KEY]
    else:
        actor_init_output_scale = 1

    s_size = self.get_state_size()
    g_size = self.get_goal_size()
    a_size = self.get_action_size()

    # setup input tensors
    self.s_tf = tf.placeholder(tf.float32, shape=[None, s_size], name="s")  # observations
    self.tar_val_tf = tf.placeholder(tf.float32, shape=[None], name="tar_val")  # target values
    self.adv_tf = tf.placeholder(tf.float32, shape=[None], name="adv")  # advantage
    self.a_tf = tf.placeholder(tf.float32, shape=[None, a_size], name="a")  # target actions
    # The goal placeholder is left without a static shape when the agent
    # has no goal.
    self.g_tf = tf.placeholder(
        tf.float32,
        shape=([None, g_size] if self.has_goal() else None),
        name="g")  # goals

    with tf.variable_scope('main'):
        with tf.variable_scope('actor'):
            self.actor_tf = self._build_net_actor(actor_net_name,
                                                  actor_init_output_scale)
        with tf.variable_scope('critic'):
            self.critic_tf = self._build_net_critic(critic_net_name)

    # Identity checks against None instead of `!=`, so the test does not
    # go through the tensor type's comparison operators.
    if self.actor_tf is not None:
        Logger.print2('Built actor net: ' + actor_net_name)

    if self.critic_tf is not None:
        Logger.print2('Built critic net: ' + critic_net_name)

    return
def _update_mode(self):
    """Run the mode-specific update hook for the current ``self._mode``.

    Raises:
        AssertionError: If ``self._mode`` is not TRAIN, TRAIN_END or TEST.
    """
    if self._mode == self.Mode.TRAIN:
        self._update_mode_train()
    elif self._mode == self.Mode.TRAIN_END:
        self._update_mode_train_end()
    elif self._mode == self.Mode.TEST:
        self._update_mode_test()
    else:
        # Log and raise explicitly: `assert False, Logger.print2(...)` used
        # the logger's return value as the message and was stripped under
        # `python -O`, silently ignoring the unknown mode.
        msg = "Unsupported RL agent mode: " + str(self._mode)
        Logger.print2(msg)
        raise AssertionError(msg)
    return
def store(self, path):
    """Store ``path`` in the buffer and return the index of its first slot.

    An empty path, or one whose ``check_vals()`` is falsy, is not stored
    and ``MathUtil.INVALID_IDX`` is returned; in the second case a message
    is logged. A stored path occupies ``pathlength() + 1`` slots.

    Args:
        path: Path object providing ``pathlength()``, ``is_valid()`` and
            ``check_vals()``.

    Returns:
        The first index the path was written to, or
        ``MathUtil.INVALID_IDX`` if nothing was stored.
    """
    invalid_idx = MathUtil.INVALID_IDX
    length = path.pathlength()

    if not length > 0:
        return invalid_idx

    assert path.is_valid()

    if not path.check_vals():
        Logger.print2('Invalid path data value detected')
        return invalid_idx

    # Buffers are allocated on first use, from the first stored path.
    if self.buffers is None:
        self._init_buffers(path)

    slots = self._request_idx(length + 1)
    self._store_path(path, slots)
    self._add_sample_buffers(slots)

    self.num_paths += 1
    self.total_count += length + 1
    return slots[0]
def record(self, x):
    """Accumulate sample(s) ``x`` into the pending statistics.

    ``x`` is reshaped to ``[-1, size]`` and its row count, column sums and
    column sums of squares are added to ``new_count``, ``new_sum`` and
    ``new_sum_sq``.

    Args:
        x: A ``numpy.ndarray`` whose last dimension equals ``get_size()``,
            or a scalar when the size is 1.

    Raises:
        AssertionError: If a scalar is given while the size is not 1, or
            the last dimension of ``x`` does not match the size.
    """
    size = self.get_size()
    if not isinstance(x, np.ndarray):
        assert (size == 1)
        x = np.array([[x]])

    if x.shape[-1] != size:
        # Log and raise explicitly so the exception carries the message
        # and the check is not stripped under `python -O`.
        msg = 'Normalizer shape mismatch, expecting size {:d}, but got {:d}'.format(
            size, x.shape[-1])
        Logger.print2(msg)
        raise AssertionError(msg)

    x = np.reshape(x, [-1, size])

    self.new_count += x.shape[0]
    self.new_sum += np.sum(x, axis=0)
    self.new_sum_sq += np.sum(np.square(x), axis=0)
    return
def end_episode(self):
    """Finish the current episode if its path is non-empty.

    The path is closed with ``_end_path()``. In TRAIN / TRAIN_END mode it
    is stored when training is enabled; in TEST mode it contributes to the
    test return. ``_update_mode()`` is called afterwards. An empty path
    makes this a no-op.

    Raises:
        AssertionError: If ``self._mode`` is not TRAIN, TRAIN_END or TEST.
    """
    if self.path.pathlength() > 0:
        self._end_path()

        if self._mode in (self.Mode.TRAIN, self.Mode.TRAIN_END):
            if self.enable_training and self.path.pathlength() > 0:
                self._store_path(self.path)
        elif self._mode == self.Mode.TEST:
            self._update_test_return(self.path)
        else:
            # Log and raise explicitly so the exception carries the
            # message and the check is not stripped under `python -O`.
            msg = "Unsupported RL agent mode: " + str(self._mode)
            Logger.print2(msg)
            raise AssertionError(msg)

        self._update_mode()
    return
def set_mean_std(self, mean, std):
    """Set ``mean`` and ``std`` directly and derive ``mean_sq`` from them.

    Args:
        mean: ``numpy.ndarray`` of length ``get_size()``, or a scalar when
            the size is 1.
        std: Same form as ``mean``.

    Raises:
        AssertionError: If the inputs are not both arrays while the size
            is not 1, or their lengths do not match the size.
    """
    size = self.get_size()
    both_arrays = isinstance(mean, np.ndarray) and isinstance(std, np.ndarray)

    if not both_arrays:
        assert (size == 1)
        mean = np.array([mean])
        std = np.array([std])

    if len(mean) != size or len(std) != size:
        # Log and raise explicitly so the exception carries the message
        # and the check is not stripped under `python -O`.
        msg = 'Normalizer shape mismatch, expecting size {:d}, but got {:d} and {:d}'.format(
            size, len(mean), len(std))
        Logger.print2(msg)
        raise AssertionError(msg)

    self.mean = mean
    self.std = std
    self.mean_sq = self.calc_mean_sq(self.mean, self.std)
    return
def update_flatgrad(self, flat_grad, grad_scale=1.0):
    """Average ``flat_grad`` across MPI processes and apply one update.

    Every ``CHECK_SYNC_ITERS`` iterations ``check_synced()`` is verified
    first.

    Args:
        flat_grad: Flat gradient buffer of this process. NOTE: it is
            scaled in place when ``grad_scale`` is not 1.0.
        grad_scale: Factor applied to ``flat_grad`` before the reduction.

    Raises:
        AssertionError: If ``check_synced()`` is falsy.
    """
    if self.iter % self.CHECK_SYNC_ITERS == 0 and not self.check_synced():
        # Log and raise explicitly so the exception carries the message
        # and the check is not stripped under `python -O`.
        msg = 'Network parameters desynchronized'
        Logger.print2(msg)
        raise AssertionError(msg)

    if grad_scale != 1.0:
        flat_grad *= grad_scale

    # Sum over all processes, then divide by the process count.
    MPI.COMM_WORLD.Allreduce(flat_grad, self._global_flat_grad, op=MPI.SUM)
    self._global_flat_grad /= MPIUtil.get_num_procs()

    self._load_flat_grad(self._global_flat_grad)
    self.sess.run([self._update], self._grad_feed)
    self.iter += 1

    return
def build_agents(self):
    """Build one agent per environment slot and store them in ``self.agents``.

    Reads ``agent_files``, ``model_files``, ``output_path`` and
    ``int_output_path`` from ``self.arg_parser``. Each of the two file
    lists must be empty or have one entry per agent. A slot whose agent
    file is ``'none'`` gets ``None`` in ``self.agents``.

    Raises:
        AssertionError: If a file list has a length other than 0 or the
            number of agents.
    """
    num_agents = self.env.get_num_agents()
    print("num_agents=", num_agents)
    self.agents = []

    Logger.print2('')
    Logger.print2('Num Agents: {:d}'.format(num_agents))

    agent_files = self.arg_parser.parse_strings('agent_files')
    print("len(agent_files)=", len(agent_files))
    assert (len(agent_files) == num_agents or len(agent_files) == 0)

    model_files = self.arg_parser.parse_strings('model_files')
    assert (len(model_files) == num_agents or len(model_files) == 0)

    output_path = self.arg_parser.parse_string('output_path')
    int_output_path = self.arg_parser.parse_string('int_output_path')

    for i in range(num_agents):
        # The assert above accepts an empty `agent_files`; indexing it
        # unconditionally raised IndexError. An empty list is treated as
        # "no agent" for every slot.
        curr_file = agent_files[i] if agent_files else 'none'
        curr_agent = self._build_agent(i, curr_file)

        if curr_agent is not None:
            curr_agent.output_dir = output_path
            curr_agent.int_output_dir = int_output_path
            Logger.print2(str(curr_agent))

            if model_files:
                curr_model_file = model_files[i]
                if curr_model_file != 'none':
                    curr_agent.load_model(pybullet_data_local.getDataPath() +
                                          "/" + curr_model_file)

        self.agents.append(curr_agent)
        Logger.print2('')

    self.set_enable_training(self.enable_training)

    return
# Manual smoke script for Logger: configures an output file, logs a few
# tabular entries, then prints and dumps the table.
from pybullet_utils_local.logger import Logger

logger = Logger()
# NOTE(review): hard-coded, Windows-style output path -- confirm before
# running on another machine.
logger.configure_output_file("e:/mylog.txt")
# NOTE(review): the same value (1) is logged under "Iteration" on every
# pass; the loop index `i` is not used -- confirm this is intended.
for i in range(10):
    logger.log_tabular("Iteration", 1)
Logger.print2("hello world")

logger.print_tabular()
logger.dump_tabular()
def shutdown():
    """Log a shutdown notice, then shut down the module-level ``world``."""
    # `world` is only read here, so no `global` declaration is required.
    Logger.print2('Shutting down...')
    world.shutdown()
def __init__(self, renders=False, arg_file=''):
    """Load the argument file and define the action/observation spaces.

    Args:
        renders: Stored as ``self._renders``.
        arg_file: Argument file name, loaded from
            ``<pybullet_data_local data path>/args/<arg_file>``. Although
            it defaults to ``''``, an empty value fails the load check.

    Raises:
        AssertionError: If ``arg_file`` is empty or cannot be loaded.
    """
    self._arg_parser = ArgParser()
    Logger.print2(
        "===========================================================")
    succ = False
    if arg_file != '':
        path = pybullet_data_local.getDataPath() + "/args/" + arg_file
        succ = self._arg_parser.load_file(path)
        Logger.print2(arg_file)
    if not succ:
        # Log and raise explicitly so the exception carries the message
        # and the check is not stripped under `python -O`.
        msg = 'Failed to load args from: ' + arg_file
        Logger.print2(msg)
        raise AssertionError(msg)

    self._p = None
    self._time_step = 1. / 240.
    self._internal_env = None
    self._renders = renders
    self._discrete_actions = False
    self._arg_file = arg_file
    self._render_height = 200
    self._render_width = 320

    self.theta_threshold_radians = 12 * 2 * math.pi / 360
    self.x_threshold = 0.4  #2.4

    # Action bounds: 36 entries each (43 control dofs minus 7 root dofs).
    action_bound_min = np.array([
        -4.79999999999, -1.00000000000, -1.00000000000, -1.00000000000,
        -4.00000000000, -1.00000000000, -1.00000000000, -1.00000000000,
        -7.77999999999, -1.00000000000, -1.000000000, -1.000000000,
        -7.850000000, -6.280000000, -1.000000000, -1.000000000,
        -1.000000000, -12.56000000, -1.000000000, -1.000000000,
        -1.000000000, -4.710000000, -7.779999999, -1.000000000,
        -1.000000000, -1.000000000, -7.850000000, -6.280000000,
        -1.000000000, -1.000000000, -1.000000000, -8.460000000,
        -1.000000000, -1.000000000, -1.000000000, -4.710000000
    ])
    action_bound_max = np.array([
        4.799999999, 1.000000000, 1.000000000, 1.000000000,
        4.000000000, 1.000000000, 1.000000000, 1.000000000,
        8.779999999, 1.000000000, 1.0000000, 1.0000000,
        4.7100000, 6.2800000, 1.0000000, 1.0000000,
        1.0000000, 12.560000, 1.0000000, 1.0000000,
        1.0000000, 7.8500000, 8.7799999, 1.0000000,
        1.0000000, 1.0000000, 4.7100000, 6.2800000,
        1.0000000, 1.0000000, 1.0000000, 10.100000,
        1.0000000, 1.0000000, 1.0000000, 7.8500000
    ])
    self.action_space = spaces.Box(action_bound_min, action_bound_max)

    # Observation bounds: 1 + 1 + 105 + 90 = 197 entries.
    observation_min = np.array([0.0] + [-100.0] + [-4.0] * 105 + [-500.0] * 90)
    observation_max = np.array([1.0] + [100.0] + [4.0] * 105 + [500.0] * 90)
    # The upper bound was previously passed as `observation_min`, giving a
    # space whose low and high were identical.
    self.observation_space = spaces.Box(observation_min,
                                        observation_max,
                                        dtype=np.float32)

    self.seed()

    self.viewer = None
    self._configure()
def _train(self):
    """Run one training update, or keep filling the replay buffer.

    While the replay buffer is not yet initialized, only the sample count
    is logged; once it reaches ``init_samples`` the buffer is marked
    initialized. Afterwards, if ``_valid_train_step()`` holds, each of the
    ``_get_iters_per_update()`` iterations logs statistics, advances the
    iteration counter and calls ``_train_step()``. Finally the normalizers
    are refreshed if still needed and, when requested, the agent enters the
    train-end mode.
    """
    self._total_sample_count = int(
        MPIUtil.reduce_sum(self.replay_buffer.total_count))
    should_end = False

    if not self.replay_buffer_initialized:
        # Warm-up: report progress until enough samples were collected.
        Logger.print2("Agent " + str(self.id))
        Logger.print2("Samples: " + str(self._total_sample_count))
        Logger.print2("")

        if self._total_sample_count >= self.init_samples:
            self.replay_buffer_initialized = True
            should_end = self.enable_testing()
    elif self._valid_train_step():
        iter_before = self.iter
        num_iters = self._get_iters_per_update()
        mean_train_return = MPIUtil.reduce_avg(self.train_return)

        for _ in range(num_iters):
            iter_at_start = self.iter
            # store time in hours
            hours = (time.time() - self.start_time) / (60 * 60)

            goal_enabled = self.has_goal()
            state_mean = np.mean(self.s_norm.mean)
            state_std = np.mean(self.s_norm.std)
            goal_mean = np.mean(self.g_norm.mean) if goal_enabled else 0
            goal_std = np.mean(self.g_norm.std) if goal_enabled else 0

            rows = (
                ("Iteration", self.iter),
                ("Wall_Time", hours),
                ("Samples", self._total_sample_count),
                ("Train_Return", mean_train_return),
                ("Test_Return", self.avg_test_return),
                ("State_Mean", state_mean),
                ("State_Std", state_std),
                ("Goal_Mean", goal_mean),
                ("Goal_Std", goal_std),
            )
            for key, value in rows:
                self.logger.log_tabular(key, value)
            self._log_exp_params()

            self._update_iter(self.iter + 1)
            self._train_step()

            Logger.print2("Agent " + str(self.id))
            self.logger.print_tabular()
            Logger.print2("")

            if (self._enable_output() and
                    iter_at_start % self.int_output_iters == 0):
                self.logger.dump_tabular()

        # Request testing once the iteration counter has crossed a
        # multiple of `int_output_iters` during this update.
        if (iter_before // self.int_output_iters !=
                self.iter // self.int_output_iters):
            should_end = self.enable_testing()

    if self._need_normalizer_update:
        self._update_normalizers()
        self._need_normalizer_update = (
            self.normalizer_samples > self._total_sample_count)

    if should_end:
        self._init_mode_train_end()
class RLAgent(ABC):
    """Abstract base class for reinforcement-learning agents.

    It owns the current path, the replay buffer, the normalizers and the
    training / testing mode, and reads its settings from a parameter
    mapping. Subclasses supply the model by implementing the abstract
    methods (``save_model``, ``load_model``, ``_decide_action``,
    ``_get_output_path``, ``_get_int_output_path``, ``_train_step`` and
    ``_check_action_space``).
    """

    class Mode(Enum):
        TRAIN = 0
        TEST = 1
        TRAIN_END = 2

    NAME = "None"

    # Keys looked up in the parameter mapping by `_load_params`.
    UPDATE_PERIOD_KEY = "UpdatePeriod"
    ITERS_PER_UPDATE = "ItersPerUpdate"
    DISCOUNT_KEY = "Discount"
    MINI_BATCH_SIZE_KEY = "MiniBatchSize"
    REPLAY_BUFFER_SIZE_KEY = "ReplayBufferSize"
    INIT_SAMPLES_KEY = "InitSamples"
    NORMALIZER_SAMPLES_KEY = "NormalizerSamples"

    OUTPUT_ITERS_KEY = "OutputIters"
    INT_OUTPUT_ITERS_KEY = "IntOutputIters"
    TEST_EPISODES_KEY = "TestEpisodes"

    EXP_ANNEAL_SAMPLES_KEY = "ExpAnnealSamples"
    EXP_PARAM_BEG_KEY = "ExpParamsBeg"
    EXP_PARAM_END_KEY = "ExpParamsEnd"

    def __init__(self, world, id, json_data):
        """Set default hyper-parameters, load overrides, build components.

        Args:
            world: Owning world object; the agent queries ``world.env``.
            id: Index of this agent within the environment.
            json_data: Parameter mapping passed to ``_load_params``.

        Raises:
            AssertionError: If ``_check_action_space()`` is falsy.
        """
        self.world = world
        self.id = id
        self.logger = Logger()
        self._mode = self.Mode.TRAIN

        if not self._check_action_space():
            # Log and raise explicitly so the exception carries the
            # message and the check is not stripped under `python -O`.
            msg = "Invalid action space, got {:s}".format(
                str(self.get_action_space()))
            Logger.print2(msg)
            raise AssertionError(msg)

        self._enable_training = True
        self.path = Path()
        self.iter = int(0)
        self.start_time = time.time()
        self._update_counter = 0

        self.update_period = 1.0  # simulated time (seconds) before each training update
        self.iters_per_update = int(1)
        self.discount = 0.95
        self.mini_batch_size = int(32)
        self.replay_buffer_size = int(50000)
        self.init_samples = int(1000)
        self.normalizer_samples = np.inf
        self._local_mini_batch_size = self.mini_batch_size  # batch size for each work for multiprocessing
        self._need_normalizer_update = True
        self._total_sample_count = 0

        self._output_dir = ""
        self._int_output_dir = ""
        self.output_iters = 100
        self.int_output_iters = 100

        self.train_return = 0.0
        self.test_episodes = int(0)
        self.test_episode_count = int(0)
        self.test_return = 0.0
        self.avg_test_return = 0.0

        self.exp_anneal_samples = 320000
        self.exp_params_beg = ExpParams()
        self.exp_params_end = ExpParams()
        self.exp_params_curr = ExpParams()

        self._load_params(json_data)
        self._build_replay_buffer(self.replay_buffer_size)
        self._build_normalizers()
        self._build_bounds()
        self.reset()

        return

    def __str__(self):
        """Return a JSON-like summary of the agent's id, type and sizes."""
        action_space_str = str(self.get_action_space())
        info_str = ""
        info_str += '"ID": {:d},\n "Type": "{:s}",\n "ActionSpace": "{:s}",\n "StateDim": {:d},\n "GoalDim": {:d},\n "ActionDim": {:d}'.format(
            self.id, self.NAME,
            # Keep only the part after the last '.' of the action space's
            # string form.
            action_space_str[action_space_str.rfind('.') + 1:],
            self.get_state_size(), self.get_goal_size(),
            self.get_action_size())
        return "{\n" + info_str + "\n}"

    def get_output_dir(self):
        return self._output_dir

    def set_output_dir(self, out_dir):
        """Set the output directory; if non-empty, point the logger at
        ``<out_dir>/agent<id>_log.txt``."""
        self._output_dir = out_dir
        if (self._output_dir != ""):
            self.logger.configure_output_file(out_dir + "/agent" +
                                              str(self.id) + "_log.txt")
        return

    output_dir = property(get_output_dir, set_output_dir)

    def get_int_output_dir(self):
        return self._int_output_dir

    def set_int_output_dir(self, out_dir):
        self._int_output_dir = out_dir
        return

    int_output_dir = property(get_int_output_dir, set_int_output_dir)

    def reset(self):
        """Clear the current path."""
        self.path.clear()
        return

    def update(self, timestep):
        """Advance the agent by ``timestep``.

        A new action is chosen when the environment asks for one. In TRAIN
        mode with training enabled, ``timestep`` is accumulated and one
        training update runs for every ``update_period`` accumulated.
        """
        if self.need_new_action():
            self._update_new_action()

        if (self._mode == self.Mode.TRAIN and self.enable_training):
            self._update_counter += timestep

            # NOTE(review): this loop only terminates when `update_period`
            # is positive -- confirm configs never yield 0.
            while self._update_counter >= self.update_period:
                self._train()
                self._update_exp_params()
                self.world.env.set_sample_count(self._total_sample_count)
                self._update_counter -= self.update_period

        return

    def end_episode(self):
        """Finish the current episode if its path is non-empty.

        The path is closed, stored (TRAIN / TRAIN_END, training enabled)
        or added to the test return (TEST), then ``_update_mode()`` runs.

        Raises:
            AssertionError: If the mode is not TRAIN, TRAIN_END or TEST.
        """
        if (self.path.pathlength() > 0):
            self._end_path()

            if (self._mode == self.Mode.TRAIN or
                    self._mode == self.Mode.TRAIN_END):
                if (self.enable_training and self.path.pathlength() > 0):
                    self._store_path(self.path)
            elif (self._mode == self.Mode.TEST):
                self._update_test_return(self.path)
            else:
                # Explicit log + raise; see `__init__`.
                msg = "Unsupported RL agent mode: " + str(self._mode)
                Logger.print2(msg)
                raise AssertionError(msg)

            self._update_mode()
        return

    def has_goal(self):
        return self.get_goal_size() > 0

    def predict_val(self):
        return 0

    def get_enable_training(self):
        return self._enable_training

    def set_enable_training(self, enable):
        """Enable or disable training; enabling also resets the path."""
        print("set_enable_training=", enable)
        self._enable_training = enable
        if (self._enable_training):
            self.reset()
        return

    enable_training = property(get_enable_training, set_enable_training)

    def enable_testing(self):
        """Return whether any test episodes are configured."""
        return self.test_episodes > 0

    def get_name(self):
        return self.NAME

    @abstractmethod
    def save_model(self, out_path):
        pass

    @abstractmethod
    def load_model(self, in_path):
        pass

    @abstractmethod
    def _decide_action(self, s, g):
        pass

    @abstractmethod
    def _get_output_path(self):
        pass

    @abstractmethod
    def _get_int_output_path(self):
        pass

    @abstractmethod
    def _train_step(self):
        pass

    @abstractmethod
    def _check_action_space(self):
        pass

    # Thin per-agent wrappers around the environment queries.
    def get_action_space(self):
        return self.world.env.get_action_space(self.id)

    def get_state_size(self):
        return self.world.env.get_state_size(self.id)

    def get_goal_size(self):
        return self.world.env.get_goal_size(self.id)

    def get_action_size(self):
        return self.world.env.get_action_size(self.id)

    def get_num_actions(self):
        return self.world.env.get_num_actions(self.id)

    def need_new_action(self):
        return self.world.env.need_new_action(self.id)

    def _build_normalizers(self):
        """Create the state, goal and action normalizers.

        Each is initialised with mean ``-offset`` and std ``1 / scale``
        taken from the environment.
        """
        self.s_norm = Normalizer(
            self.get_state_size(),
            self.world.env.build_state_norm_groups(self.id))
        self.s_norm.set_mean_std(-self.world.env.build_state_offset(self.id),
                                 1 / self.world.env.build_state_scale(self.id))

        self.g_norm = Normalizer(
            self.get_goal_size(),
            self.world.env.build_goal_norm_groups(self.id))
        self.g_norm.set_mean_std(-self.world.env.build_goal_offset(self.id),
                                 1 / self.world.env.build_goal_scale(self.id))

        # Use the per-agent accessor: the env's action size was previously
        # queried without the agent id, unlike every other env query here.
        self.a_norm = Normalizer(self.get_action_size())
        self.a_norm.set_mean_std(
            -self.world.env.build_action_offset(self.id),
            1 / self.world.env.build_action_scale(self.id))
        return

    def _build_bounds(self):
        self.a_bound_min = self.world.env.build_action_bound_min(self.id)
        self.a_bound_max = self.world.env.build_action_bound_max(self.id)
        return

    def _load_params(self, json_data):
        """Override the defaults with the values present in ``json_data``.

        The mini-batch size is then split across MPI processes (rounded up
        per process, at least 1) and the global size is set to that local
        size times the process count.

        Raises:
            AssertionError: If the begin and end exploration noise differ.
        """
        if (self.UPDATE_PERIOD_KEY in json_data):
            # NOTE(review): int() truncates although the default is the
            # float 1.0; a configured value below 1 becomes 0 -- confirm.
            self.update_period = int(json_data[self.UPDATE_PERIOD_KEY])

        if (self.ITERS_PER_UPDATE in json_data):
            self.iters_per_update = int(json_data[self.ITERS_PER_UPDATE])

        if (self.DISCOUNT_KEY in json_data):
            self.discount = json_data[self.DISCOUNT_KEY]

        if (self.MINI_BATCH_SIZE_KEY in json_data):
            self.mini_batch_size = int(json_data[self.MINI_BATCH_SIZE_KEY])

        if (self.REPLAY_BUFFER_SIZE_KEY in json_data):
            self.replay_buffer_size = int(
                json_data[self.REPLAY_BUFFER_SIZE_KEY])

        if (self.INIT_SAMPLES_KEY in json_data):
            self.init_samples = int(json_data[self.INIT_SAMPLES_KEY])

        if (self.NORMALIZER_SAMPLES_KEY in json_data):
            self.normalizer_samples = int(
                json_data[self.NORMALIZER_SAMPLES_KEY])

        if (self.OUTPUT_ITERS_KEY in json_data):
            self.output_iters = json_data[self.OUTPUT_ITERS_KEY]

        if (self.INT_OUTPUT_ITERS_KEY in json_data):
            self.int_output_iters = json_data[self.INT_OUTPUT_ITERS_KEY]

        if (self.TEST_EPISODES_KEY in json_data):
            self.test_episodes = int(json_data[self.TEST_EPISODES_KEY])

        if (self.EXP_ANNEAL_SAMPLES_KEY in json_data):
            self.exp_anneal_samples = json_data[self.EXP_ANNEAL_SAMPLES_KEY]

        if (self.EXP_PARAM_BEG_KEY in json_data):
            self.exp_params_beg.load(json_data[self.EXP_PARAM_BEG_KEY])

        if (self.EXP_PARAM_END_KEY in json_data):
            self.exp_params_end.load(json_data[self.EXP_PARAM_END_KEY])

        num_procs = MPIUtil.get_num_procs()
        self._local_mini_batch_size = int(
            np.ceil(self.mini_batch_size / num_procs))
        self._local_mini_batch_size = np.maximum(self._local_mini_batch_size, 1)
        self.mini_batch_size = self._local_mini_batch_size * num_procs

        assert (self.exp_params_beg.noise == self.exp_params_end.noise
               )  # noise std should not change
        self.exp_params_curr = copy.deepcopy(self.exp_params_beg)
        self.exp_params_end.noise = self.exp_params_beg.noise

        self._need_normalizer_update = self.normalizer_samples > 0

        return

    def _record_state(self):
        s = self.world.env.record_state(self.id)
        return s

    def _record_goal(self):
        g = self.world.env.record_goal(self.id)
        return g

    def _record_reward(self):
        r = self.world.env.calc_reward(self.id)
        return r

    def _apply_action(self, a):
        self.world.env.set_action(self.id, a)
        return

    def _record_flags(self):
        return int(0)

    def _is_first_step(self):
        return len(self.path.states) == 0

    def _end_path(self):
        """Append the final reward, state and goal and set ``terminate``."""
        s = self._record_state()
        g = self._record_goal()
        r = self._record_reward()

        self.path.rewards.append(r)
        self.path.states.append(s)
        self.path.goals.append(g)
        self.path.terminate = self.world.env.check_terminate(self.id)

        return

    def _update_new_action(self):
        """Record the transition, choose an action and apply it.

        The reward is recorded for every step except the first one of a
        path.
        """
        s = self._record_state()
        g = self._record_goal()

        if not (self._is_first_step()):
            r = self._record_reward()
            self.path.rewards.append(r)

        a, logp = self._decide_action(s=s, g=g)
        assert len(np.shape(a)) == 1
        assert len(np.shape(logp)) <= 1

        flags = self._record_flags()
        self._apply_action(a)

        self.path.states.append(s)
        self.path.goals.append(g)
        self.path.actions.append(a)
        self.path.logps.append(logp)
        self.path.flags.append(flags)

        if self._enable_draw():
            self._log_val(s, g)

        return

    def _update_exp_params(self):
        """Interpolate the exploration parameters from begin to end.

        The interpolation factor is the total sample count divided by
        ``exp_anneal_samples``, clipped to [0, 1].
        """
        lerp = float(self._total_sample_count) / self.exp_anneal_samples
        lerp = np.clip(lerp, 0.0, 1.0)
        self.exp_params_curr = self.exp_params_beg.lerp(
            self.exp_params_end, lerp)
        return

    def _update_test_return(self, path):
        path_reward = path.calc_return()
        self.test_return += path_reward
        self.test_episode_count += 1
        return

    def _update_mode(self):
        """Run the mode-specific update hook for the current mode.

        Raises:
            AssertionError: If the mode is not TRAIN, TRAIN_END or TEST.
        """
        if (self._mode == self.Mode.TRAIN):
            self._update_mode_train()
        elif (self._mode == self.Mode.TRAIN_END):
            self._update_mode_train_end()
        elif (self._mode == self.Mode.TEST):
            self._update_mode_test()
        else:
            # Explicit log + raise; see `__init__`.
            msg = "Unsupported RL agent mode: " + str(self._mode)
            Logger.print2(msg)
            raise AssertionError(msg)
        return

    def _update_mode_train(self):
        return

    def _update_mode_train_end(self):
        self._init_mode_test()
        return

    def _update_mode_test(self):
        """Finish testing once enough test episodes have been run.

        The average test return is computed over all processes; if
        training is enabled the agent switches back to TRAIN mode.
        """
        if (self.test_episode_count * MPIUtil.get_num_procs() >=
                self.test_episodes):
            global_return = MPIUtil.reduce_sum(self.test_return)
            global_count = MPIUtil.reduce_sum(self.test_episode_count)
            avg_return = global_return / global_count
            self.avg_test_return = avg_return

            if self.enable_training:
                self._init_mode_train()
        return

    def _init_mode_train(self):
        self._mode = self.Mode.TRAIN
        self.world.env.set_mode(self._mode)
        return

    def _init_mode_train_end(self):
        self._mode = self.Mode.TRAIN_END
        return

    def _init_mode_test(self):
        self._mode = self.Mode.TEST
        self.test_return = 0.0
        self.test_episode_count = 0
        self.world.env.set_mode(self._mode)
        return

    def _enable_output(self):
        return MPIUtil.is_root_proc() and self.output_dir != ""

    def _enable_int_output(self):
        return MPIUtil.is_root_proc() and self.int_output_dir != ""

    def _calc_val_bounds(self, discount):
        """Return ``(r_min, r_max)`` each divided by ``1 - discount``."""
        r_min = self.world.env.get_reward_min(self.id)
        r_max = self.world.env.get_reward_max(self.id)
        assert (r_min <= r_max)

        val_min = r_min / (1.0 - discount)
        val_max = r_max / (1.0 - discount)
        return val_min, val_max

    def _calc_val_offset_scale(self, discount):
        """Return ``(offset, scale)`` derived from the value bounds.

        When both bounds are finite, the offset is minus their midpoint
        and the scale is 2 divided by their range; otherwise ``(0, 1)``.
        """
        val_min, val_max = self._calc_val_bounds(discount)
        val_offset = 0
        val_scale = 1

        if (np.isfinite(val_min) and np.isfinite(val_max)):
            val_offset = -0.5 * (val_max + val_min)
            val_scale = 2 / (val_max - val_min)

        return val_offset, val_scale

    def _calc_term_vals(self, discount):
        """Return the terminal values ``(val_fail, val_succ)``.

        Both are 0 when ``discount`` is 0; otherwise each is the matching
        environment reward divided by ``1 - discount``.
        """
        r_fail = self.world.env.get_reward_fail(self.id)
        r_succ = self.world.env.get_reward_succ(self.id)

        r_min = self.world.env.get_reward_min(self.id)
        r_max = self.world.env.get_reward_max(self.id)
        assert (r_fail <= r_max and r_fail >= r_min)
        assert (r_succ <= r_max and r_succ >= r_min)
        assert (not np.isinf(r_fail))
        assert (not np.isinf(r_succ))

        if (discount == 0):
            val_fail = 0
            val_succ = 0
        else:
            val_fail = r_fail / (1.0 - discount)
            val_succ = r_succ / (1.0 - discount)

        return val_fail, val_succ

    def _update_iter(self, iter):
        """Save the model if output is due, then set ``self.iter``.

        The checks use the current ``self.iter`` (before it is replaced by
        ``iter``). Missing output directories are created.
        """
        if (self._enable_output() and self.iter % self.output_iters == 0):
            output_path = self._get_output_path()
            output_dir = os.path.dirname(output_path)
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            self.save_model(output_path)

        if (self._enable_int_output() and
                self.iter % self.int_output_iters == 0):
            int_output_path = self._get_int_output_path()
            int_output_dir = os.path.dirname(int_output_path)
            if not os.path.exists(int_output_dir):
                os.makedirs(int_output_dir)
            self.save_model(int_output_path)

        self.iter = iter
        return

    def _enable_draw(self):
        return self.world.env.enable_draw

    def _log_val(self, s, g):
        pass

    def _build_replay_buffer(self, buffer_size):
        """Create the replay buffer with ``buffer_size`` split across the
        MPI processes."""
        num_procs = MPIUtil.get_num_procs()
        buffer_size = int(buffer_size / num_procs)
        self.replay_buffer = ReplayBuffer(buffer_size=buffer_size)
        self.replay_buffer_initialized = False
        return

    def _store_path(self, path):
        """Store ``path`` in the replay buffer and return its id.

        When the path was accepted, its return becomes ``train_return``
        and, if still needed, it is recorded by the normalizers.
        """
        path_id = self.replay_buffer.store(path)
        valid_path = path_id != MathUtil.INVALID_IDX

        if valid_path:
            self.train_return = path.calc_return()

            if self._need_normalizer_update:
                self._record_normalizers(path)

        return path_id

    def _record_normalizers(self, path):
        states = np.array(path.states)
        self.s_norm.record(states)

        if self.has_goal():
            goals = np.array(path.goals)
            self.g_norm.record(goals)

        return

    def _update_normalizers(self):
        self.s_norm.update()

        if self.has_goal():
            self.g_norm.update()
        return

    def _train(self):
        """Run one training update, or keep filling the replay buffer.

        While the replay buffer is not initialized, only the sample count
        is logged; once it reaches ``init_samples`` the buffer is marked
        initialized. Afterwards each iteration logs statistics, advances
        the iteration counter and calls ``_train_step()``.
        """
        samples = self.replay_buffer.total_count
        self._total_sample_count = int(MPIUtil.reduce_sum(samples))
        end_training = False

        if (self.replay_buffer_initialized):
            if (self._valid_train_step()):
                prev_iter = self.iter
                iters = self._get_iters_per_update()
                avg_train_return = MPIUtil.reduce_avg(self.train_return)

                for _ in range(iters):
                    curr_iter = self.iter
                    wall_time = time.time() - self.start_time
                    wall_time /= 60 * 60  # store time in hours

                    has_goal = self.has_goal()
                    s_mean = np.mean(self.s_norm.mean)
                    s_std = np.mean(self.s_norm.std)
                    g_mean = np.mean(self.g_norm.mean) if has_goal else 0
                    g_std = np.mean(self.g_norm.std) if has_goal else 0

                    self.logger.log_tabular("Iteration", self.iter)
                    self.logger.log_tabular("Wall_Time", wall_time)
                    self.logger.log_tabular("Samples", self._total_sample_count)
                    self.logger.log_tabular("Train_Return", avg_train_return)
                    self.logger.log_tabular("Test_Return", self.avg_test_return)
                    self.logger.log_tabular("State_Mean", s_mean)
                    self.logger.log_tabular("State_Std", s_std)
                    self.logger.log_tabular("Goal_Mean", g_mean)
                    self.logger.log_tabular("Goal_Std", g_std)
                    self._log_exp_params()

                    self._update_iter(self.iter + 1)
                    self._train_step()

                    Logger.print2("Agent " + str(self.id))
                    self.logger.print_tabular()
                    Logger.print2("")

                    if (self._enable_output() and
                            curr_iter % self.int_output_iters == 0):
                        self.logger.dump_tabular()

                # Request testing once the iteration counter has crossed a
                # multiple of `int_output_iters` during this update.
                if (prev_iter // self.int_output_iters !=
                        self.iter // self.int_output_iters):
                    end_training = self.enable_testing()

        else:
            Logger.print2("Agent " + str(self.id))
            Logger.print2("Samples: " + str(self._total_sample_count))
            Logger.print2("")

            if (self._total_sample_count >= self.init_samples):
                self.replay_buffer_initialized = True
                end_training = self.enable_testing()

        if self._need_normalizer_update:
            self._update_normalizers()
            self._need_normalizer_update = self.normalizer_samples > self._total_sample_count

        if end_training:
            self._init_mode_train_end()

        return

    def _get_iters_per_update(self):
        return MPIUtil.get_num_procs() * self.iters_per_update

    def _valid_train_step(self):
        return True

    def _log_exp_params(self):
        self.logger.log_tabular("Exp_Rate", self.exp_params_curr.rate)
        self.logger.log_tabular("Exp_Noise", self.exp_params_curr.noise)
        self.logger.log_tabular("Exp_Temp", self.exp_params_curr.temp)
        return
def load_model(self, in_path):
    """Restore the session's variables from ``in_path``.

    Runs with ``self.sess`` and ``self.graph`` installed as the defaults,
    reloads the normalizers afterwards and logs the source path.

    Args:
        in_path: Checkpoint path handed to ``self.saver.restore``.
    """
    with self.sess.as_default():
        with self.graph.as_default():
            self.saver.restore(self.sess, in_path)
            self._load_normalizers()
            Logger.print2('Model loaded from: ' + in_path)