def _calc_updated_vals(self, idx):
    r = self.replay_buffer.get('rewards', idx)

    if self.discount == 0:
        new_V = r
    else:
        next_idx = self.replay_buffer.get_next_idx(idx)
        s_next = self.replay_buffer.get('states', next_idx)
        g_next = self.replay_buffer.get('goals', next_idx) if self.has_goal() else None
        #####################[
        wt_next = self.replay_buffer.get('wts', next_idx) \
            if Settings.mode() in (Mode.CWT_CNN_v1, Mode.CWT_CNN_v2) else None
        #####################]

        is_end = self.replay_buffer.is_path_end(idx)
        is_fail = self.replay_buffer.check_terminal_flag(idx, Env.Terminate.Fail)
        is_succ = self.replay_buffer.check_terminal_flag(idx, Env.Terminate.Succ)
        is_fail = np.logical_and(is_end, is_fail)
        is_succ = np.logical_and(is_end, is_succ)

        V_next = self._eval_critic(s_next, g_next, wt_next)
        V_next[is_fail] = self.val_fail
        V_next[is_succ] = self.val_succ

        new_V = r + self.discount * V_next

    return new_V
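# A worked example of the one-step TD target above, assuming discount = 0.95,
# val_fail = 0 and val_succ = 1 (illustrative values, not necessarily the
# configured ones):
#   non-terminal sample: r = 0.8, V(s') = 10.0 -> new_V = 0.8 + 0.95 * 10.0 = 10.3
#   failing terminal:    V(s') is overridden to val_fail -> new_V = 0.8 + 0.95 * 0 = 0.8
#   succeeding terminal: V(s') is overridden to val_succ -> new_V = 0.8 + 0.95 * 1 = 1.75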
def _update_actor(self):
    key = self.EXP_ACTION_FLAG
    idx = self.replay_buffer.sample_filtered(self._local_mini_batch_size, key)
    has_goal = self.has_goal()

    s = self.replay_buffer.get('states', idx)
    g = self.replay_buffer.get('goals', idx) if has_goal else None
    #####################[
    wt = self.replay_buffer.get('wts', idx) \
        if Settings.mode() in (Mode.CWT_CNN_v1, Mode.CWT_CNN_v2) else None
    #####################]
    a = self.replay_buffer.get('actions', idx)

    V_new = self._calc_updated_vals(idx)
    V_old = self._eval_critic(s, g, wt)
    adv = V_new - V_old

    feed = {
        self.s_tf: s,
        self.g_tf: g,
        self.wt_tf: wt,
        self.a_tf: a,
        self.adv_tf: adv
    }

    loss, grads = self.sess.run([self.actor_loss_tf, self.actor_grad_tf], feed)
    self.actor_solver.update(grads)
    return loss
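# The actor takes a vanilla policy-gradient step: the advantage of each stored
# action is adv = (r + discount * V(s')) - V(s), i.e. the one-step TD target
# from _calc_updated_vals minus the critic's current estimate, and
# actor_loss_tf is assumed to weight -logp(a|s) by that advantage. Only
# samples flagged with EXP_ACTION_FLAG (exploratory actions) are drawn,
# presumably because deterministic on-policy actions carry no useful gradient
# signal for the stochastic policy.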
def _build_net_critic(self, net_name):
    norm_s_tf = self.s_norm.normalize_tf(self.s_tf)
    input_tfs = [norm_s_tf]
    if (self.has_goal()):
        norm_g_tf = self.g_norm.normalize_tf(self.g_tf)
        input_tfs += [norm_g_tf]
    #####################[
    if Settings.mode() in (Mode.CWT_CNN_v1, Mode.CWT_CNN_v2):
        cnn_network = NetBuilder.build_net(MyCNN.NAME, self.wt_tf,
                                           self.my_memory_buffer.channel_count)
        input_tfs += [cnn_network]
    #####################]

    h = NetBuilder.build_net(net_name, input_tfs)
    norm_val_tf = tf.layers.dense(inputs=h,
                                  units=1,
                                  activation=None,
                                  kernel_initializer=TFUtil.xavier_initializer)
    norm_val_tf = tf.reshape(norm_val_tf, [-1])
    val_tf = self.val_norm.unnormalize_tf(norm_val_tf)
    return val_tf
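# In the CWT modes the critic does not consume the raw scalogram directly:
# NetBuilder builds the MyCNN subgraph on wt_tf and its output (assumed to be
# a flattened feature vector) is appended to input_tfs, so the fully connected
# value net sees [normalized state, normalized goal, CNN features]
# concatenated. _build_net_actor below shares the same input construction.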
def _build_net_actor(self, net_name, init_output_scale):
    norm_s_tf = self.s_norm.normalize_tf(self.s_tf)
    input_tfs = [norm_s_tf]
    if (self.has_goal()):
        norm_g_tf = self.g_norm.normalize_tf(self.g_tf)
        input_tfs += [norm_g_tf]
    #####################[
    if Settings.mode() in (Mode.CWT_CNN_v1, Mode.CWT_CNN_v2):
        cnn_network = NetBuilder.build_net(MyCNN.NAME, self.wt_tf,
                                           self.my_memory_buffer.channel_count)
        input_tfs += [cnn_network]
    #####################]

    h = NetBuilder.build_net(net_name, input_tfs)
    norm_a_tf = tf.layers.dense(inputs=h,
                                units=self.get_action_size(),
                                activation=None,
                                kernel_initializer=tf.random_uniform_initializer(
                                    minval=-init_output_scale,
                                    maxval=init_output_scale))
    a_tf = self.a_norm.unnormalize_tf(norm_a_tf)
    return a_tf
def _build_nets(self, json_data):
    assert self.ACTOR_NET_KEY in json_data
    assert self.CRITIC_NET_KEY in json_data

    actor_net_name = json_data[self.ACTOR_NET_KEY]
    critic_net_name = json_data[self.CRITIC_NET_KEY]
    actor_init_output_scale = 1 if (self.ACTOR_INIT_OUTPUT_SCALE_KEY not in json_data) \
        else json_data[self.ACTOR_INIT_OUTPUT_SCALE_KEY]

    s_size = self.get_state_size()
    g_size = self.get_goal_size()
    a_size = self.get_action_size()

    # setup input tensors
    self.s_tf = tf.placeholder(tf.float32, shape=[None, s_size], name="s")  # observations
    self.tar_val_tf = tf.placeholder(tf.float32, shape=[None], name="tar_val")  # target values
    self.adv_tf = tf.placeholder(tf.float32, shape=[None], name="adv")  # advantages
    self.a_tf = tf.placeholder(tf.float32, shape=[None, a_size], name="a")  # target actions
    self.g_tf = tf.placeholder(tf.float32,
                               shape=([None, g_size] if self.has_goal() else None),
                               name="g")  # goals
    #####################[
    if Settings.mode() in (Mode.CWT_CNN_v1, Mode.CWT_CNN_v2):
        self.wt_tf = tf.placeholder(tf.float32,
                                    shape=[
                                        None, self.my_wt.scale_count,
                                        self.my_memory_buffer.length,
                                        self.my_memory_buffer.channel_count
                                    ],
                                    name="wt")
    else:
        self.wt_tf = tf.placeholder(tf.float32, shape=None, name="wt")
    #####################]

    with tf.variable_scope('main'):
        with tf.variable_scope('actor'):
            self.actor_tf = self._build_net_actor(actor_net_name, actor_init_output_scale)
        with tf.variable_scope('critic'):
            self.critic_tf = self._build_net_critic(critic_net_name)

    if self.actor_tf is not None:
        Logger.print('Built actor net: ' + actor_net_name)
    if self.critic_tf is not None:
        Logger.print('Built critic net: ' + critic_net_name)
    return
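# With the defaults set up in __init__ below (memory_buffer_size = 16, a list
# of 20 channels), wt_tf has shape [None, 16, 16, 20]: a batch of square 16x16
# scalograms with one "image" channel per tracked joint coordinate, which is
# what lets MyCNN treat them like multi-channel images. Outside the CWT modes
# wt_tf is presumably just a dummy placeholder with unspecified shape so the
# rest of the code can keep a uniform feed structure.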
def _update_new_action(self):
    s = self._record_state()
    g = self._record_goal()
    #####################[
    #self.my_update_array.append(self.my_update_count)
    #self.my_update_count = 0
    wt = [[]]
    if Settings.mode() in (Mode.CWT_CNN_v1, Mode.CWT_CNN_v2):
        self.my_memory_buffer.save(s[-self.my_memory_buffer.channel_count:])
        wt = [self.my_wt.calculate_cwt(self.my_memory_buffer)]
    #####################]

    if not (self._is_first_step()):
        r = self._record_reward()
        self.path.rewards.append(r)

    a, logp = self._decide_action(s=s, g=g, wt=wt)
    assert len(np.shape(a)) == 1
    assert len(np.shape(logp)) <= 1

    flags = self._record_flags()
    self._apply_action(a)

    self.path.states.append(s)
    self.path.goals.append(g)
    #####################[
    self.path.wts.append(wt[0])
    #####################]
    self.path.actions.append(a)
    self.path.logps.append(logp)
    self.path.flags.append(flags)

    if self._enable_draw():
        self._log_val(s, g, wt)
    return
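# Per control step the reduced state (the last channel_count entries, i.e. the
# tracked joint coordinates) is pushed into the sliding memory buffer and a
# fresh scalogram is computed over that window. wt is wrapped in an extra list
# to form a batch of size 1 for _decide_action, while path.wts stores the
# unbatched wt[0] so that replay-buffer samples stack cleanly into
# [batch, scales, time, channels].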
def end_episode(self):
    if (self.path.pathlength() > 0):
        self._end_path()

        if (self._mode == self.Mode.TRAIN or self._mode == self.Mode.TRAIN_END):
            if (self.enable_training and self.path.pathlength() > 0):
                self._store_path(self.path)
        elif (self._mode == self.Mode.TEST):
            self._update_test_return(self.path)
        else:
            assert False, Logger.print("Unsupported RL agent mode: " + str(self._mode))

        self._update_mode()
    #####################[
    if Settings.mode() in (Mode.CWT_CNN_v1, Mode.CWT_CNN_v2):
        self.my_memory_buffer.reset()
    #####################]
    return
def _end_path(self):
    s = self._record_state()
    g = self._record_goal()
    #####################[
    wt = []
    if Settings.mode() in (Mode.CWT_CNN_v1, Mode.CWT_CNN_v2):
        # save only a reduced set of state variables: the last "len(channels)" entries
        self.my_memory_buffer.save(s[-self.my_memory_buffer.channel_count:])
        wt = self.my_wt.calculate_cwt(self.my_memory_buffer)
    #####################]
    r = self._record_reward()

    self.path.rewards.append(r)
    self.path.states.append(s)
    self.path.goals.append(g)
    #####################[
    self.path.wts.append(wt)
    #####################]
    self.path.terminate = self.world.env.check_terminate(self.id)
    return
def _update_critic(self):
    idx = self.replay_buffer.sample(self._local_mini_batch_size)
    s = self.replay_buffer.get('states', idx)
    g = self.replay_buffer.get('goals', idx) if self.has_goal() else None
    #####################[
    wt = self.replay_buffer.get('wts', idx) \
        if Settings.mode() in (Mode.CWT_CNN_v1, Mode.CWT_CNN_v2) else None
    #####################]

    tar_V = self._calc_updated_vals(idx)
    tar_V = np.clip(tar_V, self.val_min, self.val_max)

    feed = {
        self.s_tf: s,
        self.g_tf: g,
        self.wt_tf: wt,
        self.tar_val_tf: tar_V
    }

    loss, grads = self.sess.run([self.critic_loss_tf, self.critic_grad_tf], feed)
    self.critic_solver.update(grads)
    return loss
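# Critic targets are the same one-step TD values used for the actor's
# advantages, clipped to [val_min, val_max] so a single bad bootstrap cannot
# drag the regression target outside the achievable value range.
# critic_loss_tf is assumed to be a squared error between critic_tf and
# tar_val_tf.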
def _train(self):
    samples = self.replay_buffer.total_count
    self._total_sample_count = int(MPIUtil.reduce_sum(samples))
    end_training = False

    if (self.replay_buffer_initialized):
        if (self._valid_train_step()):
            prev_iter = self.iter
            iters = self._get_iters_per_update()
            avg_train_return = MPIUtil.reduce_avg(self.train_return)

            for i in range(iters):
                curr_iter = self.iter
                wall_time = time.time() - self.start_time
                wall_time /= 60 * 60  # store time in hours

                has_goal = self.has_goal()
                s_mean = np.mean(self.s_norm.mean)
                s_std = np.mean(self.s_norm.std)
                g_mean = np.mean(self.g_norm.mean) if has_goal else 0
                g_std = np.mean(self.g_norm.std) if has_goal else 0

                self.logger.log_tabular("Iteration", self.iter)
                self.logger.log_tabular("Wall_Time", wall_time)
                self.logger.log_tabular("Samples", self._total_sample_count)
                self.logger.log_tabular("Train_Return", avg_train_return)
                self.logger.log_tabular("Test_Return", self.avg_test_return)
                self.logger.log_tabular("State_Mean", s_mean)
                self.logger.log_tabular("State_Std", s_std)
                self.logger.log_tabular("Goal_Mean", g_mean)
                self.logger.log_tabular("Goal_Std", g_std)
                self._log_exp_params()

                self._update_iter(self.iter + 1)
                self._train_step()

                Logger.print("Agent " + str(self.id))
                self.logger.print_tabular()
                Logger.print("")

                if (self._enable_output() and curr_iter % self.int_output_iters == 0):
                    self.logger.dump_tabular()

            if (prev_iter // self.int_output_iters != self.iter // self.int_output_iters):
                end_training = self.enable_testing()

            #####################[
            if Settings.use_babe_support() and self.is_baby_support_on:
                if self.avg_test_return > self.baby_support_threshold:
                    self._set_off_baby_support()
            #####################]
    else:
        Logger.print("Agent " + str(self.id))
        Logger.print("Samples: " + str(self._total_sample_count))
        Logger.print("")

        if (self._total_sample_count >= self.init_samples):
            self.replay_buffer_initialized = True
            end_training = self.enable_testing()

    if self._need_normalizer_update:
        self._update_normalizers()
        self._need_normalizer_update = self.normalizer_samples > self._total_sample_count

    if end_training:
        self._init_mode_train_end()
    return
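# Baby support (an assistive curriculum; _set_off_baby_support is assumed to
# remove the external support in the environment) is switched off once the
# average test return clears baby_support_threshold. With the values chosen in
# __init__ that is 0.8 * 30 * 20 = 480, i.e. 80% of the maximum per-query
# reward accumulated over a 20 s episode queried at 30 Hz.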
def __init__(self, world, id, json_data):
    self.world = world
    self.id = id
    self.logger = Logger()
    self._mode = self.Mode.TRAIN

    assert self._check_action_space(), \
        Logger.print("Invalid action space, got {:s}".format(str(self.get_action_space())))

    self._enable_training = True
    self.path = Path()
    self.iter = int(0)
    self.start_time = time.time()
    self._update_counter = 0

    self.update_period = 1.0  # simulated time (seconds) before each training update
    self.iters_per_update = int(1)
    self.discount = 0.95
    self.mini_batch_size = int(32)
    self.replay_buffer_size = int(50000)
    self.init_samples = int(1000)
    self.normalizer_samples = np.inf
    self._local_mini_batch_size = self.mini_batch_size  # batch size for each worker when multiprocessing
    self._need_normalizer_update = True
    self._total_sample_count = 0

    self._output_dir = ""
    self._int_output_dir = ""
    self.output_iters = 100
    self.int_output_iters = 100

    self.train_return = 0.0
    self.test_episodes = int(0)
    self.test_episode_count = int(0)
    self.test_return = 0.0
    self.avg_test_return = 0.0

    self.exp_anneal_samples = 320000
    self.exp_params_beg = ExpParams()
    self.exp_params_end = ExpParams()
    self.exp_params_curr = ExpParams()

    self._load_params(json_data)
    self._build_replay_buffer(self.replay_buffer_size)
    self._build_normalizers()
    self._build_bounds()
    self.reset()

    #####################[
    self.is_baby_support_on = False
    # baby_support_threshold = baby_support_max_value * policy frequency (query_rate) * max_step_train_time
    self.baby_support_threshold = 0.8 * 30 * 20
    if Settings.use_babe_support():
        self._set_on_baby_support()

    if Settings.mode() in (Mode.CWT_CNN_v1, Mode.CWT_CNN_v2):
        channels = [
            "r_hip_w", "r_hip_x", "r_hip_y", "r_hip_z", "r_knee_rot",
            "r_shoulder_w", "r_shoulder_x", "r_shoulder_y", "r_shoulder_z",
            "r_elbow_rot", "l_hip_w", "l_hip_x", "l_hip_y", "l_hip_z",
            "l_knee_rot", "l_shoulder_w", "l_shoulder_x", "l_shoulder_y",
            "l_shoulder_z", "l_elbow_rot"
        ]
        memory_buffer_size = 16
        cache_size = 1
        self.my_memory_buffer = MyMemoryBuffer(channels, memory_buffer_size, cache_size)

        scale_min = 1
        scale_max = memory_buffer_size
        scale_count = memory_buffer_size  # square scalogram
        self.my_wt = MyWT(scale_min, scale_max, scale_count)

        # get test data
        #self.my_rawStateData = MyRawStateData(197, 1200)
    #####################]
    return
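# For reference, a minimal sketch of what MyWT.calculate_cwt is expected to
# produce for a single channel, using scipy's Ricker-wavelet CWT (the actual
# wavelet and implementation inside MyWT may differ):
#
#   import numpy as np
#   from scipy import signal
#
#   window = np.asarray(memory_buffer_channel)             # length-16 history of one joint
#   widths = np.arange(1, 16 + 1)                          # scale_min .. scale_max
#   scalogram = signal.cwt(window, signal.ricker, widths)  # shape (16, 16)
#
# Stacking one such square scalogram per channel yields the
# (scale_count, length, channel_count) tensor fed to wt_tf.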