def __init__(self, a_dim, s_dim, a_bound, env=None, buffer=None):
    # transition memory for the dynamics model: (s, s_next, a)
    self.dynamic_memory = np.zeros((MEMORY_CAPACITY, s_dim + s_dim + a_dim),
                                   dtype=np.float32)
    self.num_episodes = NUM_EPISODES
    self.minibatch = MINI_BATCH
    self.sample_size = SAMPLE_SIZE
    self.trainfromscratch = TRAIN_FROM_SCRATCH
    self.sess = tf.Session()
    self.env = copy_env(env)
    self.reset_env = copy_env(env)
    self.globaltesttime = 0
    self.vtrace_losses = []
    self.dlosses = []
    self.alosses = []
    self.dynamic_model = Dynamic_Net(s_dim, a_dim, 'dm')
    if buffer is None:
        self.buffer = Replay_buffer()
    else:
        self.buffer = buffer
    self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
    if not self.trainfromscratch:
        self.buffer.load_data()
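
# ----------------------------------------------------------------------------
# Hedged sketch (not the repo's actual implementation): the constructor above
# relies on a Replay_buffer that exposes store(), sample() and load_data().
# A minimal NumPy ring buffer with that interface could look like the class
# below; the file path used by save_data()/load_data() is a hypothetical
# placeholder, as is the class name.
# ----------------------------------------------------------------------------
import numpy as np


class MinimalReplayBuffer:
    def __init__(self, buffer_size=500):
        self.buffer_size = buffer_size
        self.data = None           # lazily allocated on the first store()
        self.pointer = 0           # next write position (ring buffer)
        self.full = False

    def store(self, transition):
        transition = np.asarray(transition, dtype=np.float32).ravel()
        if self.data is None:
            self.data = np.zeros((self.buffer_size, transition.size),
                                 dtype=np.float32)
        self.data[self.pointer] = transition
        self.pointer = (self.pointer + 1) % self.buffer_size
        self.full = self.full or self.pointer == 0

    def sample(self, batch_size):
        # sample uniformly from the filled portion of the buffer
        high = self.buffer_size if self.full else self.pointer
        idx = np.random.randint(0, high, size=batch_size)
        return self.data[idx]

    def save_data(self, path='buffer.npy'):   # path is a placeholder
        count = self.buffer_size if self.full else self.pointer
        np.save(path, self.data[:count])

    def load_data(self, path='buffer.npy'):   # path is a placeholder
        self.data = np.load(path)
        self.buffer_size = len(self.data)
        self.pointer = 0
        self.full = True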
def MPC(self, initial_start, env, iteration_times=5):
    batch_max_value = np.zeros([iteration_times, 2])
    # total_time = time.time()
    for i in range(1):
        initial_time = time.time()
        # copy environments so V(s) = r + V(s_{t+1}) can be evaluated per rollout
        envs = [copy_env(env) for j in range((ROLL_OUTS + 1) * iteration_times)]
        if i == 0:
            # initial action is expected to be an ndarray of shape [a_dim]
            initial_action = self.sample_action(initial_start.reshape([-1, s_dim]))
        sigma = np.ones([(ROLL_OUTS + 1) * iteration_times, self.a_dim])
        sigma[0] = np.zeros_like(sigma[0])
        action_groups = np.squeeze(
            np.clip(np.random.normal(initial_action, sigma),
                    -self.a_bound[0], self.a_bound[0]))
        action_groups = action_groups.reshape(
            (ROLL_OUTS + 1) * iteration_times, self.a_dim)
        next_stages = []
        rewards = []
        initial_time = time.time() - initial_time
        # print('initial took', initial_time, ' s')
        # calculate_value_time = time.time()
        for j in range(len(action_groups)):
            temp_next_state, temp_reward, _, _ = envs[j].step(
                copy.deepcopy(action_groups[j][0]))
            next_stages.append(temp_next_state)
            rewards.append(temp_reward)
        state_groups = np.array(next_stages)
        rewards = np.array(rewards)
        next_values = np.array(self.get_state_value(state_groups))
        values = rewards.reshape([(ROLL_OUTS + 1) * iteration_times, 1]) + next_values
        # calculate_value_time = time.time() - calculate_value_time
        # print('calculate value time took ', calculate_value_time)
        get_hybrid_action = time.time()
        probability_weighting = np.zeros(((ROLL_OUTS + 1) * iteration_times, 1),
                                         dtype=np.float64)
        exponential_value_loss = np.zeros(((ROLL_OUTS + 1) * iteration_times, 1),
                                          dtype=np.float64)
        maxv = np.max(values, axis=0)
        minv = np.min(values, axis=0)
        if (maxv - minv) < 1e-4:
            # uniform weights when all rollout values are (nearly) equal
            probability_weighting[:] = 1.0 / probability_weighting.shape[0]
        else:
            for k in range(exponential_value_loss.shape[0]):
                res = (maxv - values[k]) / (maxv - minv)
                exponential_value_loss[k] = res
            # normalize to softmax-like probability weights
            probability_weighting[:] = exponential_value_loss[:] / np.sum(
                exponential_value_loss)
        hybrid_action = np.dot(action_groups.T, probability_weighting)
        hybrid_next_state, hybrid_reward, _, _ = envs[-1].step(hybrid_action)
        hybrid_value = hybrid_reward + self.get_state_value(
            np.array(hybrid_next_state).reshape([1, self.s_dim]))
        if hybrid_value >= maxv:
            current_action = hybrid_action
            current_value = hybrid_value
        else:
            current_action = action_groups[np.argmax(values)]
            current_value = maxv
        batch_max_value[i][0] = copy.deepcopy(current_action)
        batch_max_value[i][1] = copy.deepcopy(current_value)
        initial_action = hybrid_action
        # get_hybrid_action = time.time() - get_hybrid_action
        # print('get hybrid action took :', get_hybrid_action)
    index = np.argmax(batch_max_value[:, 1], axis=0)
    # total_time = time.time() - total_time
    # print('pi2_with_critic took ', total_time, ' s')
    return batch_max_value[index][0]
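
# ----------------------------------------------------------------------------
# Reference sketch (not a drop-in replacement for the normalization above):
# MPC() mixes the sampled actions with weights derived from their estimated
# values V(s) = r + V(s_{t+1}).  The conventional PI2 / MPPI form of that
# weighting exponentiates the min-max-normalized value, shown standalone
# below.  `h` is a hypothetical temperature parameter; `values` is (K, 1) and
# `actions` is (K, a_dim), mirroring values/action_groups above.
# ----------------------------------------------------------------------------
import numpy as np


def pi2_weighted_action(actions, values, h=10.0):
    """Combine K candidate actions into one, weighting higher-value rollouts more."""
    values = values.reshape(-1, 1)
    maxv, minv = values.max(), values.min()
    if maxv - minv < 1e-4:
        # all rollouts look equally good: fall back to a uniform average
        weights = np.full((len(values), 1), 1.0 / len(values))
    else:
        # exponentiated, min-max normalized values; softmax-like normalization
        scores = np.exp(h * (values - minv) / (maxv - minv))
        weights = scores / scores.sum()
    return actions.T @ weights   # (a_dim, 1) weighted hybrid action


# usage: mix 20 random 3-dim candidate actions by random value estimates
# a_mix = pi2_weighted_action(np.random.randn(20, 3), np.random.randn(20, 1))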
def pi2_tradition(self, initial_start, env, iteration_times=5):
    # PI2 exploration: take a state as input, generate a hybrid action,
    # refine it iteration_times times, and return the best action found.
    # total_time = time.time()
    batch_max_value = np.zeros([iteration_times, 2])
    for i in range(iteration_times):
        # initial_time = time.time()
        # copy environments so V(s) = r + V(s_{t+1}) can be evaluated per rollout
        envs = [copy_env(env) for j in range(ROLL_OUTS + 1)]
        if i == 0:
            # initial action is expected to be an ndarray of shape [a_dim]
            initial_action = self.sample_action(initial_start.reshape([-1, s_dim]))
        sigma = np.ones([ROLL_OUTS, self.a_dim])
        sigma[0] = np.zeros_like(sigma[0])
        action_groups = np.squeeze(
            np.clip(np.random.normal(initial_action, sigma),
                    -self.a_bound[0], self.a_bound[0]))
        action_groups = action_groups.reshape(ROLL_OUTS, self.a_dim)
        rewards = np.zeros([len(action_groups)])
        # initial_time = time.time() - initial_time
        # print('initial took', initial_time, ' s')
        # calculate_value_time = time.time()
        for j in range(len(action_groups)):
            r = 0
            a = copy.deepcopy(action_groups[j][0])
            s_a = np.zeros([1, s_dim + a_dim])
            s_a[0][:s_dim] = initial_start
            s_a[0][s_dim:s_dim + a_dim] = a
            temp_next_state, temp_reward, done = self.dynamic_model.prediction(s_a)
            r += temp_reward
            timer = 0
            # roll out the learned dynamics model for at most 100 steps
            while not done and timer < 100:
                a = pi2_critic.sample_action(temp_next_state.reshape([-1, s_dim]))
                timer += 1
                s_a = np.zeros([1, s_dim + a_dim])
                s_a[0][:s_dim] = temp_next_state
                s_a[0][s_dim:s_dim + a_dim] = a
                temp_next_state, temp_reward, done = self.dynamic_model.prediction(s_a)
                r += temp_reward
            rewards[j] = r
        values = rewards.reshape([ROLL_OUTS, 1])
        # calculate_value_time = time.time() - calculate_value_time
        # print('calculate value time took ', calculate_value_time)
        # time_start = time.time()
        probability_weighting = np.zeros((ROLL_OUTS, 1), dtype=np.float64)
        exponential_value_loss = np.zeros((ROLL_OUTS, 1), dtype=np.float64)
        maxv = np.max(values, axis=0)
        minv = np.min(values, axis=0)
        if (maxv - minv) < 1e-4:
            # uniform weights when all rollout values are (nearly) equal
            probability_weighting[:] = 1.0 / ROLL_OUTS
        else:
            for k in range(exponential_value_loss.shape[0]):
                res = (maxv - values[k]) / (maxv - minv)
                exponential_value_loss[k] = res
            # normalize to softmax-like probability weights
            probability_weighting[:] = exponential_value_loss[:] / np.sum(
                exponential_value_loss)
        hybrid_action = np.dot(action_groups.T, probability_weighting)
        hybrid_value = 0
        a = hybrid_action
        temp_next_state, temp_reward, done, _ = envs[-1].step(a)
        hybrid_value += temp_reward
        timer = 0
        # evaluate the hybrid action in a copied environment for at most 100 steps
        while not done and timer < 100:
            a = pi2_critic.sample_action(temp_next_state.reshape([-1, s_dim]))
            temp_next_state, temp_reward, done, _ = envs[-1].step(a)
            hybrid_value += temp_reward
            timer += 1
        if hybrid_value >= maxv:
            current_action = hybrid_action
            current_value = hybrid_value
        else:
            current_action = action_groups[np.argmax(values)]
            current_value = maxv
        batch_max_value[i][0] = copy.deepcopy(current_action)
        batch_max_value[i][1] = copy.deepcopy(current_value)
        initial_action = hybrid_action
        # time_end = time.time()
        # print('get hybrid action :', time_end - time_start)
    index = np.argmax(batch_max_value[:, 1], axis=0)
    # total_time = time.time() - total_time
    # print('pi2_tradition took ', total_time, ' s')
    return batch_max_value[index][0]
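
# ----------------------------------------------------------------------------
# Hedged sketch of the rollout used by pi2_tradition() above: score a first
# action by stepping a learned dynamics model under the current policy for at
# most `horizon` steps.  `predict_fn(s, a) -> (s_next, r, done)` and
# `policy_fn(s) -> a` are assumed callable signatures that mirror
# dynamic_model.prediction and sample_action; they are not the repo's API.
# ----------------------------------------------------------------------------
import numpy as np


def model_rollout_return(predict_fn, policy_fn, s0, a0, horizon=100):
    total_reward = 0.0
    state, action = np.asarray(s0, dtype=np.float32), a0
    for _ in range(horizon):
        state, reward, done = predict_fn(state, action)
        total_reward += reward
        if done:
            break
        action = policy_fn(state)      # follow the policy after the first action
    return total_reward


# usage with toy stand-ins for the model and the policy:
# predict = lambda s, a: (s + a, -float(np.sum(s ** 2)), False)
# policy = lambda s: -0.1 * s
# ret = model_rollout_return(predict, policy, np.ones(3), np.zeros(3))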
def __init__(self, a_dim, s_dim, a_bound, env=None, buffer=None):
    # transition memory for the dynamics model: (s, s_next, a)
    self.dynamic_memory = np.zeros((MEMORY_CAPACITY, s_dim + s_dim + a_dim),
                                   dtype=np.float32)
    self.num_episodes = NUM_EPISODES
    self.minibatch = MINI_BATCH
    self.sample_size = SAMPLE_SIZE
    self.trainfromscratch = TRAIN_FROM_SCRATCH
    self.sess = tf.Session()
    self.env = copy_env(env)
    self.reset_env = copy_env(env)
    self.globaltesttime = 0
    self.vtrace_losses = []
    self.dlosses = []
    self.alosses = []
    self.dynamic_model = Dynamic_Net(s_dim, a_dim, 'dm')
    if buffer is None:
        self.buffer = Replay_buffer(buffer_size=500)
    else:
        self.buffer = buffer
    self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound

    self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
    self.target_action = tf.placeholder(tf.float32, [None, a_dim])
    self.target_value = tf.placeholder(tf.float32, [None, 1])
    self.current_action = tf.placeholder(tf.float32, [None, a_dim])
    self.generate_sample_from_outside_buffer = False

    with tf.variable_scope('Actor'):
        self.a, self.a_mu = self._build_a(self.S, scope='eval', trainable=True)
        self.action = tf.clip_by_value(tf.squeeze(self.a.sample(1), axis=0),
                                       -a_bound[0], a_bound[0])
        self.action_prob = self.a.prob(self.current_action)
    with tf.variable_scope('Critic'):
        self.q = self._build_c(self.S, scope='vtrace', trainable=True)
        self.q_compare = self._build_c(self.S, scope='td_lambda', trainable=True)

    # network parameters
    self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                       scope='Actor/eval')
    self.cv_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                       scope='Critic/vtrace')
    self.ctd_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope='Critic/td_lambda')

    # both critics are trained as MSE regressions onto supplied value targets
    self.vtrace_error = tf.losses.mean_squared_error(labels=self.target_value,
                                                     predictions=self.q)
    self.td_error = tf.losses.mean_squared_error(labels=self.target_value,
                                                 predictions=self.q_compare)
    self.vtrace_train = tf.train.AdamOptimizer(LR_C).minimize(
        self.vtrace_error, var_list=self.cv_params)
    self.td_train = tf.train.AdamOptimizer(LR_C).minimize(
        self.td_error, var_list=self.ctd_params)
    # supervised actor loss: regress the policy mean onto the target action
    self.a_loss = tf.losses.mean_squared_error(labels=self.target_action,
                                               predictions=self.a_mu)
    self.atrain = tf.train.AdamOptimizer(LR_A).minimize(
        self.a_loss, var_list=self.ae_params)

    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver()
    if not self.trainfromscratch:
        self.buffer.load_data()
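
# ----------------------------------------------------------------------------
# Minimal self-contained sketch of the training pattern wired up above: the
# critics (and likewise the actor) are trained as plain MSE regressions onto
# externally supplied targets, not through a policy-gradient objective.  The
# layer sizes, learning rate, and the tf.compat.v1 import are assumptions for
# illustration only, not the repo's settings.
# ----------------------------------------------------------------------------
import numpy as np
import tensorflow.compat.v1 as tf1

tf1.disable_eager_execution()

s_dim_demo = 11
S_demo = tf1.placeholder(tf1.float32, [None, s_dim_demo], 's_demo')
target_value_demo = tf1.placeholder(tf1.float32, [None, 1])

with tf1.variable_scope('critic_demo'):
    hidden = tf1.layers.dense(S_demo, 64, tf1.nn.relu)
    v_demo = tf1.layers.dense(hidden, 1)

critic_loss = tf1.losses.mean_squared_error(labels=target_value_demo,
                                            predictions=v_demo)
critic_train = tf1.train.AdamOptimizer(1e-3).minimize(critic_loss)

with tf1.Session() as sess:
    sess.run(tf1.global_variables_initializer())
    # one supervised critic step on random data, mirroring vtrace_train above
    batch_s = np.random.randn(32, s_dim_demo).astype(np.float32)
    batch_v = np.random.randn(32, 1).astype(np.float32)
    _, loss = sess.run([critic_train, critic_loss],
                       feed_dict={S_demo: batch_s, target_value_demo: batch_v})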