Example No. 1
    def __init__(self, a_dim, s_dim, a_bound, env=None, buffer=None):
        self.dynamic_memory = np.zeros(
            (MEMORY_CAPACITY, s_dim + s_dim + a_dim), dtype=np.float32)
        # each row stores a (s, s_next, a) transition for training the dynamics model
        self.num_episodes = NUM_EPISODES
        self.minibatch = MINI_BATCH
        self.sample_size = SAMPLE_SIZE
        self.trainfromscratch = TRAIN_FROM_SCRATCH
        self.sess = tf.Session()
        self.env = copy_env(env)
        self.reset_env = copy_env(env)
        self.globaltesttime = 0
        self.vtrace_losses = []
        self.dlosses = []
        self.alosses = []
        self.dynamic_model = Dynamic_Net(s_dim, a_dim, 'dm')
        if buffer is None:
            self.buffer = Replay_buffer()
        else:
            self.buffer = buffer
        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound

        if not self.trainfromscratch:
            self.buffer.load_data()
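
The constructor above depends on a Replay_buffer class that is not part of this excerpt. As a point of reference only, here is a minimal sketch of what such a FIFO, uniformly-sampled replay buffer could look like; the class name SimpleReplayBuffer and its store/sample methods are hypothetical stand-ins, not the project's actual API.

import random
from collections import deque

import numpy as np


class SimpleReplayBuffer:
    # Hypothetical stand-in for the Replay_buffer referenced above.
    def __init__(self, buffer_size=500):
        self.storage = deque(maxlen=buffer_size)  # oldest transitions are evicted first

    def store(self, s, a, r, s_next, done):
        self.storage.append((s, a, r, s_next, done))

    def sample(self, batch_size):
        # uniform sampling without replacement, capped at the current buffer size
        batch = random.sample(list(self.storage), min(batch_size, len(self.storage)))
        s, a, r, s_next, done = map(np.array, zip(*batch))
        return s, a, r, s_next, done
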
Example No. 2
 def MPC(self, initial_start, env, iteration_times=5):
     batch_max_value = np.zeros([iteration_times, 2])
     #total_time = time.time()
     for i in range(1):
         initial_time = time.time()
         envs = [
             copy_env(env) for j in range((ROLL_OUTS + 1) * iteration_times)
         ]  # copy the environment to prepare for computing V(S) = r + V(S_{t+1})
         if i == 0:
             initial_action = self.sample_action(
                 initial_start.reshape([-1, s_dim])
             )  # initial action is supposed to be [action_dim,] narray object
         sigma = np.ones([(ROLL_OUTS + 1) * iteration_times, self.a_dim])
         sigma[0] = np.zeros_like(sigma[0])
         action_groups = np.squeeze(
             np.clip(np.random.normal(initial_action, sigma),
                     -self.a_bound[0], self.a_bound[0]))
         action_groups = action_groups.reshape(
             (ROLL_OUTS + 1) * iteration_times, self.a_dim)
         next_stages = []
         rewards = []
         initial_time = time.time() - initial_time
         # print('initial took', initial_time, ' s')
         #  calculate_value_time = time.time()
         for j in range(len(action_groups)):
             temp_next_state, temp_reward, _, _ = envs[j].step(
                 copy.deepcopy(action_groups[j][0]))
             next_stages.append(temp_next_state)
             rewards.append(temp_reward)
         state_groups = np.array(next_stages)
         rewards = np.array(rewards)
         next_values = np.array(self.get_state_value(state_groups))
         values = rewards.reshape([(ROLL_OUTS + 1) * iteration_times, 1
                                   ]) + next_values
         #            calculate_value_time = time.time() - calculate_value_time
         #  print('calculate value time took ', calculate_value_time)
         get_hybrid_action = time.time()
         probability_weighting = np.zeros(
             ((ROLL_OUTS + 1) * iteration_times, 1), dtype=np.float64)
         exponential_value_loss = np.zeros(
             ((ROLL_OUTS + 1) * iteration_times, 1), dtype=np.float64)
         maxv = np.max(values, axis=0)
         minv = np.min(values, axis=0)
         if (maxv - minv) < 1e-4:
             probability_weighting[:] = 1.0 / probability_weighting.shape[0]
         else:
             for k in range(exponential_value_loss.shape[0]):
                 res = (maxv - values[k]) / (maxv - minv)
                 exponential_value_loss[k] = res
             probability_weighting[:] = exponential_value_loss[:] / np.sum(
                 exponential_value_loss)  # compute the (softmax-style) probability weights
         hybrid_action = np.dot(action_groups.T, probability_weighting)
         hybrid_next_state, hybrid_reward, _, _ = envs[-1].step(
             hybrid_action)
         hybrid_value = hybrid_reward + self.get_state_value(
             np.array(hybrid_next_state).reshape([1, self.s_dim]))
         if hybrid_value >= maxv:
             current_action = hybrid_action
             current_value = hybrid_value
         else:
             current_action = action_groups[np.argmax(values)]
             current_value = maxv
         batch_max_value[i][0] = copy.deepcopy(current_action)
         batch_max_value[i][1] = copy.deepcopy(current_value)
         initial_action = hybrid_action
     # get_hybrid_action = time.time() - get_hybrid_action
     #  print('get hybrid action took :', get_hybrid_action)
     index = np.argmax(batch_max_value[:, 1], axis=0)
     # total_time = time.time() - total_time
     #   print('pi2_with_critic took ', total_time,' s')
     return batch_max_value[index][0]
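
The heart of MPC above is the PI2-style weighting: rollout values are rescaled to [0, 1], normalized into probability weights, and the weighted average of the sampled actions becomes the hybrid action. The self-contained sketch below reproduces just that arithmetic; pi2_weighted_action is an illustrative helper, not a method of the class, and it mirrors the code above in giving larger weight to values farther from the maximum.

import numpy as np


def pi2_weighted_action(action_groups, values):
    # action_groups: (N, a_dim) sampled actions; values: (N, 1) rollout evaluations
    values = np.asarray(values, dtype=np.float64).reshape(-1, 1)
    maxv, minv = values.max(), values.min()
    if (maxv - minv) < 1e-4:
        # degenerate case: all rollouts scored (nearly) the same, use uniform weights
        weights = np.full_like(values, 1.0 / len(values))
    else:
        # same rescaling as above: weight grows as a value moves away from maxv
        weights = (maxv - values) / (maxv - minv)
        weights = weights / weights.sum()
    # hybrid action is the weight-averaged action, shape (a_dim, 1)
    return action_groups.T @ weights


actions = np.random.uniform(-2.0, 2.0, size=(10, 1))  # e.g. 10 rollouts of a 1-D action
print(pi2_weighted_action(actions, np.random.randn(10, 1)))
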
Example No. 3
    def pi2_tradition(self, initial_start, env, iteration_times=5):
        # PI2 exploration: given the input state, generate a hybrid action,
        # repeat the point optimization iteration_times (default 5) times, and keep the best.
        #   total_time = time.time()
        batch_max_value = np.zeros([iteration_times, 2])
        for i in range(iteration_times):
            #   initial_time = time.time()
            envs = [copy_env(env) for j in range(ROLL_OUTS + 1)
                    ]  # copy the environment to prepare for computing V(S) = r + V(S_{t+1})
            if i == 0:
                initial_action = self.sample_action(
                    initial_start.reshape([-1, s_dim])
                )  # initial action is supposed to be [action_dim,] narray object
            sigma = np.ones([ROLL_OUTS, self.a_dim])
            sigma[0] = np.zeros_like(sigma[0])
            action_groups = np.squeeze(
                np.clip(np.random.normal(initial_action, sigma),
                        -self.a_bound[0], self.a_bound[0]))
            action_groups = action_groups.reshape(ROLL_OUTS, self.a_dim)
            rewards = np.zeros([len(action_groups)])
            # initial_time = time.time() - initial_time
            #   print('initial took', initial_time, ' s')
            #   calculate_value_time = time.time()
            for j in range(len(action_groups)):

                r = 0
                a = copy.deepcopy(action_groups[j][0])
                s_a = np.zeros([1, s_dim + a_dim])
                s_a[0][:s_dim] = initial_start
                s_a[0][s_dim:s_dim + a_dim] = a
                temp_next_state, temp_reward, done = self.dynamic_model.prediction(
                    s_a)
                r += temp_reward
                timer = 0

                while not done and timer < 100:
                    a = pi2_critic.sample_action(
                        temp_next_state.reshape([-1, s_dim]))
                    timer += 1
                    s_a = np.zeros([1, s_dim + a_dim])
                    s_a[0][:s_dim] = temp_next_state
                    s_a[0][s_dim:s_dim + a_dim] = a
                    temp_next_state, temp_reward, done = self.dynamic_model.prediction(
                        s_a)
                    r += temp_reward
                rewards[j] = r

            values = rewards.reshape([ROLL_OUTS, 1])
            #    calculate_value_time = time.time() - calculate_value_time
            #  print('calculate value time took ', calculate_value_time)
            #    time_start = time.time()
            probability_weighting = np.zeros((ROLL_OUTS, 1), dtype=np.float64)
            exponential_value_loss = np.zeros((ROLL_OUTS, 1), dtype=np.float64)
            maxv = np.max(values, axis=0)
            minv = np.min(values, axis=0)
            if (maxv - minv) < 1e-4:
                probability_weighting[:] = 1.0 / ROLL_OUTS
            else:
                for k in range(exponential_value_loss.shape[0]):
                    res = (maxv - values[k]) / (maxv - minv)
                    exponential_value_loss[k] = res
                probability_weighting[:] = exponential_value_loss[:] / np.sum(
                    exponential_value_loss)  # compute the (softmax-style) probability weights
            hybrid_action = np.dot(action_groups.T, probability_weighting)

            hybrid_value = 0
            a = hybrid_action
            temp_next_state, temp_reward, done, _ = envs[-1].step(a)
            hybrid_value += temp_reward
            timer = 0
            while not done and timer < 100:
                a = pi2_critic.sample_action(
                    temp_next_state.reshape([-1, s_dim]))
                temp_next_state, temp_reward, done, _ = envs[-1].step(a)
                hybrid_value += temp_reward
                timer += 1
            if hybrid_value >= maxv:
                current_action = hybrid_action
                current_value = hybrid_value
            else:
                current_action = action_groups[np.argmax(values)]
                current_value = maxv
            batch_max_value[i][0] = copy.deepcopy(current_action)
            batch_max_value[i][1] = copy.deepcopy(current_value)
            initial_action = hybrid_action
        #    time_end = time.time()
        #    print('get hybrid action :', time_end - time_start)
        index = np.argmax(batch_max_value[:, 1], axis=0)
        # total_time = time.time() - total_time
        # print('pi2_tradition took ', total_time,' s')
        return batch_max_value[index][0]
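
pi2_tradition scores each sampled action by rolling the learned dynamics model forward, accumulating predicted rewards until the model reports done or a step cap is reached. The sketch below isolates that rollout loop under the assumption that the dynamics step returns (next_state, reward, done); rollout_return, dynamics_step, policy, and the toy stand-ins are all illustrative names, not part of the project.

import numpy as np


def rollout_return(dynamics_step, policy, start_state, first_action, max_steps=100):
    # dynamics_step(state, action) -> (next_state, reward, done)  (assumed signature)
    # policy(state) -> action
    total = 0.0
    state, reward, done = dynamics_step(start_state, first_action)
    total += reward
    steps = 0
    while not done and steps < max_steps:
        action = policy(state)
        state, reward, done = dynamics_step(state, action)
        total += reward
        steps += 1
    return total


# toy stand-ins purely for illustration
toy_dynamics = lambda s, a: (s + 0.1 * a, -float(np.abs(s).sum()), bool(np.abs(s).sum() < 0.05))
toy_policy = lambda s: -s
print(rollout_return(toy_dynamics, toy_policy, np.array([1.0]), np.array([-0.5])))
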
Example No. 4
    def __init__(self, a_dim, s_dim, a_bound, env=None, buffer=None):
        self.dynamic_memory = np.zeros(
            (MEMORY_CAPACITY, s_dim + s_dim + a_dim), dtype=np.float32)
        # each row stores a (s, s_next, a) transition for training the dynamics model
        self.num_episodes = NUM_EPISODES
        self.minibatch = MINI_BATCH
        self.sample_size = SAMPLE_SIZE
        self.trainfromscratch = TRAIN_FROM_SCRATCH
        self.sess = tf.Session()
        self.env = copy_env(env)
        self.reset_env = copy_env(env)
        self.globaltesttime = 0
        self.vtrace_losses = []
        self.dlosses = []
        self.alosses = []
        self.dynamic_model = Dynamic_Net(s_dim, a_dim, 'dm')
        if buffer is None:
            self.buffer = Replay_buffer(buffer_size=500)
        else:
            self.buffer = buffer
        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.target_action = tf.placeholder(tf.float32, [None, a_dim])
        self.target_value = tf.placeholder(tf.float32, [None, 1])
        self.current_action = tf.placeholder(tf.float32, [None, a_dim])
        self.generate_sample_from_outside_buffer = False
        with tf.variable_scope('Actor'):
            self.a, self.a_mu = self._build_a(self.S,
                                              scope='eval',
                                              trainable=True)
        self.action = tf.clip_by_value(tf.squeeze(self.a.sample(1), axis=0),
                                       -a_bound[0], a_bound[0])
        self.action_prob = self.a.prob(self.current_action)
        with tf.variable_scope('Critic'):
            self.q = self._build_c(self.S, scope='vtrace', trainable=True)
            self.q_compare = self._build_c(self.S,
                                           scope='td_lambda',
                                           trainable=True)
        # networks parameters
        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='Actor/eval')
        self.cv_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='Critic/vtrace')
        self.ctd_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            scope='Critic/td_lambda')

        self.vtrace_error = tf.losses.mean_squared_error(
            labels=self.target_value, predictions=self.q)
        self.td_error = tf.losses.mean_squared_error(
            labels=self.target_value, predictions=self.q_compare)
        self.vtrace_train = tf.train.AdamOptimizer(LR_C).minimize(
            self.vtrace_error, var_list=self.cv_params)
        self.td_train = tf.train.AdamOptimizer(LR_C).minimize(
            self.td_error, var_list=self.ctd_params)
        self.a_loss = tf.losses.mean_squared_error(
            labels=self.target_action, predictions=self.a_mu)  # regress the policy mean onto the target action
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(
            self.a_loss, var_list=self.ae_params)
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        if not self.trainfromscratch:
            self.buffer.load_data()
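
Example No. 4 builds a TensorFlow 1.x graph: an actor whose mean is regressed onto target actions, plus two critics (one fitted to V-trace targets, one to TD(lambda) targets), each minimizing a mean-squared error with its own Adam optimizer over the variables collected from its scope. The stripped-down, self-contained sketch below shows the critic half of that pattern, assuming the TF1 API via tf.compat.v1; the scope name, layer sizes, and learning rate are illustrative, not the project's constants.

import numpy as np
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

s_dim, lr_c = 11, 1e-3  # illustrative values

S = tf.placeholder(tf.float32, [None, s_dim], 's')
target_value = tf.placeholder(tf.float32, [None, 1], 'target_v')

with tf.variable_scope('critic'):
    hidden = tf.layers.dense(S, 64, tf.nn.relu)
    q = tf.layers.dense(hidden, 1)

# collect only this critic's variables so the optimizer does not touch other networks
critic_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic')
critic_loss = tf.losses.mean_squared_error(labels=target_value, predictions=q)
critic_train = tf.train.AdamOptimizer(lr_c).minimize(critic_loss, var_list=critic_params)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch_s = np.random.randn(32, s_dim).astype(np.float32)
    batch_v = np.random.randn(32, 1).astype(np.float32)
    _, loss = sess.run([critic_train, critic_loss],
                       feed_dict={S: batch_s, target_value: batch_v})
    print('critic loss:', loss)
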