Example #1
    def test_action_mapping(self):
        origin_act = np.array([-1.0, 0.0, 1.0])

        mapped_act = action_mapping(origin_act, 0.0, 1.0)
        self.assertListEqual(list(mapped_act), [0.0, 0.5, 1.0])

        mapped_act = action_mapping(origin_act, -2.0, 2.0)
        self.assertListEqual(list(mapped_act), [-2.0, 0.0, 2.0])

        mapped_act = action_mapping(origin_act, -5.0, 10.0)
        self.assertListEqual(list(mapped_act), [-5.0, 2.5, 10.0])
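
The assertions above pin down what action_mapping does for these inputs: a linear rescaling from the model's [-1.0, 1.0] output range to the environment's [low, high] action range. As an illustration only, inferred from this test rather than taken from the actual parl.utils source (parameter names here are assumptions), a minimal sketch could look like this:

import numpy as np

def action_mapping(model_output_act, low_bound, high_bound):
    # Illustrative sketch inferred from the unit test above; the real
    # parl.utils.action_mapping may differ in details such as input checks.
    # Linearly rescale an action from [-1.0, 1.0] to [low_bound, high_bound].
    assert np.all(model_output_act >= -1.0) and np.all(model_output_act <= 1.0), \
        'expected actions in [-1.0, 1.0]'
    return low_bound + (model_output_act + 1.0) * (high_bound - low_bound) / 2.0

Passing np.array([-1.0, 0.0, 1.0]) with bounds (-5.0, 10.0) returns [-5.0, 2.5, 10.0], matching the last assertion.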
Example #2
def run_evaluate_episode(env, agent, render):
    obs = env.reset()
    total_reward = 0
    episode_goal = np.expand_dims(obs[-3:], axis=0)
    steps = 0
    while steps < MAX_STEPS_PER_EPISODES:
        steps += 1
        batch_obs = np.expand_dims(obs[8:15], axis=0)
        batch_obs_with_goal = np.concatenate((batch_obs, episode_goal), axis=1)
        action = agent.predict(batch_obs_with_goal.astype('float32'))
        action = np.squeeze(action)
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])

        next_obs, reward, done, info = env.step(action)
        if render:
            env.render()
        # print(reward)

        obs = next_obs
        total_reward += reward

        if done:
            break
    return total_reward
Example #3
def evaluate(env, agent, render=False):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            pred_action = agent.predict(batch_obs.astype('float32'))
            pred_action = np.squeeze(pred_action)
            env_action = pred_action[0] + 0.2 * pred_action[1:]
            env_action = np.clip(env_action, -1.0, 1.0)
            env_action = action_mapping(env_action, env.action_space.low[0],
                                        env.action_space.high[0])
            next_obs, reward, done, info = env.step(env_action)

            obs = next_obs
            total_reward += reward
            steps += 1
            if render:
                env.render()
            if done:
                break
        print("Test episode {}, reward:{}".format(i, total_reward))
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
Example #4
File: train.py Project: thunder95/PARL
def evaluate(env, agent):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            pred_action = agent.predict(batch_obs.astype('float32'))
            pred_action = np.squeeze(pred_action)
            env_action = pred_action[0] + 0.2 * pred_action[1:]
            #CHANGE
            #env_action = np.clip(env_action, -1.0, 1.0)
            env_action = np.clip(
                np.random.normal(pred_action, EXPL_NOISE * max_action), -1.0,
                1.0)
            env_action = action_mapping(env_action, env.action_space.low[0],
                                        env.action_space.high[0])
            next_obs, reward, done, info = env.step(env_action)
            obs = next_obs
            total_reward += reward
            steps += 1

            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
Example #5
def evaluate1(env, agent, gm):
    
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            

            action = np.squeeze(action)
            mean_a = action[4]  # Added code to reconstruct the output and keep it stable; same reason as above.
            action = action[0:4]
            action = gm * action + (1 - gm) * mean_a  # Note: gm controls how much the voltages may fluctuate.
            

            action = np.clip(action, -1.0, 1.0)  # Added line to prevent out-of-range errors.
            action = action_mapping(action, env.action_space.low[0], 
                                    env.action_space.high[0])

            next_obs, reward, done, info = env.step(action)

            obs = next_obs
            total_reward += reward
            steps += 1

            if done:
                break
        eval_reward.append(total_reward)
        print("一次评估完成,此时的gm值",gm,"此次的total_reward",total_reward)
    return np.mean(eval_reward)
Example #6
def evaluate(env, agent):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            

            action = np.squeeze(action)
            mean_a = action[4]  # Added code to reconstruct the output and keep it stable; same reason as above.
            action = action[0:4]
            action = GM * action + mean_a  # GM = 0.2, set as a global variable, controls how much the voltages may fluctuate.

            action = np.clip(action, -1.0, 1.0)  # Added line to prevent out-of-range errors.
            action = action_mapping(action, env.action_space.low[0], 
                                    env.action_space.high[0])

            next_obs, reward, done, info = env.step(action)

            obs = next_obs
            total_reward += reward
            steps += 1

            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
Example #7
def evaluate(env, agent):
    eval_reward = []
    for i in range(3):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            action = np.squeeze(action)
            main_action = action[0]
            sub_action = action[1:]
            # sub_action = np.random.normal(sub_action, 0.01)
            action = [main_action+0.2*x for x in sub_action]
            action = np.clip(action, -1.0, 1.0)
            action = action_mapping(action, env.action_space.low[0], 
                                    env.action_space.high[0])

            next_obs, reward, done, info = env.step(action)

            obs = next_obs
            total_reward += reward
            steps += 1

            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
Example #8
def run_episode(env, agent, rpm):
    total_reward, steps = 0, 0
    obs = env.reset()

    while True:
        steps += 1
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs.astype("float32"))
        action = np.squeeze(action)

        # Add Gaussian noise and clip to [-1.0, 1.0]
        action = np.clip(np.random.normal(action, 1.0), -1.0, 1.0)
        # Map the action to the corresponding voltage range
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])

        # Pull the four voltage values toward each other to deliberately guide training toward convergence
        means = np.mean(action)
        action = action + gamma * (means - action)

        next_obs, reward, done, info = env.step(action)
        rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done)

        if rpm.size() > MEMORY_WARMUP_SIZE:
            batch_obs, batch_action, batch_reward, batch_next_obs, \
                batch_terminal = rpm.sample_batch(BATCH_SIZE)
            critic_cost = agent.learn(batch_obs, batch_action, batch_reward,
                                      batch_next_obs, batch_terminal)

        obs = next_obs
        total_reward += reward

        if done:
            break
    return total_reward, steps
Example #9
def run_episode(env, agent, rpm, render=False):
    step = 0
    total_reward = 0
    obs = env.reset()
    while True:
        step += 1
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs)
        action = np.squeeze(action)
        action = np.random.normal(action, 1.0)
        action = np.clip(action, -1.0, 1.0)
        actuall = action
        actuall = action_mapping(actuall, env.action_space.low[0],
                                 env.action_space.high[0])
        next_obs, reward, done, info = env.step(actuall)
        vx_1 = abs(info['b_v_x'] - info['next_target_g_v_x'])
        vy_1 = abs(info['b_v_y'] - info['next_target_g_v_y'])
        vz_1 = abs(info['b_v_z'] - info['next_target_g_v_z'])
        vx_2 = pow(info['b_v_x'] - info['next_target_g_v_x'], 2)
        vy_2 = pow(info['b_v_y'] - info['next_target_g_v_y'], 2)
        vz_2 = pow(info['b_v_z'] - info['next_target_g_v_z'], 2)
        reward_adept = -0.1 * (vx_1 + vy_1 + vz_1 + vx_2 + vy_2 + vz_2)
        rpm.append(obs, action, Reward_Scale * reward_adept, next_obs, done)
        if rpm.size() > Memory_Warm_Up:
            batch_obs, batch_act, batch_reward, batch_next_obs, batch_done = rpm.sample_batch(
                Batch_Size)
            C_cost = agent.learn(batch_obs, batch_act, batch_reward,
                                 batch_next_obs, batch_done)
        obs = next_obs
        total_reward += reward_adept
        if render:
            env.render()
        if done:
            break
    return step, total_reward
Example #10
def run_train_episode(env, agent, rpm, reward_scale, warmup_size, batch_size,
                      expl_noise):
    obs = env.reset()
    total_reward, steps = 0, 0
    critic_cost, actor_cost = 0, 0
    low_act, high_act = env.action_space.low[0], env.action_space.high[0]
    while True:
        steps += 1
        batch_obs = np.expand_dims(obs, axis=0)
        if rpm.size() < warmup_size:
            action = env.action_space.sample()
        elif hasattr(agent, "sample_program"):
            action = agent.sample(batch_obs.astype('float32'))
        else:
            action = agent.predict(batch_obs.astype('float32'))
            action = np.clip(np.random.normal(action, high_act * expl_noise),
                             -high_act, high_act)
        action = np.clip(action / high_act, -1.0, 1.0)
        action = action_mapping(action, low_act, high_act)
        next_obs, reward, done, info = env.step(action)
        rpm.append(obs, action, reward_scale * reward, next_obs, done)

        if rpm.size() > warmup_size:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = rpm.sample_batch(
                batch_size)
            critic_cost, actor_cost = agent.learn(batch_obs, batch_action,
                                                  batch_reward, batch_next_obs,
                                                  batch_terminal)

        obs = next_obs
        total_reward += reward

        if done:
            break
    return total_reward, steps, critic_cost, actor_cost
Example #11
def evaluate_episode(env, agent, render=False):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward = 0
        target_pose = np.expand_dims(obs[-3:], axis=0)

        for i in range(EPISODE_LENGTH):
            batch_obs = np.expand_dims(obs[8:15], axis=0)
            batch_obs_full = np.concatenate((batch_obs, target_pose), axis=1)
            action = agent.predict(batch_obs_full.astype('float32'))

            # Add gripper action again
            action = np.append(action, 0)

            action = np.squeeze(action)
            action = action_mapping(action, env.action_space.low[0],
                                    env.action_space.high[0])

            next_obs, reward, done, info = env.step(action)

            obs = next_obs
            total_reward += reward

            if render:
                env.render()

            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
Example #12
def run_episode(env, agent, rpm):
    obs = env.reset()
    total_reward, steps = 0, 0
    while True:
        steps += 1
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs.astype('float32'))
        action = np.squeeze(action)

        # Add exploration noise, and clip to [-1.0, 1.0]
        action = np.clip(np.random.normal(action, 1.0), -1.0, 1.0)
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])

        next_obs, reward, done, info = env.step(action)
        rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done)

        if rpm.size() > MEMORY_WARMUP_SIZE:
            batch_obs, batch_action, batch_reward, batch_next_obs, \
                    batch_terminal = rpm.sample_batch(BATCH_SIZE)
            critic_cost = agent.learn(batch_obs, batch_action, batch_reward,
                                      batch_next_obs, batch_terminal)

        obs = next_obs
        total_reward += reward

        if done:
            break
    return total_reward, steps
Example #13
def evaluate(env, agent):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            if obs.shape[0] == 19:
                # yaw = obs[14]
                # pitch = obs[12]
                # roll = obs[13]
                next_target_g_v_x = obs[16]
                next_target_g_v_y = obs[17]
                next_target_g_v_z = obs[18]
                # r_matrix = get_rotation_matrix(yaw, pitch, roll)
                r_matrix = env.simulator.get_coordination_converter_to_body()
                next_expected_v = np.squeeze(
                    np.matmul(
                        r_matrix,
                        np.array([[next_target_g_v_x], [next_target_g_v_y],
                                  [next_target_g_v_z]],
                                 dtype="float32")))
                obs = np.append(obs, next_expected_v)  # extend the obs
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            action = np.clip(action, -1.0, 1.0)
            action = np.squeeze(action)
            # action_main = action[0]
            # action_diff = action[1:] * OFFSET_SCALAR
            # action_new = action_diff + action_main
            # action_new = np.clip(action_new, -1.0, 1.0)
            action = action_mapping(action, env.action_space.low[0],
                                    env.action_space.high[0])

            next_obs, reward, done, info = env.step(action)

            # yaw = next_obs[14]
            # pitch = next_obs[12]
            # roll = next_obs[13]
            next_target_g_v_x = next_obs[16]
            next_target_g_v_y = next_obs[17]
            next_target_g_v_z = next_obs[18]
            # r_matrix = get_rotation_matrix(yaw, pitch, roll)
            r_matrix = env.simulator.get_coordination_converter_to_body()
            next_expected_v = np.squeeze(
                np.matmul(
                    r_matrix,
                    np.array([[next_target_g_v_x], [next_target_g_v_y],
                              [next_target_g_v_z]],
                             dtype="float32")))
            next_obs = np.append(next_obs, next_expected_v)  # extend the obs

            obs = next_obs
            total_reward += reward
            steps += 1

            if done:
                break
            env.render()
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
Example #14
def evaluate(env, agent):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            action = np.squeeze(action)
            action_four = action[0] + 0.2 * action[1:]

            action_four = np.clip(action_four, -1.0, 1.0)
            action_four = action_mapping(action_four, env.action_space.low[0],
                                         env.action_space.high[0])

            next_obs, reward, done, info = env.step(action_four)

            obs = next_obs
            total_reward += reward
            steps += 1

            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
Example #15
def run_episode(env, agent, rpm):
    obs = env.reset()
    total_reward, steps = 0, 0
    while True:
        steps += 1
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs.astype('float32'))
        action = np.squeeze(action)

        # Add exploration noise to the action and keep it within [-1.0, 1.0]
        action = np.random.normal(action, 1.0)
        action = np.clip(action, -1.0, 1.0)
        # Map the action to the actual action range; action_mapping is imported from parl.utils
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])

        next_obs, reward, done, info = env.step(action)
        rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done)

        if rpm.size() > MEMORY_WARMUP_SIZE:
            batch_obs, batch_action, batch_reward, batch_next_obs, \
                    batch_terminal = rpm.sample_batch(BATCH_SIZE)
            critic_cost = agent.learn(batch_obs, batch_action, batch_reward,
                                      batch_next_obs, batch_terminal)

        obs = next_obs
        total_reward += reward

        if done:
            break
    return total_reward, steps
Example #16
def run_train_episode(env, agent, scaler):
    obs = env.reset()
    observes, actions, rewards, unscaled_obs = [], [], [], []
    step = 0.0
    scale, offset = scaler.get()
    scale[-1] = 1.0  # don't scale time step feature
    offset[-1] = 0.0  # don't offset time step feature
    while True:
        obs = obs.reshape((1, -1))
        obs = np.append(obs, [[step]], axis=1)  # add time step feature
        unscaled_obs.append(obs)
        obs = (obs - offset) * scale  # center and scale observations
        obs = obs.astype('float32')
        observes.append(obs)

        action = agent.policy_sample(obs)
        action = np.clip(action, -1.0, 1.0)
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])

        action = action.reshape((1, -1)).astype('float32')
        env.render()
        actions.append(action)

        obs, reward, done, _ = env.step(np.squeeze(action))
        rewards.append(reward)
        step += 1e-3  # increment time step feature

        if done:
            break

    return (np.concatenate(observes), np.concatenate(actions),
            np.array(rewards, dtype='float32'), np.concatenate(unscaled_obs))
Example #17
def evaluate_episode(env, agent, render=False):
    total_reward = []
    env_reward = []
    for j in range(5):
        obs = env.reset()
        c_r = 0
        d_r = 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs)
            action = np.squeeze(action)
            action = np.clip(action, -1.0, 1.0)
            actuall = action
            actuall = action_mapping(actuall, env.action_space.low[0],
                                     env.action_space.high[0])
            next_obs, reward, done, info = env.step(actuall)
            obs = next_obs
            vx = pow(info['b_v_x'] - info['next_target_g_v_x'], 2)
            vy = pow(info['b_v_y'] - info['next_target_g_v_y'], 2)
            vz = pow(info['b_v_z'] - info['next_target_g_v_z'], 2)
            reward_adept = 0.01 * (vx + vy + vz)
            d_r += reward
            reward -= reward_adept
            c_r += reward
            if render:
                env.render()
            if done:
                break
        total_reward.append(c_r)
        env_reward.append(d_r)
    total_reward.append(np.mean(total_reward))
    env_reward.append(np.mean(env_reward))
    return total_reward, env_reward
Example #18
File: PPGD.py Project: QFaceblue/quadrotor
def evaluate(env, agent):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            # print("before:",action)
            action = np.squeeze(action)

            action = np.array([
                action[0] + 0.1 * action[1], action[0] + 0.1 * action[2],
                action[0] + 0.1 * action[3], action[0] + 0.1 * action[4]
            ])
            # print(action)
            action = np.clip(action, -1.0, 1.0)
            action = action_mapping(action, env.action_space.low[0],
                                    env.action_space.high[0])

            next_obs, reward, done, info = env.step(action)

            obs = next_obs
            total_reward += reward
            steps += 1
            env.render()
            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
Example #19
def run_evaluate_episode(env, agent, scaler):
    obs = env.reset()
    rewards = []
    step = 0.0
    scale, offset = scaler.get()
    scale[-1] = 1.0  # don't scale time step feature
    offset[-1] = 0.0  # don't offset time step feature
    while True:
        obs = obs.reshape((1, -1))
        obs = np.append(obs, [[step]], axis=1)  # add time step feature
        obs = (obs - offset) * scale  # center and scale observations
        obs = obs.astype('float32')

        action = agent.policy_predict(obs)
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])

        obs, reward, done, _ = env.step(np.squeeze(action))
        env.render()
        rewards.append(reward)

        step += 1e-3  # increment time step feature

        if done:
            break
    return np.sum(rewards)
Example #20
def run_episode(env, agent, rpm, batch_size=64):
    obs = env.reset()
    total_reward, steps, a_loss, c_loss = 0, 0, 0, 0
    while True:
        steps += 1
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs.astype('float32'))
        action = np.squeeze(action)
        action = np.clip(np.random.normal(action, EXPL_NOISE), -1.0, 1.0)
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])

        next_obs, reward, done, info = env.step(action)
        rpm.append(obs, action, reward, next_obs, done)

        if rpm.size() > WARMUP_SIZE:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = rpm.sample_batch(
                batch_size)
            a_loss, c_loss = agent.learn(batch_obs, batch_action, batch_reward,
                                         batch_next_obs, batch_terminal)

        obs = next_obs
        total_reward += reward

        if done:
            break
    return total_reward, steps, a_loss, c_loss
Example #21
def evaluate(env, agent, render=True):
    eval_reward = []
    for i in range(1):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            action = np.squeeze(action)
            action = np.clip(action, -1.0, 1.0)
            action = action_mapping(action, env.action_space.low[0],
                                    env.action_space.high[0])
            new_action = [0] * (action.shape[0] - 1)
            for i in range(len(new_action)):
                new_action[i] = action[0] + 0.3 * action[i + 1]
            new_action = np.array(new_action)
            #new_action = action[0] + 0.3*action[1:]
            next_obs, reward, done, info = env.step(new_action)

            obs = next_obs
            total_reward += reward
            steps += 1
            if render:
                env.render()
            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
Example #22
def evaluate(env, agent, render=False):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            action = np.squeeze(action)
            action = np.clip(action, -1.0, 1.0)  ## special
            action = action_mapping(action, env.action_space.low[0],
                                    env.action_space.high[0])
            # action = np.clip(action, -1.0, 1.0) ## special

            next_obs, reward, done, info = env.step(action)

            obs = next_obs
            total_reward += reward
            steps += 1

            if render:
                env.render()

            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
Example #23
def evaluate(env, agent):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs)
            action = np.clip(action, -1.0, 1.0)
            action = np.squeeze(action)
            action = action_mapping(action, env.action_space.low[0],
                                    env.action_space.high[0])

            means = np.mean(action)
            action = action + gamma * (means - action)

            next_obs, reward, done, info = env.step(action)

            obs = next_obs
            total_reward += reward
            steps += 1

            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
Example #24
def evaluate(env, agent):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            #action = [actions[0] * 0.8 + actions[1] * 0.2, actions[0] * 0.8 + actions[2] * 0.2,
            #          actions[0] * 0.8 + actions[3] * 0.2, actions[0] * 0.8 + actions[4] * 0.2]
            # action = np.squeeze(action)
            # print("============================",action )
            action = np.clip(action, -1.0, 1.0)
            action = action_mapping(action, env.action_space.low[0], env.action_space.high[0])

            next_obs, reward, done, info = env.step(action)

            obs = next_obs
            total_reward += reward
            steps += 1

            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
Example #25
def run_episode(env, agent, rpm):
    obs = env.reset()
    total_reward, steps = 0, 0
    while True:
        steps += 1
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs.astype('float32'))
        # Add exploration noise to the action
        action = np.random.normal(action, 1.0)
        action = np.squeeze(action)

        # Compress the 5 action outputs into 4: the first output is a shared
        # base value and the remaining four are small per-rotor offsets.
        action_4 = action[0] + 0.1 * action[1:5]
        action_4 = np.clip(action_4, -1.0, 1.0)


        # Map the action to the actual action range; action_mapping is imported from parl.utils
        action_4 = action_mapping(action_4, env.action_space.low[0],
                                  env.action_space.high[0])

        next_obs, reward, done, info = env.step(action_4)
        rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done)

        if rpm.size() > MEMORY_WARMUP_SIZE:
            batch_obs, batch_action, batch_reward, batch_next_obs, \
            batch_terminal = rpm.sample_batch(BATCH_SIZE)
            critic_cost = agent.learn(batch_obs, batch_action, batch_reward,
                                      batch_next_obs, batch_terminal)

        obs = next_obs
        total_reward += reward

        if done:
            break
    return total_reward, steps
Example #26
def evaluate(env, agent, render=False):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            action = np.squeeze(action)

            # Compress the 5 action outputs into 4: the first output is a shared
            # base value and the remaining four are small per-rotor offsets.
            action_4 = action[0] + 0.1 * action[1:5]
            action_4 = np.clip(action_4, -1.0, 1.0)


            action_4 = action_mapping(action_4, env.action_space.low[0],
                                      env.action_space.high[0])

            next_obs, reward, done, info = env.step(action_4)

            obs = next_obs
            total_reward += reward
            steps += 1

            if render:
                env.render()

            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
Example #27
def run_evaluate_episode(env, agent):
    obs = env.reset()
    total_reward = 0
    while True:
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs.astype('float32'))
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])

        next_obs, reward, done, info = env.step(action)

        obs = next_obs
        total_reward += reward

        if done:
            break
    return total_reward
Example #28
def evaluate_episode(env, render=False):
    env_reward = []
    for j in range(5):
        env.reset()
        d_r = 0
        while True:
            actuall = np.array([-1, -1, -1, -1], dtype='float32')
            actuall = action_mapping(actuall, env.action_space.low[0],
                                     env.action_space.high[0])
            next_obs, reward, done, info = env.step(actuall)
            d_r += reward
            if render:
                env.render()
            if done:
                break
        env_reward.append(d_r)
    env_reward.append(np.mean(env_reward))
    return env_reward
Example #29
def run_episode(env, agent, rpm, render=False):
    obs = env.reset()
    total_reward, steps = 0, 0
    while True:
        steps += 1
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs.astype('float32'))
        action = np.squeeze(action)

        # Add exploration noise to the action and keep it within [-1.0, 1.0]
        action = np.clip(np.random.normal(action, NOISE), -1.0, 1.0)

        # action_tmp = action[0] +action[-(len(action)-1):,] * OFFSET_FACTOR;
        # action = np.append(action[0],action_tmp)
        # Map the action to the actual action range; action_mapping is imported from parl.utils
        # action = action_mapping(action, env.action_space.low[0],
        #                         env.action_space.high[0])
        # next_obs, reward, done, info = env.step(action)

        main_action = action[0]
        sub_action = action[1:]
        sub_action = main_action + sub_action * OFFSET_FACTOR
        sub_action = np.clip(sub_action, -1.0, 1.0)
        sub_action = action_mapping(sub_action, env.action_space.low[0],
                                    env.action_space.high[0])
        next_obs, reward, done, info = env.step(sub_action)

        rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done)

        if rpm.size() > MEMORY_WARMUP_SIZE:
            batch_obs, batch_action, batch_reward, batch_next_obs, \
                    batch_terminal = rpm.sample_batch(BATCH_SIZE)
            critic_cost = agent.learn(batch_obs, batch_action, batch_reward,
                                      batch_next_obs, batch_terminal)

        obs = next_obs
        total_reward += reward

        if render:
            env.render()

        if done:
            break
    return total_reward, steps
Example #30
def evaluate(env, agent, render=False):
    obs = env.reset()
    total_reward = 0
    while True:
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs.astype('float32'))
        action = np.squeeze(action)
        action = np.clip(action, -1.0, 1.0)  ## special
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])

        next_obs, reward, done, info = env.step(action)

        obs = next_obs
        total_reward += reward

        if done:
            break
    return total_reward