Example #1
File: Action.py Project: YuanyeMa/RL
# gym, NumPy, and Matplotlib imports used below; NormalizedEnv and the Action
# agent come from elsewhere in the YuanyeMa/RL project.
import gym
import numpy as np
import matplotlib.pyplot as plt


def test_process(config, steps, target_actor):
    env = NormalizedEnv(gym.make('Pendulum-v0'))
    agent = Action(state_dim=env.observation_space.shape[0],
                   action_dim=env.action_space.shape[0])
    reward_list = []
    try:
        while True:
            # for test
            if (steps.value != 0
                    and steps.value % config.test_every_eposide == 0):
                agent.load_param(target_actor)
                print("test agent load param ")
                et_reward = 0
                for index in range(config.num_eposide_test):
                    episode_reward = 0
                    state = env.reset()
                    # min-max normalize the observation into [0, 1]
                    state = (state - env.observation_space.low) / (
                        env.observation_space.high - env.observation_space.low)

                    while True:
                        action = agent.chose_action(state, explort=False)
                        next_state, reward, done, _ = env.step(action)
                        env.render()
                        next_state = (next_state - env.observation_space.low
                                      ) / (env.observation_space.high -
                                           env.observation_space.low)
                        episode_reward += reward
                        state = next_state
                        if done:
                            break
                    et_reward += episode_reward
                print("\033[93m [ test ] eposide average reward : {}\033[00m".
                      format(et_reward / config.num_eposide_test))
                reward_list.append(et_reward / config.num_eposide_test)

                x = np.arange(len(reward_list))
                y = np.array(reward_list)
                plt.clf()  # start from a clean figure so curves do not pile up across saves
                plt.plot(x, y)
                plt.savefig("./eposide_reward.png")

    except Exception as e:
        print(e)
        print("test process exit")
        env.close()
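
All three examples wrap the raw Gym environment in NormalizedEnv before interacting with it. The wrapper is defined elsewhere in the YuanyeMa/RL project; the sketch below is only a minimal guess at what such an action-normalizing wrapper usually does (rescale agent actions from [-1, 1] to the environment's action bounds), not the project's actual code.

import gym


class NormalizedEnv(gym.ActionWrapper):
    """Hypothetical sketch of an action-normalizing wrapper."""

    def action(self, action):
        low, high = self.action_space.low, self.action_space.high
        # rescale an action in [-1, 1] to [low, high]
        return low + (action + 1.0) * 0.5 * (high - low)

    def reverse_action(self, action):
        low, high = self.action_space.low, self.action_space.high
        # inverse mapping back to [-1, 1]
        return 2.0 * (action - low) / (high - low) - 1.0
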
Example #2
File: test.py Project: YuanyeMa/RL
# gym, NumPy, PyTorch, and Matplotlib imports used below; NormalizedEnv, Actor,
# and to_tensor come from elsewhere in the YuanyeMa/RL project.
import gym
import numpy as np
import torch
import matplotlib.pyplot as plt


def main():
    env = NormalizedEnv(gym.make('Pendulum-v0'))

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    agent = Actor(state_dim, action_dim).to('cuda')

    agent.load_state_dict(torch.load('./Models/78.0_actor.pt'))
    agent.eval()  # evaluation mode for inference

    eposide = 0
    done = False
    eposide_list = []
    while eposide < 100:
        eposide_reward = 0
        state = env.reset()
        state = (state - env.observation_space.low) / (
            env.observation_space.high - env.observation_space.low)
        state = to_tensor(state)
        while not done:
            action = agent(state).detach().cpu().numpy()
            state_, reward, done, _ = env.step(action)
            state_ = (state_ - env.observation_space.low) / (
                env.observation_space.high - env.observation_space.low)
            env.render()
            state = to_tensor(state_)
            eposide_reward += reward

        eposide_list.append(eposide_reward)
        eposide += 1
        done = False
        print("{} : {}".format(eposide, eposide_reward))

    x = np.arange(100)
    y = np.array(eposide_list)
    plt.plot(x, y)
    plt.savefig("./test_eposide_reward.png")

    env.close()
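
Example #2 feeds observations to the CUDA model through a project-local to_tensor helper. Below is a minimal sketch under the assumption that it simply converts a NumPy observation into a float32 tensor on the chosen device; the project's real helper may differ (for instance, it may leave the tensor on the CPU).

import numpy as np
import torch


def to_tensor(array, device='cuda'):
    # hypothetical helper: wrap a NumPy observation as a float32 tensor on the given device
    return torch.from_numpy(np.asarray(array, dtype=np.float32)).to(device)
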
Example #3
# NumPy and a TensorBoard SummaryWriter are used below (the writer may come from
# tensorboardX instead of torch.utils.tensorboard); make_env, NormalizedEnv,
# split_obs, merge_action, Predators, and Preyer are project-local.
import numpy as np
from torch.utils.tensorboard import SummaryWriter


def main(args):

    env = make_env('simple_tag')
    env = NormalizedEnv(env)

    kwargs = dict()
    kwargs['config'] = args

    # simple_tag dimensions: each of the three predators observes 16 features
    # and acts in 2 dimensions; the single prey observes 14 features.
    predator_model = Predators(16, 2, num_agent=3, **kwargs)
    preyer_model = Preyer(14, 2, **kwargs)
    if args.tensorboard:
        writer = SummaryWriter(log_dir='runs/' + args.log_dir)
    episode = 0
    total_step = 0

    while episode < args.max_episodes:

        state = env.reset()
        episode += 1
        step = 0
        predator_accum_reward = []
        preyer_accum_reward = 0

        while True:
            state_predator, state_preyer = split_obs(state)

            predator_model.prep_eval()
            action_predator = predator_model.choose_action(state_predator)
            action_preyer = preyer_model.random_action()
            #action_preyer = preyer_model.choose_action(state_preyer)

            action = merge_action(action_predator, action_preyer)

            next_state, reward, done, info = env.step(action)
            step += 1
            total_step += 1

            predator_accum_reward.append(np.mean(reward[:3]))
            preyer_accum_reward = reward[3]

            if step > args.episode_length:
                done = [True] * 4  # force the episode to end at the step limit

            if args.render and (episode % 10 == 1):
                env.render(mode='rgb_array')

            predator_model.memory(state[:3], action[:3], reward[:3],
                                  next_state[:3], done[:3])
            # preyer_model.memory(state[3], action[3], reward[3], next_state[3], done[3])

            if (len(predator_model.replay_buffer) >= args.batch_size
                    and total_step % args.steps_per_update == 0):
                predator_model.prep_train()
                predator_model.train()
                # preyer_model.train()

            if any(done):
                predator_c_loss, predator_a_loss = predator_model.getLoss()
                preyer_c_loss, preyer_a_loss = preyer_model.getLoss()
                print("[Episode %05d] reward_predator %3.1f reward_preyer %3.1f predator_c_loss %3.1f predator_a_loss %3.1f preyer_c_loss %3.1f preyer_a_loss %3.1f" % \
                      (episode, np.mean(predator_accum_reward).item(), preyer_accum_reward, predator_c_loss, predator_a_loss, preyer_c_loss, preyer_a_loss))
                if args.tensorboard:
                    # writer.add_scalar(tag='debug/memory_length', global_step=episode, scalar_value=len(predator_model.replay_buffer))
                    # writer.add_scalar(tag='debug/predator_epsilon', global_step=episode, scalar_value=predator_model.epsilon)
                    # writer.add_scalar(tag='debug/preyer_epsilon', global_step=episode, scalar_value=preyer_model.epsilon)
                    writer.add_scalar(
                        tag='agent/reward_predator',
                        global_step=episode,
                        scalar_value=np.mean(predator_accum_reward).item())
                    # writer.add_scalar(tag='perf/reward_preyer', global_step=episode, scalar_value=preyer_accum_reward)
                    if predator_c_loss and predator_a_loss:
                        writer.add_scalars('agent/predator_loss',
                                           global_step=episode,
                                           tag_scalar_dict={
                                               'actor': -predator_a_loss,
                                               'critic': predator_c_loss
                                           })
                    # writer.add_scalar(tag='loss/preyer_c_loss', global_step=episode, scalar_value=preyer_c_loss)
                    # writer.add_scalar(tag='loss/preyer_a_loss', global_step=episode, scalar_value=preyer_a_loss)

                predator_model.reset()
                preyer_model.reset()
                break

            state = next_state
    if args.tensorboard:
        writer.close()
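
Example #3 relies on the project-local helpers split_obs and merge_action to convert between the per-agent lists used by the simple_tag environment (three predators followed by one prey) and the two models. Below is a minimal sketch under that ordering assumption; the names and shapes are guesses, not the project's actual code.

def split_obs(state):
    # hypothetical helper: the first three observations belong to the predators,
    # the last one to the prey
    return state[:3], state[3]


def merge_action(action_predator, action_preyer):
    # hypothetical helper: rebuild the flat per-agent action list expected by env.step
    return list(action_predator) + [action_preyer]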