import numpy as np
import torch

# LunarLander, Policy and Network are the project's own modules (not shown here);
# the defaults below refer to module-level model_type and model_path settings.
def eval(model_type=model_type, model_path=model_path):
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    env = LunarLander()

    if model_type == 'policy':
        model = Policy(env.observation_dim, env.action_dim)
    elif model_type == 'dqn':
        model = Network(env.observation_dim, env.action_dim)
    model.to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    episodes = 50
    wins = 0
    frames = []
    fuel_left = []
    for i in range(episodes):
        if i % 10 == 0:
            print(f"On episode {i}")
        frame_count = 0

        env.reset()
        state = env.get_state()
        while True:
            frame_count += 1

            # Greedy action from the network; no gradient tracking is needed during evaluation
            with torch.no_grad():
                action = model(
                    torch.tensor(state, dtype=torch.float32,
                                 device=device).unsqueeze(0)).argmax()

            state, reward, done = env.step(action)

            if done:
                if env.won:
                    wins += 1
                    frames.append(frame_count)
                    fuel_left.append(env.rocket.fuel)
                break
    # Close the environment once all evaluation episodes are finished
    env.close()

    if wins > 0:
        print(f"wins: {wins}")
        print(f"mean frames on wins {np.mean(frames)}")
        print(f"std frames on wins {np.std(frames, ddof=1)}")
        print(f"min frames on wins {np.min(frames)}")
        print(f"max frames on wins {np.max(frames)}")

        print(f"mean fuel on wins {np.mean(fuel_left)}")
        print(f"std fuel on wins {np.std(fuel_left, ddof=1)}")
        print(f"min fuel on wins {np.min(fuel_left)}")
        print(f"max fuel on wins {np.max(fuel_left)}")
    else:
        print("The model had 0 wins. Statistics can't be calculated")
Example #2
import pygame
import torch

# LunarLander, Policy and Network are the project's own modules (not shown here);
# model_type and model_path are assumed to be set at module level, as in the previous example.
env = LunarLander()
env.reset()
exit_program = False

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

if model_type == 'policy':
    model = Policy(env.observation_dim, env.action_dim)
elif model_type == 'dqn':
    model = Network(env.observation_dim, env.action_dim)
model.to(device)
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()
state = env.get_state()

while not exit_program:
    env.render()
    # Greedy action from the network; no gradients are needed while playing
    with torch.no_grad():
        action = model(
            torch.tensor(state, dtype=torch.float32,
                         device=device).unsqueeze(0)).argmax()

    state, reward, done = env.step(action)

    # Process game events
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            exit_program = True
        if event.type == pygame.KEYDOWN:
            pass  # key handling is truncated in the original snippet
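The event loop above is cut off in the source. A minimal sketch of how it might continue, assuming Escape ends the program and a finished episode is restarted; both the key binding and the reset logic are assumptions, not part of the original code:

        if event.type == pygame.KEYDOWN:
            if event.key == pygame.K_ESCAPE:  # hypothetical key binding
                exit_program = True

    if done:  # hypothetical: begin a new episode after landing or crashing
        env.reset()
        state = env.get_state()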
Example #3
import random

import gym
import torch
import torch.nn as nn

# Policy is the project's own network class (not shown here).
class Learner:
    def __init__(self, learning_rate=0.01, FILE="Model/goodPolicy.pth"):
        self.FILE = FILE
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.policy = Policy().to(self.device)
        self.policy.load_state_dict(torch.load(self.FILE))
        self.policy.eval()
        self.criterion = nn.CrossEntropyLoss()
        self.learning_rate = learning_rate
        self.optimizer = torch.optim.Adam(self.policy.parameters(),
                                          lr=self.learning_rate)

    def simulate(self, episode: int, policyPercent: float, show=False):
        """
        Simulate the CartPole environment.
        :param episode: number of episodes to simulate
        :param policyPercent: fraction of steps on which the trained policy (rather than a random action) is used
        :param show: render the environment while simulating
        :return: list of ([trajectory of actions], [trajectory of observations], totalReward)
        """
        env = gym.make('CartPole-v0')
        result = []
        for i_episode in range(episode):
            actions = []
            observations = []
            totalReward = 500  # if not failed
            observation = env.reset()
            for t in range(500):
                if show: env.render()
                # Convert the observation to a float32 tensor on the target device
                observationTensor = torch.as_tensor(observation,
                                                    dtype=torch.float32,
                                                    device=self.device)
                observations.append(observation.tolist())
                if random.random() <= policyPercent:  # mix the policy with random exploration
                    with torch.no_grad():
                        action = torch.max(self.policy(observationTensor),
                                           0)[1].item()  # 0 or 1
                else:
                    action = random.randint(0, 1)
                actions.append(action)
                observation, reward, done, info = env.step(action)
                if done:
                    totalReward = t + 1
                    # print(f"Episode finished after {t + 1} timesteps")
                    break
            result.append((actions, observations, totalReward))
        env.close()
        return result

    def trainPolicy(self, episodes, policyPercent=0.8):
        """ Train the policy """
        # First play several times to determine the average reward.
        trajectoriesForAvgRwd = self.simulate(20, 1)
        averageReward = sum(i[2] for i in trajectoriesForAvgRwd) / len(trajectoriesForAvgRwd)
        print(averageReward)

        trajectoriesForTrain = self.simulate(episodes, policyPercent)
        for trainTrajectory in trajectoriesForTrain:
            if trainTrajectory[2] > averageReward:
                # forward
                predictAction = self.policy(
                    torch.tensor(trainTrajectory[1]).to(self.device))
                loss = self.criterion(
                    predictAction,
                    torch.tensor(trainTrajectory[0]).to(self.device))

                # backwards
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
        torch.save(self.policy.state_dict(), self.FILE)
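A short usage sketch for the class above, assuming a checkpoint already exists at the default FILE path; the episode counts are arbitrary illustrations:

if __name__ == '__main__':
    learner = Learner(learning_rate=0.01)
    # Roll out the current policy greedily for a few episodes
    rollouts = learner.simulate(episode=5, policyPercent=1.0)
    print([r[2] for r in rollouts])  # total reward per episode
    # Fine-tune on trajectories that beat the running average reward
    learner.trainPolicy(episodes=50, policyPercent=0.8)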