import numpy as np
import torch

# LunarLander, Policy and Network are the project's own modules (not shown here);
# the defaults below refer to module-level model_type and model_path settings.
def eval(model_type=model_type, model_path=model_path):
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    env = LunarLander()

    if model_type == 'policy':
        model = Policy(env.observation_dim, env.action_dim)
    elif model_type == 'dqn':
        model = Network(env.observation_dim, env.action_dim)
    model.to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    episodes = 50
    wins = 0
    frames = []
    fuel_left = []
    for i in range(episodes):
        if i % 10 == 0:
            print(f"On episode {i}")
        frame_count = 0

        env.reset()
        state = env.get_state()
        while True:
            frame_count += 1

            # Greedy action from the network; no gradient tracking is needed during evaluation
            with torch.no_grad():
                action = model(
                    torch.tensor(state, dtype=torch.float32,
                                 device=device).unsqueeze(0)).argmax()

            state, reward, done = env.step(action)

            if done:
                if env.won:
                    wins += 1
                    frames.append(frame_count)
                    fuel_left.append(env.rocket.fuel)
                break
    # Close the environment once all evaluation episodes are finished
    env.close()

    if wins > 0:
        print(f"wins: {wins}")
        print(f"mean frames on wins {np.mean(frames)}")
        print(f"std frames on wins {np.std(frames, ddof=1)}")
        print(f"min frames on wins {np.min(frames)}")
        print(f"max frames on wins {np.max(frames)}")

        print(f"mean fuel on wins {np.mean(fuel_left)}")
        print(f"std fuel on wins {np.std(fuel_left, ddof=1)}")
        print(f"min fuel on wins {np.min(fuel_left)}")
        print(f"max fuel on wins {np.max(fuel_left)}")
    else:
        print("The model had 0 wins. Statistics can't be calculated")
Example #2
import pygame
import torch

# LunarLander, Policy and Network are the project's own modules (not shown here);
# model_type and model_path are assumed to be set at module level, as in the previous example.
env = LunarLander()
env.reset()
exit_program = False

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

if model_type == 'policy':
    model = Policy(env.observation_dim, env.action_dim)
elif model_type == 'dqn':
    model = Network(env.observation_dim, env.action_dim)
model.to(device)
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()
state = env.get_state()

while not exit_program:
    env.render()
    # Greedy action from the network; no gradients are needed while playing
    with torch.no_grad():
        action = model(
            torch.tensor(state, dtype=torch.float32,
                         device=device).unsqueeze(0)).argmax()

    state, reward, done = env.step(action)

    # Process game events
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            exit_program = True
        if event.type == pygame.KEYDOWN:
            pass  # key handling is truncated in the original snippet
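The event loop above is cut off in the source. A minimal sketch of how it might continue, assuming Escape ends the program and a finished episode is restarted; both the key binding and the reset logic are assumptions, not part of the original code:

        if event.type == pygame.KEYDOWN:
            if event.key == pygame.K_ESCAPE:  # hypothetical key binding
                exit_program = True

    if done:  # hypothetical: begin a new episode after landing or crashing
        env.reset()
        state = env.get_state()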
Example #3
import random

import gym
import torch
import torch.nn as nn

# Policy is the project's own network class (not shown here).
class Learner:
    def __init__(self, learning_rate=0.01, FILE="Model/goodPolicy.pth"):
        self.FILE = FILE
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.policy = Policy().to(self.device)
        self.policy.load_state_dict(torch.load(self.FILE))
        self.policy.eval()
        self.criterion = nn.CrossEntropyLoss()
        self.learning_rate = learning_rate
        self.optimizer = torch.optim.Adam(self.policy.parameters(),
                                          lr=self.learning_rate)

    def simulate(self, episode: int, policyPercent: float, show=False):
        """
        Simulate the CartPole environment.
        :param episode: number of episodes to simulate
        :param policyPercent: fraction of steps on which the trained policy (rather than a random action) is used
        :param show: render the environment while simulating
        :return: list of ([trajectory of actions], [trajectory of observations], totalReward)
        """
        env = gym.make('CartPole-v0')
        result = []
        for i_episode in range(episode):
            actions = []
            observations = []
            totalReward = 500  # if not failed
            observation = env.reset()
            for t in range(500):
                if show: env.render()
                # Convert the observation to a float32 tensor on the target device
                observationTensor = torch.as_tensor(observation,
                                                    dtype=torch.float32,
                                                    device=self.device)
                observations.append(observation.tolist())
                if random.random() <= policyPercent:  # mix the policy with random exploration
                    with torch.no_grad():
                        action = torch.max(self.policy(observationTensor),
                                           0)[1].item()  # 0 or 1
                else:
                    action = random.randint(0, 1)
                actions.append(action)
                observation, reward, done, info = env.step(action)
                if done:
                    totalReward = t + 1
                    # print(f"Episode finished after {t + 1} timesteps")
                    break
            result.append((actions, observations, totalReward))
        env.close()
        return result

    def trainPolicy(self, episodes, policyPercent=0.8):
        """ Train the policy """
        # First play several times to determine the average reward.
        trajectoriesForAvgRwd = self.simulate(20, 1)
        averageReward = sum(i[2] for i in trajectoriesForAvgRwd) / len(trajectoriesForAvgRwd)
        print(averageReward)

        trajectoriesForTrain = self.simulate(episodes, policyPercent)
        for trainTrajectory in trajectoriesForTrain:
            if trainTrajectory[2] > averageReward:
                # forward
                predictAction = self.policy(
                    torch.tensor(trainTrajectory[1]).to(self.device))
                loss = self.criterion(
                    predictAction,
                    torch.tensor(trainTrajectory[0]).to(self.device))

                # backwards
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
        torch.save(self.policy.state_dict(), self.FILE)
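A short usage sketch for the class above, assuming a checkpoint already exists at the default FILE path; the episode counts are arbitrary illustrations:

if __name__ == '__main__':
    learner = Learner(learning_rate=0.01)
    # Roll out the current policy greedily for a few episodes
    rollouts = learner.simulate(episode=5, policyPercent=1.0)
    print([r[2] for r in rollouts])  # total reward per episode
    # Fine-tune on trajectories that beat the running average reward
    learner.trainPolicy(episodes=50, policyPercent=0.8)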