Example #1
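All three examples below assume the standard imports plus a project-local policies module that provides the QPolicy class; a minimal header might look like this:

import gym
import numpy as np

import policies  # project-local module providing the QPolicy class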
def rollout(env: gym.Env,
            policies: policies.QPolicy,
            episodes: int,
            temp: float,
            render: bool = False):
    """
    Simulates trajectories for the given number of episodes. Input policy is used to sample actions at each time step

    :param env: the gym environment
    :param policies: The policy used to sample actions (Tabular/DQN) 
    :param episodes: Number of episodes to be simulated
    :param epsilon: The exploration parameter for epsilon-greedy policy
    :param gamma: Discount factor
    :param render: If True, render the environment
    
    :return replay: Collection of (state, action, reward, next_state, done) at each timestep of all simulated episodes
    :return scores: Collection of total reward for each simulated episode  
    """
    replay = []
    scores = []
    for itrnum in range(episodes):
        state = env.reset()
        step = 0
        score = 0
        done = False
        while not done:
            if render:
                env.render()
            pi = policies(state, temp)
            # Sample an action from the distribution pi returned by the
            # policy (this environment has three discrete actions).
            action = np.random.choice([0, 1, 2], p=pi)
            next_state, reward, done, _ = env.step(action)
            score += reward
            replay.append((state, action, reward, next_state, done))
            state = next_state
            step += 1

        env.close()
        scores.append(score)

    return replay, scores
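A minimal usage sketch for this rollout, assuming MountainCar-v0 (three discrete actions) and a stand-in uniform policy in place of a trained policies.QPolicy:

env = gym.make('MountainCar-v0')

def uniform_policy(state, temp):
    # Stand-in for a trained QPolicy: ignores the state and temperature
    # and returns a uniform distribution over the environment's actions.
    return np.ones(env.action_space.n) / env.action_space.n

replay, scores = rollout(env, uniform_policy, episodes=10, temp=1.0)
print('mean score:', np.mean(scores))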
Example #2
def rollout(env: gym.Env, policies: policies.QPolicy, episodes: int, epsilon: float, render: bool = False):
    """
    Simulates trajectories for the given number of episodes. Input policy is used to sample actions at each time step

    :param env: the gym environment
    :param policies: The policy used to sample actions (Tabular/DQN) 
    :param episodes: Number of episodes to be simulated
    :param epsilon: The exploration parameter for epsilon-greedy policy
    :param gamma: Discount factor
    :param render: If True, render the environment
    
    :return replay: Collection of (state, action, reward, next_state, done) at each timestep of all simulated episodes
    :return scores: Collection of total reward for each simulated episode  
    """
    replay = []
    scores = []
    for _ in range(episodes):
        state = env.reset()
        step = 0
        score = 0
        done = False
        while not done:
            if render:
                env.render()
            pi = policies(state, epsilon)
            # Greedy action selection with random tie-breaking: if both
            # actions are equally likely under pi, pick one uniformly at
            # random; otherwise take the most probable action.
            if pi[0] == pi[1]:
                action = np.random.choice(2)
            else:
                action = np.argmax(pi)
            next_state, reward, done, _ = env.step(action)
            score += reward
            replay.append((state, action, reward, next_state, done))
            state = next_state
            step += 1
        env.close()
        scores.append(score)
    return replay, scores
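The hint in the comment suggests sampling the action directly from pi rather than the greedy tie-breaking used above; a sketch of that alternative, assuming pi is a proper probability distribution over the discrete actions:

# Draw the action index in proportion to its probability under pi;
# this covers any number of discrete actions.
action = np.random.choice(len(pi), p=pi)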
Example #3
def rollout(env: gym.Env,
            policies: policies.QPolicy,
            episodes: int,
            temp: float,
            render: bool = False):
    """
    Simulates trajectories for the given number of episodes. Input policy is used to sample actions at each time step

    :param env: the gym environment
    :param policies: The policy used to sample actions (Tabular/DQN) 
    :param episodes: Number of episodes to be simulated
    :param epsilon: The exploration parameter for epsilon-greedy policy
    :param gamma: Discount factor
    :param render: If True, render the environment
    
    :return replay: Collection of (state, action, reward, next_state, done) at each timestep of all simulated episodes
    :return scores: Collection of total reward for each simulated episode  
    """
    replay = []
    scores = []

    for itrnum in range(episodes):
        state = env.reset()
        step = 0
        score = 0
        done = False
        while not done:
            if render:
                env.render()
            pi = policies(state, temp)
            # Sample an action from pi; the commented line is for the
            # three-action tabular-Q environment, the active line for the
            # two-action DQN environment.
            # action = np.random.choice(a=[0, 1, 2], size=1, p=pi)[0]  # tabular Q
            action = np.random.choice(a=[0, 1], size=1, p=pi)[0]  # DQN
            next_state, reward, done, _ = env.step(action)
            score += reward
            replay.append((state, action, reward, next_state, done))
            state = next_state
            step += 1

        env.close()
        scores.append(score)

    return replay, scores
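Rather than hardcoding the action list and switching it between the tabular and DQN environments, the sampling line can be written once against the environment's own action count; a sketch, assuming a standard gym Discrete action space:

# env.action_space.n gives the number of discrete actions, so the same
# line covers both the two-action and three-action settings.
action = np.random.choice(env.action_space.n, p=pi)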