Example #1
def test(n_epi):
    agent = Agent(state_size=37, action_size=4, seed=0)
    env = UnityEnvironment(file_name="Banana.app")
    brain_name = env.brain_names[0]  # get the default brain
    brain = env.brains[brain_name]
    agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores

    for i in range(n_epi):
        score = 0  # initialize the score
        env_info = env.reset(
            train_mode=False)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]  # get the current state
        while True:
            action = agent.act(state)
            env_info = env.step(action)[
                brain_name]  # send the action to the environment
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]  # get the reward
            score += reward  # update the score
            done = env_info.local_done[0]  # see if episode has finished
            agent.step(state, action, reward, next_state, done)
            state = next_state  # roll over the state to next time step
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i, np.mean(scores_window)))

    env.close()
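The snippet above assumes its imports and the project's Agent class are already in scope. A minimal, illustrative prelude might look like the following; the unityagents package is the one typically used with this Banana build, and the dqn_agent module name is only an assumption.

from collections import deque

import numpy as np
import torch
from unityagents import UnityEnvironment

from dqn_agent import Agent  # hypothetical module providing the Agent used above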
Example #2
def dqn(LR,
        GAMMA,
        TAU,
        BUFF,
        UPD,
        n_episodes=1000,
        max_t=100,
        eps_start=1.0,
        eps_end=0.01,
        eps_decay=0.995):
    """Deep Q-Learning.

    Params
    ======
        LR (float): learning rate for the agent's optimizer
        GAMMA (float): discount factor
        TAU (float): interpolation factor for the soft update of the target network
        BUFF (int): replay buffer size
        UPD (int): how often (in environment steps) the network is updated
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """

    agent = Agent(state_size, action_size, LR, GAMMA, TAU, BUFF, UPD, seed=0)
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start  # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)),
              end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
            #        if np.mean(scores_window)>=13.0:
            #            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            #break
#    return scores
    return np.mean(scores_window)
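Because this variant returns the 100-episode mean instead of the full score list, it can drive a coarse hyperparameter search. A small sketch follows; the concrete values are illustrative, and the globals env, brain_name, state_size and action_size are assumed to be set up as in the snippet.

best_score, best_cfg = float('-inf'), None
for LR in (5e-4, 1e-3):
    for TAU in (1e-3, 1e-2):
        # positional order matches the signature: LR, GAMMA, TAU, BUFF, UPD
        score = dqn(LR, 0.99, TAU, int(1e5), 4, n_episodes=500, max_t=300)
        if score > best_score:
            best_score, best_cfg = score, (LR, TAU)
print('best (LR, TAU): {}\tscore: {:.2f}'.format(best_cfg, best_score))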
Example #3
def dqn(args, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.
    
    Params
    ======
        args : command line arguments
        args.num_episodes (int): maximum number of training episodes
        args.save_every (int): how often (in episodes) to save a checkpoint
        args.save_checkpoint_path (str): directory where checkpoints are written
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start  # initialize epsilon
    state_size = 37
    action_size = 4
    agent = Agent(state_size, action_size, 1)
    for i_episode in range(1, args.num_episodes + 1):
        #resetting the environment for a new episode
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        cnt = 0
        while True:
            action = agent.act(state, eps)
            env_info = env.step(action)[
                brain_name]  # send the action to the environment
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]  # get the reward
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            cnt += 1
            if done:
                break
        scores_window.append(
            score)  # save most recent score in the 100 episode window
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score in the last 100 episodes: {:.2f}'.
              format(i_episode, np.mean(scores_window)),
              end="")
        if i_episode % args.save_every == 0:
            print(
                '\nSaving Checkpoint for {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode, np.mean(scores_window)))
            torch.save(
                agent.qnetwork_local.state_dict(),
                os.path.join(args.save_checkpoint_path,
                             'checkpoint_' + str(i_episode) + '.pth'))
    return scores
Example #4
def trainFunction(n_episodes=2000,
                  max_t=1000,
                  eps_start=1.0,
                  eps_end=0.01,
                  eps_decay=0.995):
    agent = Agent(state_size=37, action_size=4, seed=0, priority=True)
    epsilons = []
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start  # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(
            train_mode=True)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            env_info = env.step(action.astype(np.int32))[brain_name]
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]  # get the reward
            done = env_info.local_done[0]  # see if episode has finished
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        epsilons.append(eps)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)),
              end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
        # if np.mean(scores_window)>=13.0:

    print('\nEnvironment finished in {:d} episodes!\tAverage Score: {:.2f}'.
          format(i_episode, np.mean(scores_window)))
    torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
    return scores, epsilons
Example #5
def dqn(agent: Agent, params: Params):
    """Deep Q-Learning.

    Params
    ======
        agent (Agent): agent that wraps the environment and exposes init_episode(), act(), step(), get_reward() and get_done()
        params (Params): container for the settings below
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """

    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = params.eps_start  # initialize epsilon
    for i_episode in range(1, params.n_episodes + 1):
        agent.init_episode()
        score = 0  # initialize the score
        for t in range(params.max_t):
            agent.act(eps)  # to be defined in the agent
            agent.step()  # to be defined in the agent
            score += agent.get_reward()
            if agent.get_done():
                break

        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(params.eps_end, params.eps_decay * eps)  # decrease epsilon
        # print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 200.0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            break
    return scores
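This version pushes all environment handling into the agent and only reads hyperparameters from a Params object. A minimal sketch of that container follows; the field names are taken from the attribute accesses above, the default values are only illustrative.

from dataclasses import dataclass


@dataclass
class Params:
    n_episodes: int = 2000
    max_t: int = 1000
    eps_start: float = 1.0
    eps_end: float = 0.01
    eps_decay: float = 0.995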
Example #6
def train(n_episodes=2000, eps_start=1.0, eps_end=0.025, eps_decay=0.995):
    agent = Agent(state_size=37, action_size=4, seed=0)
    env = UnityEnvironment(file_name="Banana.app")
    brain_name = env.brain_names[0]                    # get the default brain
    brain = env.brains[brain_name]
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon

    
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name] # reset the environment
        state = env_info.vector_observations[0]            # get the current state
        score = 0                                          # initialize the score
        while True:
            action = agent.act(state, eps)                 # select an action
            env_info = env.step(action)[brain_name]        # send the action to the environment
            next_state = env_info.vector_observations[0]   # get the next state
            reward = env_info.rewards[0]                   # get the reward
            done = env_info.local_done[0]                  # see if episode has finished
            agent.step(state, action, reward, next_state, done)
            score += reward                                # update the score
            state = next_state                             # roll over the state to next time step
            if done:                                       # exit loop if episode finished
                break
        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score
        eps = max(eps_end, eps_decay*eps) # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window)>=13.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint_Nav_V01_13.pth')
            env.close()    
            break
    return scores    
def train_banana_collector(env, brain_name, maxEpisodes, threshold, \
                           eps_start, eps_end, eps_decay, seed, filename, memory_type):

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    brain = env.brains[brain_name]

    # number of agents in the environment
    print('Number of agents:', len(env_info.agents))

    # number of actions
    action_size = brain.vector_action_space_size
    print('Number of actions:', action_size)

    # examine the state space
    state = env_info.vector_observations[0]
    print('States look like:', state)
    state_size = len(state)
    print('States have length:', state_size)

    env_info = env.reset(train_mode=True)[brain_name]
    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  seed=seed,
                  memory_type=memory_type)

    state = env_info.vector_observations[0]  # get the current state

    # initialize the score
    score = 0  # current score within an episode
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores

    # initialize epsilon
    eps = eps_start

    # now execute up to a maximum of "maxEpisodes" episodes
    for i_episode in range(1, maxEpisodes + 1):
        # 1.Step: reset the environment - set the train_mode to True !!
        env_info = env.reset(train_mode=True)[brain_name]

        # 2. Step: get the current state
        state = env_info.vector_observations[0]

        # 3.Step: set the score of the current episode to 0
        score = 0

        # 4.Step: while episode has not ended (done = True) repeat
        while True:
            # 5.Step: Calculate the next action from agent with epsilon eps
            action = agent.act(state, eps)
            #print("Action = " , action)

            # 6.Step: Tell the environment about this action and get result
            env_info = env.step(action)[brain_name]

            # 7.Step: now let's get the state observation from observation
            next_state = env_info.vector_observations[0]

            # 8.Step: now let's get the reward observation from observation
            reward = env_info.rewards[0]
            #print("Reward = " , reward)

            # 9.Step: now let's get the done observation from observation
            done = env_info.local_done[0]

            # 10.Step: Add the reward of the last action-state result
            score += reward

            # 11.Step: Execute a training step of the agent
            agent.step(state, action, reward, next_state, done)

            # 12.Step: Continue while-loop with next_state as current state
            state = next_state

            # 13.Step: in case of end of episode print the result and break loop
            if done:
                #print("Episode " , i_episode , " has ended with score: " , score)
                break

        # 14.Step: Finally append the score of the last episode to the overall scores
        scores_window.append(score)
        scores.append(score)

        # 15.Step: Calculate next epsilon
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f} , epsilon: {}'.format(
            i_episode, np.mean(scores_window), eps),
              end="")

        # 16.Step: Print results every 100 episodes
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))

        # 17.Step: In case the performance "threshold" is exceeded --> stop and save the current agent's neural network
        if np.mean(scores_window) >= threshold and len(scores_window) == 100:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode, np.mean(scores_window)))
            torch.save(agent.qnn_local.state_dict(), filename)
            break

    return scores
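Either trainer above returns the per-episode scores, so training progress can be plotted afterwards. A small sketch, assuming matplotlib is available:

import matplotlib.pyplot as plt
import numpy as np

scores = train()  # or train_banana_collector(...); both return a list of episode scores
window = 100
moving_avg = [np.mean(scores[max(0, i + 1 - window):i + 1]) for i in range(len(scores))]
plt.plot(scores, alpha=0.4, label='episode score')
plt.plot(moving_avg, label='{}-episode average'.format(window))
plt.xlabel('Episode')
plt.ylabel('Score')
plt.legend()
plt.show()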
Example #8
class DQN():
    def __init__(self, state_size, action_size, env):
        self.agent = Agent(state_size=state_size,
                           action_size=action_size,
                           seed=0)
        self.env = env
        self.saved_network = 'VisualBanana_DQN_chkpt.pth'

    def train(self,
              n_episodes=2000,
              max_t=1000,
              eps_start=1.0,
              eps_end=0.01,
              eps_decay=0.995,
              score_window_size=100,
              target_score=13.0,
              save=True,
              verbose=True):
        """Deep Q-Learning.

            Params
            ======
                n_episodes  (int): max. number of training episodes
                max_t       (int): max. number of timesteps per episode
                eps_start (float): starting value of epsilon, for epsilon-greedy action selection
                eps_end   (float): min. value of epsilon
                eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
            """

        moving_avgs = [
        ]  # list containing moving average scores (over last 100 episodes)
        scores = []  # list containing scores from each episode
        scores_window = deque(
            maxlen=score_window_size)  # last score_window_size scores
        eps = eps_start  # initialize epsilon
        save12 = False  # whether the 13.0-average checkpoint has already been saved

        start = time.time()
        for i_episode in range(1, n_episodes + 1):
            state = self.env.reset()
            score = 0
            for t in range(max_t):
                action = self.agent.act(state, eps)
                next_state, reward, done, _ = self.env.step(action)
                self.agent.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break

            scores_window.append(score)  # save most recent score
            scores.append(score)  # save most recent score
            eps = max(eps_end, eps_decay * eps)  # decrease epsilon
            avg_score = np.mean(scores_window)
            moving_avgs.append(avg_score)

            if (avg_score >= 13.0) and not save12:
                torch.save(self.agent.qnetwork_local.state_dict(),
                           self.saved_network)
                np.save('VisualBanana_Scores.npy', np.array(scores))
                save12 = True

            if (avg_score >= target_score) and (i_episode > 100):
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'\
                        .format(i_episode-100, np.mean(scores_window)))
                self.solved = True
                if save:
                    torch.save(self.agent.qnetwork_local.state_dict(),
                               self.saved_network)
                break

            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)),
                  end="")
            if i_episode % 100 == 0:
                end = time.time()
                elapsed = (end - start) / 60.0
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_window)))
                print('\tElapsed: {:3.2f} mins.'.format(elapsed))

        if save:
            torch.save(self.agent.qnetwork_local.state_dict(),
                       self.saved_network)

        end = time.time()
        elapsed = (end - start) / 60.0
        print('\n*** TOTAL ELAPSED: {:3.2f} mins. ***'.format(elapsed))

        return scores, moving_avgs
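Unlike the Unity-specific trainers above, this class talks to a gym-style interface: reset() returns a state and step() returns (next_state, reward, done, info). An illustrative call under the classic gym API (gym itself is an assumption here; the VisualBanana build would need a wrapper exposing the same interface):

import gym

env = gym.make('LunarLander-v2')  # any discrete-action env with the classic 4-tuple step API
runner = DQN(state_size=env.observation_space.shape[0],
             action_size=env.action_space.n,
             env=env)
scores, moving_avgs = runner.train(n_episodes=1000, target_score=200.0)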
def dqn(n_episodes=10000,
        max_t=1000,
        eps_start=1.0,
        eps_end=0.05,
        eps_decay=0.995,
        train_mode=True):
    """Deep Q-Learning.
    
    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        train_mode (bool): set environment into training mode if True. 
    """
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start  # initialize epsilon

    env = UnityEnvironment(file_name="Banana/Banana.exe",
                           base_port=64738,
                           no_graphics=True)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=train_mode)[brain_name]
    state_size = len(env_info.vector_observations[0])

    agent = Agent(state_size=state_size, action_size=action_size, seed=0)

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=train_mode)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]
        score = 0
        for t in range(max_t):
            action = np.int32(agent.act(state, eps))
            #next_state, reward, done, _ = env.step(action)
            env_info = env.step(action)[
                brain_name]  # send the action to the environment
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]  # get the reward
            done = env_info.local_done[0]  # see if episode has finished

            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)),
              end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
        if np.mean(scores_window) > 13.0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(),
                       'checkpoint_vanilla.pth')
            break
    return scores
Example #10
    def dqn(agent: Agent, env: UnityEnvironment, n_episodes: int, max_t: int,
            eps_start: float, eps_end: float, eps_decay: float) -> None:
        """Deep Q-Learning.
        Params
        ======
            n_episodes (int): maximum number of training episodes
            max_t (int): maximum number of timesteps per episode
            eps_start (float): starting value of epsilon, for epsilon-greedy action selection
            eps_end (float): minimum value of epsilon
            eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        """

        scores: List[float] = []  # list containing scores from each episode
        scores_window: Deque[float] = deque(
            maxlen=settings.score_window_size
        )  # last settings.score_window_size scores
        eps: float = eps_start  # initialize epsilon

        for i_episode in range(1, n_episodes + 1):
            env_info: BrainInfo = env.reset(
                train_mode=True)[brain_name]  # reset the environment
            state: np.ndarray = env_info.vector_observations[
                0]  # get the current state
            score: float = 0

            for t in range(max_t):
                action: int = agent.act(state, eps)
                env_info: BrainInfo = env.step(action)[brain_name]
                next_state: np.ndarray = env_info.vector_observations[
                    0]  # get the next state
                reward: float = env_info.rewards[0]  # get the reward
                done: bool = env_info.local_done[
                    0]  # see if episode has finished
                agent.step(Experience(state, action, reward, next_state, done))
                state = next_state
                score += reward
                if done:
                    break

            scores_window.append(score)  # save most recent score
            scores.append(score)  # save most recent score

            eps = max(eps_end, eps_decay * eps)  # decrease epsilon

            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)),
                  end="")
            if i_episode % settings.score_window_size == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_window)))
            if np.mean(scores_window) >= settings.solved_at:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode - settings.score_window_size,
                            np.mean(scores_window)))
                # The env is solved, save the outputs.
                create_output_files(agent.qnetwork_local, scores, i_episode,
                                    Path() / settings.output_dir,
                                    settings.checkpoints_dir)
                env.close()
                break

        # episodes reached, save the outputs.
        create_output_files(agent.qnetwork_local, scores, n_episodes,
                            Path() / settings.output_dir,
                            settings.checkpoints_dir)
        env.close()
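This snippet reads its thresholds and output locations from a settings object. A minimal sketch of the fields it actually touches (the values are only illustrative):

from types import SimpleNamespace

settings = SimpleNamespace(
    score_window_size=100,      # episodes in the moving-average window
    solved_at=13.0,             # average score that counts as solved
    output_dir='output',        # where create_output_files writes results
    checkpoints_dir='checkpoints',
)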
Example #11
def demo4_LearningPathPlanning(setting):

    n_sample = 100

    # Environment
    env = FireEnvironment(64, 64)
    # Vehicle to generate observation mask
    vehicle = Vehicle(n_time_windows=512,
                      grid_size=(64, 64),
                      planner_type='Default')
    # Trainer and Estimator
    dyn_autoencoder = DynamicAutoEncoder(SETTING,
                                         grid_size=(env.map_width,
                                                    env.map_height),
                                         n_state=3,
                                         n_obs=3,
                                         encoding_dim=16,
                                         gru_hidden_dim=16)
    ### DQN agent
    dqn_agent = DQN_Agent(state_size=16,
                          action_size=4,
                          replay_memory_size=1000,
                          batch_size=64,
                          gamma=0.99,
                          learning_rate=0.01,
                          target_tau=0.01,
                          update_rate=1,
                          seed=0)
    # Train Data Buffer
    memory = SingleTrajectoryBuffer(N_MEMORY_SIZE)
    # Train Iteration Logger

    writer = SummaryWriter()
    # Video Writer
    video_writer1 = ImageStreamWriter('LearningPlanner.avi',
                                      FPS,
                                      image_size=(1200, 820))

    # Add concat. text
    setting_text = ''
    for k, v in setting.items():
        setting_text += k
        setting_text += ':'
        setting_text += str(v)
        setting_text += '\t'
    writer.add_text('setting', setting_text)

    ########################################
    ### Interacting with the Environment ###
    ########################################
    mask_obs, obs, state = env.reset()
    state_est_grid = dyn_autoencoder.u_k

    ### Loss Monitors ###
    list_loss = []
    list_cross_entropy_loss = []
    list_entropy_loss = []
    list_rewards = []
    list_new_fire_count = []
    list_action = []

    ### Filling the Data Buffer ###
    for i in tqdm.tqdm(range(N_TRAIN_WAIT)):
        map_visit_mask, img_resized = vehicle.full_mask()
        mask_obs, obs, state, reward, info = env.step(map_visit_mask)
        memory.add(mask_obs.detach().long(),
                   state.detach().long(),
                   map_visit_mask.detach().long())

    for i in tqdm.tqdm(range(N_TOTAL_TIME_STEPS)):

        # determine epsilon-greedy action from the current state
        h_k = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()
        epsilon = 0.1
        action = dqn_agent.act(h_k, epsilon)
        list_action.append(action)

        ### Collect Data from the Env. ###
        map_visit_mask, img_resized = vehicle.plan_a_trajectory(
            state_est_grid, n_sample, action)
        mask_obs, obs, state, reward, info = env.step(map_visit_mask)
        memory.add(mask_obs.detach().long(),
                   state.detach().long(),
                   map_visit_mask.detach().long())

        ### Run the Estimator ###
        state_est_grid = dyn_autoencoder.step(mask_obs, map_visit_mask)
        h_kp1 = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()

        #### Update the reinforcement learning agent ###
        dqn_agent.step(h_k, action, reward, h_kp1, done=False)

        list_rewards.append(reward)
        list_new_fire_count.append(info['new_fire_count'])

        ################################
        ### Rendering and Save Video ###
        ################################
        img_env = env.output_image()
        img_agent = dyn_autoencoder.output_image(state_est_grid)

        # State Est
        #blank = np.zeros((400, 200, 3))
        img_top = img_env  #np.concatenate((blank, img_env[:,:800], blank), axis=1)
        blank = np.zeros((20, 1200, 3))
        img_top = np.concatenate((img_top, blank), axis=0)
        img_top = (img_top * 255).astype('uint8')

        img_state_est_grid_uint8 = (img_agent * 255).astype('uint8')
        backtorgb = cv2.cvtColor(img_state_est_grid_uint8, cv2.COLOR_GRAY2RGB)
        img_bayes_uint8 = np.concatenate((img_top, backtorgb),
                                         axis=0)  #<-- to be saved
        render('Dynamic Auto Encoder', img_bayes_uint8, 1)

        # Save video #
        video_writer1.write_image_frame(img_bayes_uint8)

        ### Training ###
        loss_val, loss_val_cross, loss_val_ent, O_np_val = dyn_autoencoder.update(
            memory, N_TRAIN_BATCH, N_TRAIN_WINDOW)
        list_loss.append(loss_val)
        list_cross_entropy_loss.append(loss_val_cross)
        list_entropy_loss.append(loss_val_ent)

        if i % N_LOGGING_PERIOD == 0:
            avg_loss = np.mean(np.array(list_loss))
            list_loss = []
            writer.add_scalar('dynautoenc/loss', avg_loss, i)

            avg_loss_cross = np.mean(np.array(list_cross_entropy_loss))
            list_cross_entropy_loss = []
            writer.add_scalar('dynautoenc/crossentropy', avg_loss_cross, i)

            avg_loss_entropy = np.mean(np.array(list_entropy_loss))
            list_entropy_loss = []
            writer.add_scalar('dynautoenc/shannonentropy', avg_loss_entropy, i)

            avg_reward = np.mean(np.array(list_rewards))
            list_rewards = []
            writer.add_scalar('perform/rewards', avg_reward, i)

            avg_new_fire_count = np.mean(np.array(list_new_fire_count))
            list_new_fire_count = []
            writer.add_scalar('perform/new_fire_counts', avg_new_fire_count, i)

            writer.add_scalar('perform/pc_coverd_new_fire',
                              avg_reward / avg_new_fire_count, i)

            action_0_count = list_action.count(0)
            action_1_count = list_action.count(1)
            action_2_count = list_action.count(2)
            action_3_count = list_action.count(3)

            writer.add_scalar('action_count/0',
                              action_0_count / len(list_action), i)
            writer.add_scalar('action_count/1',
                              action_1_count / len(list_action), i)
            writer.add_scalar('action_count/2',
                              action_2_count / len(list_action), i)
            writer.add_scalar('action_count/3',
                              action_3_count / len(list_action), i)
            list_action = []

            writer.add_scalar('obs_state0/o00', O_np_val[0][0], i)
            writer.add_scalar('obs_state1/o01', O_np_val[0][1], i)
            writer.add_scalar('obs_state2/o02', O_np_val[0][2], i)
            writer.add_scalar('obs_state0/o10', O_np_val[1][0], i)
            writer.add_scalar('obs_state1/o11', O_np_val[1][1], i)
            writer.add_scalar('obs_state2/o12', O_np_val[1][2], i)
            writer.add_scalar('obs_state0/o20', O_np_val[2][0], i)
            writer.add_scalar('obs_state1/o21', O_np_val[2][1], i)
            writer.add_scalar('obs_state2/o22', O_np_val[2][2], i)

            print(
                'losses at iteration: %d, losses: total %.3f, cross %.3f, shannon %.3f'
                % (i, avg_loss, avg_loss_cross, avg_loss_entropy))
            print('memory size at iteration: %d, size: %d' %
                  (i, len(memory.obs_memory)))

        if (i + 1) % N_SAVING_PERIOD == 0:
            f_name = setting['name']
            dyn_autoencoder.save_the_model(i, f_name)
            dqn_agent.save_the_model(i, f_name)

    video_writer1.close()
Example #12
    while True:
        # determine epsilon-greedy action from the current state
        actions = agent.act(state, epsilon)

        converted_actions = [convert_action(a) for a in actions]
        # print ("CONVERTED_ACTIONS", actions, converted_actions)

        # send the action to the environment and receive resultant environment information
        env_info = env.step(converted_actions)[brain_name]

        next_state = env_info.vector_observations  # get the next state
        rewards = env_info.rewards  # get the reward
        dones = env_info.local_done  # see if episode has finished

        #Send (S, A, R, S') info to the DQN agent for a neural network update
        agent.step(state, actions, rewards, next_state, dones)

        # set new state to current state for determining next action
        state = next_state

        # Update episode score
        scores += rewards

        # If unity indicates that episode is done,
        # then exit episode loop, to begin new episode
        if all(dones):
            break

    print("Scores", scores)

    # # Add episode score to Scores and...
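The fragment above is only the inner step loop of a multi-agent episode. A hedged sketch of the outer episode loop it would sit in follows; names such as n_episodes and eps_decay are assumptions inferred from the variables the fragment references.

import numpy as np

n_episodes, eps_start, eps_end, eps_decay = 2000, 1.0, 0.01, 0.995
epsilon = eps_start
for i_episode in range(1, n_episodes + 1):
    env_info = env.reset(train_mode=True)[brain_name]   # reset the environment
    state = env_info.vector_observations                # one observation row per agent
    scores = np.zeros(len(env_info.agents))             # per-agent episode scores

    # ... the step loop from the fragment above goes here ...

    epsilon = max(eps_end, eps_decay * epsilon)          # decay exploration per episode
    print('\rEpisode {}\tMean agent score: {:.2f}'.format(i_episode, scores.mean()), end='')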
Example #13
def dqn(n_episodes=4000,
        max_t=3000,
        eps_start=1.0,
        eps_end=0.01,
        eps_decay=0.995):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    agent = Agent(state_size=3, action_size=8, seed=0)

    start_pos = (200, 600)
    end_pos = (800, 375)
    env = environment(MAP, start_pos, end_pos)
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start  # initialize epsilon

    for i_episode in range(1, n_episodes + 1):
        state, _, _ = env.reset(start_pos, end_pos)
        score = 0

        for t in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward

            if done:
                #print (state)
                break

        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)),
              end="")

        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))

        #if np.mean(scores_window)>=200.0:
        #print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))

        if i_episode % 200 == 0:
            torch.save(agent.qnetwork_local.state_dict(),
                       'checkpoint' + str(i_episode) + '.pth')

        #torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
        #break

    return scores
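Since checkpoints are written every 200 episodes, a trained policy can be replayed greedily afterwards. A sketch, assuming the same environment/Agent classes and MAP are in scope and that 'checkpoint2000.pth' is one of the saved files (the name is illustrative):

agent = Agent(state_size=3, action_size=8, seed=0)
agent.qnetwork_local.load_state_dict(torch.load('checkpoint2000.pth'))

env = environment(MAP, (200, 600), (800, 375))
state, _, _ = env.reset((200, 600), (800, 375))
done = False
while not done:
    action = agent.act(state, 0.0)           # greedy rollout: epsilon = 0
    state, reward, done = env.step(action)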
Example #14
def train(fullcover, name, setting):

    n_sample = 20

    # Environment
    env = FireEnvironment(64, 64)

    # Vehicle to generate observation mask
    vehicle = Vehicle(n_time_windows=1000, grid_size=(64,64), planner_type=setting['planner_type'])

    # Trainer and Estimator
    dyn_autoencoder = DynamicAutoEncoder(SETTING, grid_size = (env.map_width, env.map_height), n_state=3, n_obs=3, encoding_dim=16, gru_hidden_dim=16)

    # Train Data Buffer
    memory = SingleTrajectoryBuffer(N_MEMORY_SIZE)

    ### DQN agent
    dqn_agent = DQN_Agent(state_size=16, action_size=4, replay_memory_size=1000, batch_size=64, gamma=0.99, learning_rate=0.01, target_tau=0.01, update_rate=1, seed=0)

    # Train Iteration Logger
    from torch.utils.tensorboard import SummaryWriter
    writer = SummaryWriter()

    # Add concat. text
    setting_text = ''
    for k,v in setting.items():
        setting_text += k
        setting_text += str(v)
        setting_text += '\t'
    writer.add_text('setting', setting_text)


    ########################################
    ### Interacting with the Environment ###
    ########################################
    mask_obs, obs, state = env.reset()
    map_visit_mask, img_resized = vehicle.full_mask()
    state_est_grid = dyn_autoencoder.u_k

    ### Loss Monitors ###
    list_loss = []
    list_cross_entropy_loss = []
    list_entropy_loss = []
    list_rewards = []
    list_count_fire_visit = []
    list_count_all_fire = []
    list_action = []

    ### Filling the Data Buffer ###
    for i in tqdm.tqdm(range(N_TRAIN_WAIT)):
        if fullcover:
            map_visit_mask, img_resized = vehicle.full_mask()
        else:
            # no DQN action exists yet while pre-filling the buffer, so a random
            # planner action is sampled here (an assumption; the original line
            # referenced an undefined `action`)
            action = np.random.randint(4)
            map_visit_mask, img_resized = vehicle.plan_a_trajectory(state_est_grid, n_sample, action)

        mask_obs, obs, state, reward = env.step(map_visit_mask)
        memory.add(mask_obs, state, map_visit_mask)
        


    for i in tqdm.tqdm(range(N_TOTAL_TIME_STEPS)):

        # determine epsilon-greedy action from the current state
        h_k = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()
        epsilon = 0.1
        action = dqn_agent.act(h_k, epsilon)
        list_action.append(action)    

        ### Collect Data from the Env. ###
        if fullcover:
            map_visit_mask, img_resized = vehicle.full_mask()
        else:
            map_visit_mask, img_resized = vehicle.plan_a_trajectory(state_est_grid, n_sample, action)
            
        
        mask_obs, obs, state, reward = env.step(map_visit_mask)
        memory.add(mask_obs, state, map_visit_mask)

        ### Run the Estimator ###
        state_est_grid = dyn_autoencoder.step(mask_obs, map_visit_mask)
        h_kp1 = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()

        #### Update the reinforcement learning agent ###
        dqn_agent.step(h_k, action, reward, h_kp1, done=False)

        list_rewards.append(reward)
        fire_count = (torch.sum(state[2])).item()
        fire_visit = (torch.sum(mask_obs.permute(2,0,1) * state[2].unsqueeze(0))).item()

        if fire_count < 1:
            print('no fire')
        else:
            list_count_fire_visit.append(fire_visit)
            list_count_all_fire.append(fire_count)

        ### Render the Env. and the Est. ###
        if i % N_RENDER_PERIOD == 0:
            img_env   = env.output_image()
            img_state_est_grid = dyn_autoencoder.output_image(state_est_grid)
            
            render('env', img_env, 1)
            render('img_state_est_grid', img_state_est_grid, 1)            


        ### Training ###
        loss_val, loss_val_cross, loss_val_ent, O_np_val =  dyn_autoencoder.update(memory, N_TRAIN_BATCH, N_TRAIN_WINDOW)
        list_loss.append(loss_val)
        list_cross_entropy_loss.append(loss_val_cross)
        list_entropy_loss.append(loss_val_ent)

        if i%N_LOGGING_PERIOD == 0:
            avg_loss = np.mean(np.array(list_loss))
            list_loss = []
            writer.add_scalar('dynautoenc/loss', avg_loss, i)

            avg_loss_cross = np.mean(np.array(list_cross_entropy_loss))
            list_cross_entropy_loss = []
            writer.add_scalar('dynautoenc/crossentropy', avg_loss_cross, i)

            avg_loss_entropy = np.mean(np.array(list_entropy_loss))
            list_entropy_loss = []
            writer.add_scalar('dynautoenc/shannonentropy', avg_loss_entropy, i)

            avg_reward = np.mean(np.array(list_rewards))
            list_rewards = []
            writer.add_scalar('perform/rewards', avg_reward, i)

            avg_count_fire_visit = np.mean(np.array(list_count_fire_visit))
            list_count_fire_visit = []
            writer.add_scalar('perform/avg_count_fire_visit', avg_count_fire_visit, i)

            avg_count_all_fire = np.mean(np.array(list_count_all_fire))
            list_count_all_fire = []
            writer.add_scalar('perform/avg_count_all_fire', avg_count_all_fire, i)


            action_0_count = list_action.count(0)
            action_1_count = list_action.count(1)
            action_2_count = list_action.count(2)
            action_3_count = list_action.count(3)
            list_action = []

            if setting['planner_type'] == 'Default':
                writer.add_scalar('action_count/0', action_0_count, i)
                writer.add_scalar('action_count/1', action_1_count, i)
                writer.add_scalar('action_count/2', action_2_count, i)
                writer.add_scalar('action_count/3', action_3_count, i)


            writer.add_scalar('obs_state0/o00', O_np_val[0][0], i)
            writer.add_scalar('obs_state1/o01', O_np_val[0][1], i)
            writer.add_scalar('obs_state2/o02', O_np_val[0][2], i)
            writer.add_scalar('obs_state0/o10', O_np_val[1][0], i)
            writer.add_scalar('obs_state1/o11', O_np_val[1][1], i)
            writer.add_scalar('obs_state2/o12', O_np_val[1][2], i)
            writer.add_scalar('obs_state0/o20', O_np_val[2][0], i)
            writer.add_scalar('obs_state1/o21', O_np_val[2][1], i)
            writer.add_scalar('obs_state2/o22', O_np_val[2][2], i)

            print('losses at iteration: %d, losses: total %.3f, cross %.3f, shannon %.3f' % (i, avg_loss, avg_loss_cross, avg_loss_entropy))
            print('memory size at iteration: %d, size: %d' % (i, len(memory.obs_memory)))

        if (i+1)%N_SAVING_PERIOD==0:
            f_name = name
            dyn_autoencoder.save_the_model(i, f_name)
Example #15
def train_dqn(dev,
              weights_file,
              n_episodes=1000,
              max_t=1000,
              eps_start=1.0,
              eps_end=0.01,
              eps_decay=0.995):
    """Deep Q-Learning.

    Params
    ======
        dev (string): cpu or gpu
        weights_file (string): name of the file to save the weights
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []  # list containing scores from each episode
    averages = [
    ]  # list containing averages of the scores. Position i (1-index) has the average of the last min(i,100) episodes
    scores_window = deque(maxlen=100)  # last 100 scores
    env = UnityEnvironment(file_name='./Banana_Linux/Banana.x86_64')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    env_info = env.reset(train_mode=True)[brain_name]
    state_size = len(env_info.vector_observations[0])
    action_size = brain.vector_action_space_size
    agent = Agent(state_size, action_size, seed=0, device=dev)

    eps = eps_start  # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]

            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        averages.append(np.mean(scores_window))
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        if (i_episode % 100 != 0):
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, averages[i_episode - 1]),
                  end="")
        else:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, averages[i_episode - 1]))
        if (averages[i_episode - 1] >= 13.0):
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, averages[i_episode - 1]))
            torch.save(agent.qnetwork_local.state_dict(), weights_file)
            break

    env.close()
    return scores, averages
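An illustrative invocation (the device string and the weights filename are arbitrary):

scores, averages = train_dqn(dev='cpu', weights_file='banana_dqn.pth', n_episodes=1000)
print('last 100-episode average: {:.2f}'.format(averages[-1]))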
Example #16
def dqn(n_episodes=2000,
        max_t=1000,
        eps_start=1.0,
        eps_end=0.01,
        eps_decay=0.995,
        max_score=200.0,
        layers_neurones=64):
    """Deep Q-Learning.
    
    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """

    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  seed=0,
                  layers_neurones=layers_neurones)

    filename = f'./results/n_episodes={n_episodes}, max_t={max_t}, eps_start={eps_start}, eps_end={eps_end}, eps_decay={eps_decay}, max_score = {max_score}, layers_neurones = {layers_neurones}'
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start  # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        # state = env.reset()
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]  # get the initial state
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)

            env_info = env.step(action)[
                brain_name]  # send the action to the environment
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]  # get the reward
            done = env_info.local_done[0]  # see if episode has finished

            # next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)),
              end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
            with open(f'{filename}.json', 'w') as filehandle:
                json.dump(scores, filehandle)

        if np.mean(scores_window) >= max_score:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), f'{filename}.pth')
            with open(f'{filename}.json', 'w') as filehandle:
                json.dump(scores, filehandle)
            break
    torch.save(agent.qnetwork_local.state_dict(), f'{filename}.pth')
    return scores
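Each run dumps its raw scores to a JSON file named after the hyperparameters, which makes post-hoc comparison straightforward. A small sketch with a hypothetical result filename:

import json

import numpy as np

with open('./results/some_run.json') as fh:   # hypothetical result file
    run_scores = json.load(fh)
print('final 100-episode average: {:.2f}'.format(np.mean(run_scores[-100:])))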
Example #17
def train(agent_config,
          n_episodes=2000,
          max_t=1000,
          base_port=5005,
          save_path=None,
          name=None):
    """Deep Q-Learning.
    
    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    env = UnityEnvironment(file_name="Banana_Linux_NoVis/Banana.x86_64",
                           no_graphics=True,
                           base_port=base_port)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    eps_start = agent_config.get('eps_start', 1.0)
    eps_end = agent_config.get('eps_end', 0.01)
    eps_decay = agent_config.get('eps_decay', 0.995)

    lr = agent_config.get('lr', 1e-3)
    lr_decay = agent_config.get('lr_decay', 1)
    agent = Agent(seed=0, **agent_config)

    # reset
    env_info = env.reset(train_mode=True)[brain_name]

    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start  # initialize epsilon

    with trange(n_episodes, desc='episode') as episode_bar:
        for episode in episode_bar:
            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations[0]
            score = 0
            for t in range(max_t):
                action = agent.act(state, eps)
                env_info = env.step(action)[brain_name]
                next_state = env_info.vector_observations[0]
                reward = env_info.rewards[0]
                done = env_info.local_done[0]
                agent.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores_window.append(score)  # save most recent score
            scores.append(score)  # save most recent score
            eps = max(eps_end, eps_decay * eps)  # decrease epsilon
            lr = lr * lr_decay  # decrease learning rate
            for g in agent.optimizer.param_groups:
                g['lr'] = lr

            episode_bar.set_postfix(avg_score=np.mean(scores_window))

        if save_path:
            torch.save(agent.qnetwork_local.state_dict(), save_path)

        env.close()
        return pd.Series(scores, name=name)
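Everything this trainer needs from the agent comes through agent_config, which is both read for the epsilon/learning-rate schedule and forwarded to Agent(**agent_config). A hedged sketch of such a dict: the state/action sizes match the Banana environment, the remaining values are only illustrative, and the Agent is assumed to accept these keyword arguments as in the snippet.

agent_config = {
    'state_size': 37,
    'action_size': 4,
    'eps_start': 1.0,
    'eps_end': 0.01,
    'eps_decay': 0.995,
    'lr': 5e-4,
    'lr_decay': 0.999,
}
scores = train(agent_config, n_episodes=1000, save_path='checkpoint.pth', name='dqn-baseline')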
Example #18
    def run(self,
            run_id=1,
            n_episodes=2000,
            max_t=1000,
            eps_start=1.0,
            eps_end=0.01,
            eps_decay=0.995,
            lr=5e-4,
            use_double_dqn=False,
            use_soft_update=True):
        start = time.time()

        agent = Agent(state_size=37,
                      action_size=4,
                      seed=0,
                      lr=lr,
                      use_double_dqn=use_double_dqn,
                      use_soft_update=use_soft_update)

        # list containing scores from each episode
        scores = []

        # last 100 scores
        scores_window = deque(maxlen=100)

        # initialize epsilon
        eps = eps_start

        for i_episode in range(1, n_episodes + 1):

            # reset the environment
            env_info = self.env.reset(train_mode=True)[self.brain_name]

            # get the current state
            state = env_info.vector_observations[0]

            score = 0

            for t in range(max_t):

                action = agent.act(state, eps)
                #print("action: ", action)

                # send the action to the environment
                env_info = self.env.step(action)[self.brain_name]

                # get the next state
                next_state = env_info.vector_observations[0]

                # get the reward
                reward = env_info.rewards[0]

                # see if episode has finished
                done = env_info.local_done[0]

                # update the agent with this (S, A, R, S', done) experience
                agent.step(state, action, reward, next_state, done)

                state = next_state
                score += reward

                if done:
                    break

            # save most recent score
            scores_window.append(score)

            # save most recent score
            scores.append(score)

            # decrease epsilon
            eps = max(eps_end, eps_decay * eps)

            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)),
                  end="")

            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_window)))

            if np.mean(scores_window) >= 14.0:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode - 100, np.mean(scores_window)))

                end = time.time()
                elapsed = end - start
                print("\nTime taken to solve: {:.2f} minutes".format(elapsed /
                                                                     60.0))

                run_dir = "results/{}".format(run_id)
                os.mkdir(run_dir)

                torch.save(agent.qnetwork_local.state_dict(),
                           "{}/checkpoint.pth".format(run_dir))
                break

        return scores
def dqn(n_episodes=2000,
        max_t=1000,
        eps_start=1.0,
        eps_end=0.01,
        eps_decay=0.995):
    """Deep Q-Learning.
    
    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    # Get environment instance
    env = UnityEnvironment(file_name=BANANA_FILE)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # Reset environment
    env_info = env.reset(train_mode=True)[brain_name]
    # Get initial state, state size and action size
    action_size = brain.vector_action_space_size
    state = env_info.vector_observations[0]
    state_size = len(state)
    # Setup agent
    agent = Agent(state_size=state_size, action_size=action_size, seed=0)

    # Train!
    max_avg_score = -100000  # max avg score over 100 episodes
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start  # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        state = env.reset(train_mode=True)[brain_name].vector_observations[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)),
              end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 13.0 and np.mean(
                scores_window) > max_avg_score:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            # break
            max_avg_score = np.mean(scores_window)

    # Close environment
    env.close()
    return scores
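
With the defaults above (eps_start=1.0, eps_decay=0.995, eps_end=0.01), the multiplicative schedule keeps shrinking epsilon for roughly 900 episodes before the floor takes over; a small sketch to check that number:

# Sketch: episodes needed for epsilon to decay from eps_start to eps_end
# under the schedule eps = max(eps_end, eps_decay * eps) used above.
import math

eps_start, eps_end, eps_decay = 1.0, 0.01, 0.995
episodes_to_floor = math.ceil(math.log(eps_end / eps_start) / math.log(eps_decay))
print(episodes_to_floor)  # 919 episodes until epsilon stops decreasing
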
Example #20
def run(experiment_name, num_iterations, learning_rate, buffer_size,
        batch_size, gamma, epsilon, epsilon_decay, epsilon_min, stack_size,
        device, is_ddqn, evaluation_rate, log_directory):
    scores = []

    episodic_accum = 0
    episodic_rewards = []
    iteration_rewards = []
    episode = 1

    agent = Agent(env=env, state_space=state_space, action_space=action_space, learning_rate=learning_rate,\
                     buffer_size=buffer_size, batch_size=batch_size, gamma=gamma,\
                     device=device, in_channels=stack_size, is_ddqn = is_ddqn)

    # initialize the log directory for TensorBoard
    if not os.path.exists(log_directory):
        os.makedirs(log_directory)
    tb_writer = SummaryWriter('{}/{}'.format(log_directory, experiment_name))

    frame_count = 0
    epoch_plot_count = 0
    stop = False
    prev_iteration = None
    while agent.num_train_updates < num_iterations + 1 and not stop:
        state = env.reset()
        done = False

        # stack holding the current state and the previous (stack_size - 1) states
        state_frames = deque(maxlen=stack_size)

        episode_reward = []

        while not done:
            frame_count += 1

            _state = preprocess_state(state)
            state = torch.from_numpy(_state).float()

            # if it's the first frame, copy the same state multiple times to fill the stack
            if len(state_frames) < stack_size:
                for i in range(stack_size):
                    state_frames.append(state)
            else:
                state_frames.append(state)

            state_stack = torch.stack(list(state_frames)).unsqueeze(dim=0)
            action = agent.act(state_stack, epsilon)
            prev_action = action

            next_state, reward, done, info = env.step(action)
            _next_state = preprocess_state(next_state)
            _next_state = torch.from_numpy(_next_state).float()
            agent.step(state_frames.__copy__(), action, reward, _next_state,
                       done)
            state = next_state

            episodic_accum += reward
            iteration_rewards.append(reward)

            if agent.num_train_updates > 0:
                # decay epsilon every evaluation_rate training updates (only once per new update count)
                if agent.num_train_updates % evaluation_rate == 0 and prev_iteration != agent.num_train_updates:
                    epsilon = max(epsilon_min, epsilon * epsilon_decay)
                    prev_iteration = agent.num_train_updates

            if agent.num_train_updates > num_iterations:
                stop = True

        episode += 1
        episodic_rewards.append(episodic_accum)
        episodic_accum = 0.

        if episode % 100 == 0 and len(episodic_rewards) > 20:
            tb_writer.add_scalar('Episode Accum score',
                                 np.mean(episodic_rewards[-20:]), episode)
            print('episode_num:{}\tepisode_score:{}\tepsilon:{}\tmemory_size:{}'.format(
                episode, np.mean(episodic_rewards[-20:]), epsilon, len(agent.memory)))
            torch.save(agent.QNetwork_local.state_dict(),
                       '{}_checkpoint.pth'.format(experiment_name))
    return episodic_rewards
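
run() above relies on a preprocess_state helper that is not shown in the snippet; a hypothetical sketch of such a preprocessor (grayscale, downscale and normalise to [0, 1] with OpenCV) could look like the following, though the original project's version may differ:

# Hypothetical sketch of the preprocess_state helper assumed by run() above.
import cv2
import numpy as np

def preprocess_state(frame, size=84):
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)                    # HxWx3 RGB -> HxW grayscale
    small = cv2.resize(gray, (size, size), interpolation=cv2.INTER_AREA)
    return small.astype(np.float32) / 255.0                           # scale pixel values to [0, 1]
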
Example #21
env = gym.make('Breakout-v0')
# env = gym.make('CarRacing-v0')
env.seed(SEED)

obs_size, action_size = env.observation_space.shape, env.action_space.n
print('State shape: ', obs_size)
print('Number of actions: ', action_size)

state_size = 1024
h_size = 128
agent = Agent(action_size, state_size, h_size=h_size, seed=SEED)


episodes = 100
steps = 150

for i_episode in range(episodes):
    obs = env.reset()
    obs = resize(obs, size=64)
    score = 0
    for t in range(steps):
        action = agent.act(obs)
        next_obs, reward, done, _ = env.step(action)
        print(reward)
        next_obs = resize(next_obs, size=64)
        agent.step(obs, action, reward, next_obs, done)
        obs = next_obs
        score += reward
        if done:
            break
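
The loop above calls a resize helper that is not defined in the snippet; a hypothetical sketch of it (downscale an RGB frame with Pillow and return a float array in [0, 1]) might be:

# Hypothetical sketch of the resize helper assumed above; the original may
# differ, e.g. by returning a torch tensor instead of a NumPy array.
import numpy as np
from PIL import Image

def resize(obs, size=64):
    img = Image.fromarray(obs).resize((size, size))
    return np.asarray(img, dtype=np.float32) / 255.0
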
Example #22
class DQN():
    # env assumption: env.reset(), env.render(), env.step(), env.close()
    def __init__(self, name, state_size, action_size, env, load_net=False):
        self.agent = Agent(name,
                           state_size=state_size,
                           action_size=action_size,
                           seed=0)
        self.env = env
        self.saved_network = name + '_dqn_checkpoint.pth'
        self.load_net = load_net
        self.solved = False  # set to True once the target score is reached
        if load_net:
            print('Loading pretrained network...')
            self.agent.qnetwork_local.load_state_dict(
                torch.load(self.saved_network))
            self.agent.qnetwork_target.load_state_dict(
                torch.load(self.saved_network))
            print('Loaded.')

    def train(self,
              n_episodes=2000,
              max_t=1000,
              eps_start=1.0,
              eps_end=0.01,
              eps_decay=0.995,
              score_window_size=100,
              target_score=13.0,
              save=True,
              verbose=True):
        """Deep Q-Learning.

            Params
            ======
                n_episodes (int): maximum number of training episodes
                max_t (int): maximum number of timesteps per episode
                eps_start (float): starting value of epsilon, for epsilon-greedy action selection
                eps_end (float): minimum value of epsilon
                eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
                score_window_size (int): number of recent episodes averaged to judge progress
                target_score (float): rolling average at which the environment counts as solved
                save (bool): whether to save the network weights to disk
                verbose (bool): whether to print training progress
            """
        scores = []  # list containing scores from each episode
        scores_window = deque(
            maxlen=score_window_size)  # last score_window_size scores
        eps = eps_start  # initialize epsilon
        saved_at_13 = False  # becomes True once a checkpoint has been saved at score 13
        for i_episode in range(1, n_episodes + 1):
            state = self.env.reset()
            score = 0
            for t in range(max_t):
                action = self.agent.act(state, eps)
                next_state, reward, done, _ = self.env.step(action)
                self.agent.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores_window.append(score)  # save most recent score
            scores.append(score)  # save most recent score
            eps = max(eps_end, eps_decay * eps)  # decrease epsilon
            avg_score = np.mean(scores_window)
            if avg_score > 13.0 and not saved_at_13 and not self.load_net:
                torch.save(self.agent.qnetwork_local.state_dict(),
                           self.saved_network)
                np.save('scores13_0824.npy', np.array(scores))
                saved_at_13 = True
            if avg_score >= target_score and i_episode > 100:
                if verbose:
                    print(
                        '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                        .format(i_episode, np.mean(scores_window)))
                self.solved = True
                if save:
                    torch.save(self.agent.qnetwork_local.state_dict(),
                               self.saved_network)
                break

            if verbose:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_window)),
                      end="")
                if i_episode % 100 == 0:
                    print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                        i_episode, np.mean(scores_window)))

        if save:
            torch.save(self.agent.qnetwork_local.state_dict(),
                       self.saved_network)

        return scores

    def play(self, trials=3, steps=200, load=False):
        if load:
            self.agent.qnetwork_local.load_state_dict(
                torch.load(self.saved_network))

        for i in range(trials):
            total_reward = 0
            print('Start Trial...')
            state = self.env.reset()
            for j in range(steps):
                action = self.agent.act(state)
                self.env.render()
                state, reward, done, _ = self.env.step(action)
                total_reward += reward
                if reward != 0:
                    print("Current Reward:", reward, "Total Reward:",
                          total_reward)
                if done:
                    print('Done.')
                    break
        self.env.close()
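
A usage sketch for the wrapper class above, assuming a classic Gym environment that matches the reset()/step()/render()/close() interface the class expects (the environment name, episode counts and target score below are illustrative, not part of the original example):

# Sketch: train and replay the DQN wrapper on LunarLander-v2
# (8-dimensional state, 4 discrete actions, classic gym step/reset API).
import gym

env = gym.make('LunarLander-v2')
dqn = DQN('lunar', state_size=8, action_size=4, env=env)
scores = dqn.train(n_episodes=2000, target_score=200.0)
dqn.play(trials=3, steps=1000)
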
Example #23
def demo5_ComparePolicies(setting, env):

    n_sample = 2048

    # Vehicle to generate observation mask
    vehicle = Vehicle(n_time_windows=64, grid_size=(64,64), planner_type='Default')
    # Trainer and Estimator
    dyn_autoencoder = DynamicAutoEncoder(SETTING, grid_size = (env.map_width, env.map_height), n_state=3, n_obs=3, encoding_dim=4, gru_hidden_dim=4)

    ### DQN agent  
    dqn_agent = DQN_Agent(state_size=4, action_size=4, replay_memory_size=1000, batch_size=64, gamma=0.99, learning_rate=0.01, target_tau=0.01, update_rate=1, seed=0)

    # Train Data Buffer
    memory = SingleTrajectoryBuffer(N_MEMORY_SIZE)
    
    # Video Writer
    '''
    video_f_name = 'UsePlanner'+ '_' + setting['name'] + '_' + setting['policy_type'] + '.avi'
    video_writer1 = ImageStreamWriter(video_f_name, FPS, image_size=(1200,820))
    '''

    # Train Iteration Logger

    writer = SummaryWriter()

    # Concatenate the settings into a single text entry for TensorBoard
    setting_text = ''
    for k,v in setting.items():
        setting_text += k
        setting_text += ':'
        setting_text += str(v)
        setting_text += '\t'
    writer.add_text('setting', setting_text)

    ########################################
    ### Interacting with the Environment ###
    ########################################

    ### Loss Monitors ###
    list_rewards = []
    list_new_fire_count = []
    list_action = []
    list_loss = []

    ### Filling the Data Buffer ###
    for i in tqdm.tqdm(range(N_TRAIN_WAIT)):         
        map_visit_mask, img_resized =  vehicle.full_mask()
        mask_obs, obs, state, reward, info = env.step(map_visit_mask)
        memory.add(mask_obs.detach().long(), state.detach().long(), map_visit_mask.detach().long())

    mask_obs, obs, state = env.reset()
    state_est_grid = dyn_autoencoder.u_k

    for i in tqdm.tqdm(range(N_TOTAL_TIME_STEPS)):

        # determine the epsilon-greedy action from the current state
        h_k = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()
        epsilon = 0.1
        action = dqn_agent.act(h_k, epsilon)
          
        
        ### Collect Data from the Env. ###
        # Plan a trajectory
        policy_type = setting['policy_type']
        if policy_type == 'Default':
            map_visit_mask, img_resized = vehicle.plan_a_trajectory(state_est_grid, n_sample, action)  

        elif policy_type == 'Random':
            action = 777
            map_visit_mask, img_resized = vehicle.generate_a_random_trajectory()

        elif policy_type == 'Act0':
            action = 0
            map_visit_mask, img_resized = vehicle.plan_a_trajectory(state_est_grid, n_sample, action)

        elif policy_type == 'Act1':
            action = 1
            map_visit_mask, img_resized = vehicle.plan_a_trajectory(state_est_grid, n_sample, action)

        elif policy_type == 'Act2':
            action = 2
            map_visit_mask, img_resized = vehicle.plan_a_trajectory(state_est_grid, n_sample, action)

        else:
            action = 3
            map_visit_mask, img_resized = vehicle.plan_a_trajectory(state_est_grid, n_sample, action)

        list_action.append(action)
        

        # Collect the masked observation
        mask_obs, obs, state, reward, info = env.step(map_visit_mask)
        memory.add(mask_obs.detach().long(), state.detach().long(), map_visit_mask.detach().long())

        ### Run the Estimator ###
        state_est_grid = dyn_autoencoder.step(mask_obs, map_visit_mask)
        h_kp1 = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()

        list_rewards.append(reward)
        list_new_fire_count.append(info['new_fire_count'])

        
        update = True
        #### Update the reinforcement learning agent and Dyn Auto Enc ###
        if policy_type != 'Random':
            dqn_agent.step(h_k, action, reward, h_kp1, False, update)
            loss_val, loss_val_cross, loss_val_ent, O_np_val =  dyn_autoencoder.update(memory, N_TRAIN_BATCH, N_TRAIN_WINDOW, update)
            list_loss.append(loss_val)


        ################################
        ### Rendering and Save Video ###
        ################################        
        img_env   = env.output_image()
        img_agent = dyn_autoencoder.output_image(state_est_grid)

        # State Est
        #blank = np.zeros((400, 200, 3))
        img_top = img_env  #np.concatenate((blank, img_env[:,:800], blank), axis=1)
        blank = np.zeros((20, 1200, 3))
        img_top = np.concatenate((img_top, blank), axis=0)
        img_top = (img_top*255).astype('uint8')

        img_state_est_grid_uint8 = (img_agent*255).astype('uint8')
        backtorgb = cv2.cvtColor(img_state_est_grid_uint8, cv2.COLOR_GRAY2RGB)
        img_bayes_uint8 = np.concatenate((img_top, backtorgb), axis=0) #<-- to be saved
        render('Dynamic Auto Encoder', img_bayes_uint8, 1)

        # Save video #
        #video_writer1.write_image_frame(img_bayes_uint8)

        if i%N_LOGGING_PERIOD == 0:

            avg_reward = np.mean(np.array(list_rewards))
            list_rewards = []
            writer.add_scalar('perform/rewards', avg_reward, i)

            avg_new_fire_count = max(np.mean(np.array(list_new_fire_count)), 1) # to avoid division by zero
            list_new_fire_count = []
            writer.add_scalar('perform/new_fire_counts', avg_new_fire_count, i)
            writer.add_scalar('perform/pc_coverd_new_fire', avg_reward/avg_new_fire_count, i)

            if policy_type != 'Random':

                avg_loss = np.mean(np.array(list_loss))
                list_loss = []
                writer.add_scalar('dynautoenc/loss', avg_loss, i)

                action_0_count = list_action.count(0)
                action_1_count = list_action.count(1)
                action_2_count = list_action.count(2)
                action_3_count = list_action.count(3)

                writer.add_scalar('action_count/0', action_0_count/len(list_action), i)
                writer.add_scalar('action_count/1', action_1_count/len(list_action), i)
                writer.add_scalar('action_count/2', action_2_count/len(list_action), i)
                writer.add_scalar('action_count/3', action_3_count/len(list_action), i)
                list_action = []

                writer.add_scalar('obs_state0/o00', O_np_val[0][0], i)
                writer.add_scalar('obs_state1/o01', O_np_val[0][1], i)
                writer.add_scalar('obs_state2/o02', O_np_val[0][2], i)
                writer.add_scalar('obs_state0/o10', O_np_val[1][0], i)
                writer.add_scalar('obs_state1/o11', O_np_val[1][1], i)
                writer.add_scalar('obs_state2/o12', O_np_val[1][2], i)
                writer.add_scalar('obs_state0/o20', O_np_val[2][0], i)
                writer.add_scalar('obs_state1/o21', O_np_val[2][1], i)
                writer.add_scalar('obs_state2/o22', O_np_val[2][2], i)
Example #24
        elif round(action) == 2:
            converted_action = np.array([[0, 0, 1, 0]])  # counterclock
        elif round(action) == 3:
            converted_action = np.array([[0, 0, 2, 0]])  # clock
        # converted_action = np.column_stack([np.random.randint(0, converted_action_size[i], size=(converted_agent_num)) for i in range(len(converted_action_size))])
        # converted_action = np.array([[1,0,0,0]])

        # send the action to the environment and receive resultant environment information
        env_info = env.step(converted_action)[brain_name]

        next_state = env_info.vector_observations[0]  # get the next state
        reward = env_info.rewards[0]  # get the reward
        done = env_info.local_done[0]  # see if episode has finished

        #Send (S, A, R, S') info to the DQN agent for a neural network update
        agent.step(state, action, reward, next_state, done)

        # set new state to current state for determining next action
        state = next_state

        # Update episode score
        score += reward

        # If unity indicates that episode is done,
        # then exit episode loop, to begin new episode
        if done:
            break

    # Add episode score to Scores and...
    # Calculate mean score over last 100 episodes
    # Mean score is calculated over current episodes until i_episode > 100
def dqn(n_episodes=2000,
        max_t=1000,
        eps_start=1.0,
        eps_end=0.01,
        eps_decay=0.995):
    """Deep Q-Learning.
    
    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start  # initialize epsilon

    agent = Agent(state_size=state_size, action_size=action_size, seed=0)

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(
            train_mode=GRAPHICS_OFF)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]  # get the current state
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            action = int(action)  # cast the numpy integer from act() to a plain Python int before stepping the Unity env

            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]

            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward

            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)),
              end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))

        if np.mean(scores_window) >= TARGET_SCORE:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(),
                       "ckpt/{}".format(CHECKPOINT_NAME))
            break

    return scores
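
Because the solve criterion above is the average score over the last 100 episodes, the returned list can be post-processed to recover that rolling average; a short sketch (it assumes the globals used by dqn(), such as env, brain_name, state_size and action_size, are already defined):

# Sketch: recompute the 100-episode rolling average that drives the solve check.
import numpy as np

scores = dqn()
window = 100
rolling = [np.mean(scores[max(0, i - window + 1):i + 1]) for i in range(len(scores))]
print('best 100-episode average: {:.2f}'.format(max(rolling)))
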