Example #1
def demo_saved_agent(env,
                     agent_name,
                     n_episodes=3,
                     epsilon=0.05,
                     seed=0,
                     train_mode=False,
                     verbose=False):
    # initialize environment and scenario info
    brain, brain_name, state, action_size, state_size = env_initialize(
        env, train_mode=train_mode)

    # load the agent params and create the agent
    params, local_weights, target_weights = load_dqn(agent_name,
                                                     verbose=verbose)
    agent = DQN_Agent(state_size, action_size, brain_name, seed, params=params)
    print(agent.display_params())

    # set trained agent weights
    agent.qnetwork_local.load_state_dict(local_weights)
    agent.qnetwork_target.load_state_dict(target_weights)

    # run demo
    return demo_agent(env,
                      agent,
                      n_episodes=n_episodes,
                      epsilon=epsilon,
                      seed=seed,
                      train_mode=train_mode)
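
For reference, here is a minimal sketch of what the demo_agent helper called above might look like, assuming the Unity ML-Agents (unityagents) API implied by env_initialize, and assuming the agent stores brain_name and exposes an epsilon-greedy act(state, eps) method. This is an illustrative sketch, not the original implementation.

import numpy as np

def demo_agent(env, agent, n_episodes=3, epsilon=0.05, seed=0, train_mode=False):
    # Run a few episodes with a mostly greedy policy and report the scores.
    # The seed argument is kept only for signature compatibility with the caller.
    scores = []
    for _ in range(n_episodes):
        env_info = env.reset(train_mode=train_mode)[agent.brain_name]
        state = env_info.vector_observations[0]
        score = 0.0
        done = False
        while not done:
            action = agent.act(state, eps=epsilon)           # epsilon-greedy action
            env_info = env.step(action)[agent.brain_name]    # advance the environment
            state = env_info.vector_observations[0]
            score += env_info.rewards[0]
            done = env_info.local_done[0]
        scores.append(score)
    print('Mean score over {} episodes: {:.2f}'.format(n_episodes, np.mean(scores)))
    return scores
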
Example #2
def eval():
    agent = DQN_Agent(learning=False, rew_attr="wait_time")
    env = Environment(agent)
    generate_routefile(2000)
    env.run()
    for key in TRAFFIC_ATTRS:
        print("STATS: ", sum(env.stats[key])/len(env.stats[key]))
def create_agent(test_case_id, *args, **kwargs):
    """
    Method that will be called to create your agent during testing.
    You can, for example, initialize a different agent class depending on the test case.
    """
    # return Base_Agent(test_case_id=test_case_id)
    return DQN_Agent(test_case_id=test_case_id)
Example #4
def learn():
    for i in range(100):
        print("Inside learning step: ", i)
        generate_routefile(2000)
        learning_rate = 10/(50 + i)
        eps_prob = 10/(10 + i)
        print("Loop: ", i)
        agent = DQN_Agent(learning=True, rew_attr="wait_time", Lnorm=3,
            # learning_rate=learning_rate,
            # exploration_eps=eps_prob
            )
        env = Environment(agent)
        env.run()
Example #5
def main():
    env = make(game='SonicTheHedgehog2-Genesis', state='EmeraldHillZone.Act1')

    # Parameters for observation image size processing.
    img_rows = 128
    img_cols = 128
    img_stack = 4

    action_size = 8  # 8 valid button combinations

    # Inputs to the agent's prediction network will have the following shape.
    input_size = (img_rows, img_cols, img_stack)

    # File paths
    stat_path = '../statistics/dqn_n-step'
    model_path = '../models/dqn_n-step'

    # Prioritized Experience Replay.
    if (PER_AGENT):
        print('PER agent')
        stat_path += '_PER'
        model_path += '_PER'
        dqn_agent = DQN_PER_Agent(input_size, action_size)
    else:
        dqn_agent = DQN_Agent(input_size, action_size)

    # Use the Noisy Dueling Network.
    if (NOISY):
        stat_path += '_noisy_dueling'
        model_path += '_noisy_dueling'
        print('NOISY Dueling agent')
        dqn_agent.main_model = Networks.noisy_dueling_dqn(
            input_size, action_size, dqn_agent.main_lr)
        dqn_agent.target_model = Networks.noisy_dueling_dqn(
            input_size, action_size, dqn_agent.target_lr)
        dqn_agent.noisy = True
    # Use the normal dueling network.
    elif (DUELING):
        stat_path += '_dueling'
        model_path += '_dueling'
        print('Dueling agent')
        dqn_agent.main_model = Networks.dueling_dqn(input_size, action_size,
                                                    dqn_agent.main_lr)
        dqn_agent.target_model = Networks.dueling_dqn(input_size, action_size,
                                                      dqn_agent.target_lr)
    # Normal DQN.
    else:
        dqn_agent.main_model = Networks.dqn(input_size, action_size,
                                            dqn_agent.main_lr)
        dqn_agent.target_model = Networks.dqn(input_size, action_size,
                                              dqn_agent.target_lr)

    # Append correct suffix and filetype to paths.
    stat_path += '_stats.csv'
    main_model_path = model_path + '_main.h5'
    target_model_path = model_path + '_target.h5'

    # Load previously saved model weights, if requested.
    if (LOAD_MODELS):
        dqn_agent.load_models(main_model_path, target_model_path)

    # Modify the starting epsilon value.
    if (EPSILON == START):
        dqn_agent.epsilon = dqn_agent.initial_epsilon
    elif (EPSILON == MIDDLE):
        dqn_agent.epsilon = (
            (dqn_agent.initial_epsilon - dqn_agent.final_epsilon) / 2)
    else:
        dqn_agent.epsilon = dqn_agent.final_epsilon

    # Store rewards and states from the previous n state-action pairs in order
    # to build n-step experiences.
    prev_n_rewards = deque(maxlen=dqn_agent.n_step)
    prev_n_exp = deque(maxlen=dqn_agent.n_step)

    # One episode is 4500 steps if the zone is not completed:
    # 5 minutes of gameplay at 1/15th of a second per step (4 frames at 60 Hz).
    total_timestep = 0  # Total number of timesteps over all episodes.
    for episode in range(EPISODES):
        done = False
        reward_sum = 0  # Cumulative reward within the episode.
        timestep = 0  # Track timesteps within the episode.

        # Rewards and states must be consecutive to improve temporal awareness.
        # Reset at the start of each episode to compensate for sudden scene change.
        prev_n_rewards.clear()
        prev_n_exp.clear()

        # Experiences are a stack of the img_stack most recent frames to provide
        # temporal information. Initialize this sequence to the first
        # observation stacked 4 times.
        first_obs = env.reset()
        processed = preprocess_obs(first_obs, size=(img_rows, img_cols))
        # (img_rows, img_cols, img_stack)
        exp_stack = np.stack(([processed] * img_stack), axis=2)
        # Expand dimensions so multiple exp_stacks can be submitted as a batch:
        # (1, img_rows, img_cols, img_stack).
        exp_stack = np.expand_dims(exp_stack, axis=0)  # 1x128x128x4

        # Continue until the end of the zone is reached or 4500 timesteps have
        # passed.
        while not done:
            # Predict an action to take based on the most recent
            # experience.
            #
            # Note that the first dimension
            # (1, img_rows, img_cols, img_stack) is ignored by the
            # network here as it represents a batch size of 1.
            act_idx, action = dqn_agent.act(exp_stack)
            obs, reward, done, info = env.step(action)
            # env.render()

            timestep += 1
            total_timestep += 1
            reward_sum += reward

            # Create a 1st dimension for stacking experiences and a 4th for
            # stacking img_stack frames.
            obs = preprocess_obs(obs, size=(img_rows, img_cols))
            obs = np.reshape(obs, (1, img_rows, img_cols, 1))

            # Append the new observation to the front of the stack and remove
            # the oldest (4th) frame.
            exp_stack_new = np.append(obs, exp_stack[:, :, :, :3], axis=3)

            # Save the previous state, selected action, and resulting reward
            prev_n_rewards.appendleft(reward)
            prev_n_exp.append((exp_stack, act_idx, done))
            exp_stack = exp_stack_new

            # Once sufficient steps have been taken, discount the rewards and
            # save the nth previous experience.
            if (len(prev_n_rewards) >= dqn_agent.n_step):
                # Compute discounted reward
                discounted_reward = 0
                for idx in range(len(prev_n_rewards)):
                    prev_reward = prev_n_rewards[idx]
                    # Rewards are appended on the left so that the most recent
                    # rewards are discounted the least.
                    discounted_reward += ((dqn_agent.gamma**idx) * prev_reward)

                # Experiences are pushed forward in the deque as more are appended.
                # The nth previous experience is at the last index.
                original_state, original_act, _ = prev_n_exp[-1]
                nth_state, _, nth_done = prev_n_exp[0]

                # Save the nth previous state and chosen action, along with the
                # discounted sum of rewards and the final state over the next n steps.
                dqn_agent.save_memory(original_state, original_act,
                                      discounted_reward, nth_state, nth_done)

            # Skip training updates and epsilon decrements during the observation phase.
            if (total_timestep >= dqn_agent.observation_timesteps):

                # Update the target model with the main model's weights.
                if ((total_timestep % dqn_agent.update_target_freq) == 0):
                    dqn_agent.update_target_model()

                # Train the agent on saved experiences.
                if ((total_timestep % dqn_agent.timestep_per_train) == 0):
                    dqn_agent.replay_update()
                    dqn_agent.save_models(main_model_path, target_model_path)

                if (dqn_agent.epsilon > dqn_agent.final_epsilon):
                    # Decrease epsilon by a fraction of the range so that it anneals
                    # from its initial to its final value over "exploration_timesteps" steps.
                    dec = (
                        (dqn_agent.initial_epsilon - dqn_agent.final_epsilon) /
                        dqn_agent.exploration_timesteps)
                    dqn_agent.epsilon -= dec

            # print(info)
            print("Episode:", episode, " Timestep:", timestep, " Action:",
                  act_idx, " Episode Reward Sum:", reward_sum, " Epsilon:",
                  dqn_agent.epsilon)

        # Append the episode's cumulative reward and timestep count to the stats file.
        with open(stat_path, "a") as stats_fd:
            reward_str = "Episode Cumulative Reward: " + str(
                reward_sum) + ", Episode Timesteps: " + str(timestep) + ",\n"
            stats_fd.write(reward_str)
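
The preprocess_obs helper used above is not shown in this example. The following is a minimal sketch of what such a function commonly does (grayscale conversion, resizing, and normalization with OpenCV), offered as an assumption rather than the original implementation.

import cv2
import numpy as np

def preprocess_obs(obs, size=(128, 128)):
    # Convert the RGB frame to grayscale, resize it, and scale pixels to [0, 1].
    gray = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
    # cv2.resize expects (width, height); the target here is square, so order does not matter.
    resized = cv2.resize(gray, size)
    return resized.astype(np.float32) / 255.0
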
Example #6
def demo4_LearningPathPlanning(setting):

    n_sample = 100

    # Environment
    env = FireEnvironment(64, 64)
    # Vehicle to generate observation mask
    vehicle = Vehicle(n_time_windows=512,
                      grid_size=(64, 64),
                      planner_type='Default')
    # Trainer and Estimator
    dyn_autoencoder = DynamicAutoEncoder(SETTING,
                                         grid_size=(env.map_width,
                                                    env.map_height),
                                         n_state=3,
                                         n_obs=3,
                                         encoding_dim=16,
                                         gru_hidden_dim=16)
    ### DQN agent
    dqn_agent = DQN_Agent(state_size=16,
                          action_size=4,
                          replay_memory_size=1000,
                          batch_size=64,
                          gamma=0.99,
                          learning_rate=0.01,
                          target_tau=0.01,
                          update_rate=1,
                          seed=0)
    # Train Data Buffer
    memory = SingleTrajectoryBuffer(N_MEMORY_SIZE)
    # Train Iteration Logger

    writer = SummaryWriter()
    # Video Writer
    video_writer1 = ImageStreamWriter('LearningPlanner.avi',
                                      FPS,
                                      image_size=(1200, 820))

    # Concatenate the settings into a single text entry for logging.
    setting_text = ''
    for k, v in setting.items():
        setting_text += k
        setting_text += ':'
        setting_text += str(v)
        setting_text += '\t'
    writer.add_text('setting', setting_text)

    ########################################
    ### Interacting with the Environment ###
    ########################################
    mask_obs, obs, state = env.reset()
    state_est_grid = dyn_autoencoder.u_k

    ### Loss Monitors ###
    list_loss = []
    list_cross_entropy_loss = []
    list_entropy_loss = []
    list_rewards = []
    list_new_fire_count = []
    list_action = []

    ### Filling the Data Buffer ###
    for i in tqdm.tqdm(range(N_TRAIN_WAIT)):
        map_visit_mask, img_resized = vehicle.full_mask()
        mask_obs, obs, state, reward, info = env.step(map_visit_mask)
        memory.add(mask_obs.detach().long(),
                   state.detach().long(),
                   map_visit_mask.detach().long())

    for i in tqdm.tqdm(range(N_TOTAL_TIME_STEPS)):

        # Determine an epsilon-greedy action from the current state.
        h_k = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()
        epsilon = 0.1
        action = dqn_agent.act(h_k, epsilon)
        list_action.append(action)

        ### Collect Data from the Env. ###
        map_visit_mask, img_resized = vehicle.plan_a_trajectory(
            state_est_grid, n_sample, action)
        mask_obs, obs, state, reward, info = env.step(map_visit_mask)
        memory.add(mask_obs.detach().long(),
                   state.detach().long(),
                   map_visit_mask.detach().long())

        ### Run the Estimator ###
        state_est_grid = dyn_autoencoder.step(mask_obs, map_visit_mask)
        h_kp1 = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()

        #### Update the reinforcement learning agent ###
        dqn_agent.step(h_k, action, reward, h_kp1, done=False)

        list_rewards.append(reward)
        list_new_fire_count.append(info['new_fire_count'])

        ################################
        ### Rendering and Save Video ###
        ################################
        img_env = env.output_image()
        img_agent = dyn_autoencoder.output_image(state_est_grid)

        # State Est
        #blank = np.zeros((400, 200, 3))
        img_top = img_env  #np.concatenate((blank, img_env[:,:800], blank), axis=1)
        blank = np.zeros((20, 1200, 3))
        img_top = np.concatenate((img_top, blank), axis=0)
        img_top = (img_top * 255).astype('uint8')

        img_state_est_grid_uint8 = (img_agent * 255).astype('uint8')
        backtorgb = cv2.cvtColor(img_state_est_grid_uint8, cv2.COLOR_GRAY2RGB)
        img_bayes_uint8 = np.concatenate((img_top, backtorgb),
                                         axis=0)  #<-- to be saved
        render('Dynamic Auto Encoder', img_bayes_uint8, 1)

        # Save video #
        video_writer1.write_image_frame(img_bayes_uint8)

        ### Training ###
        loss_val, loss_val_cross, loss_val_ent, O_np_val = dyn_autoencoder.update(
            memory, N_TRAIN_BATCH, N_TRAIN_WINDOW)
        list_loss.append(loss_val)
        list_cross_entropy_loss.append(loss_val_cross)
        list_entropy_loss.append(loss_val_ent)

        if i % N_LOGGING_PERIOD == 0:
            avg_loss = np.mean(np.array(list_loss))
            list_loss = []
            writer.add_scalar('dynautoenc/loss', avg_loss, i)

            avg_loss_cross = np.mean(np.array(list_cross_entropy_loss))
            list_cross_entropy_loss = []
            writer.add_scalar('dynautoenc/crossentropy', avg_loss_cross, i)

            avg_loss_entropy = np.mean(np.array(list_entropy_loss))
            list_entropy_loss = []
            writer.add_scalar('dynautoenc/shannonentropy', avg_loss_entropy, i)

            avg_reward = np.mean(np.array(list_rewards))
            list_rewards = []
            writer.add_scalar('perform/rewards', avg_reward, i)

            avg_new_fire_count = np.mean(np.array(list_new_fire_count))
            list_new_fire_count = []
            writer.add_scalar('perform/new_fire_counts', avg_new_fire_count, i)

            writer.add_scalar('perform/pc_coverd_new_fire',
                              avg_reward / avg_new_fire_count, i)

            action_0_count = list_action.count(0)
            action_1_count = list_action.count(1)
            action_2_count = list_action.count(2)
            action_3_count = list_action.count(3)

            writer.add_scalar('action_count/0',
                              action_0_count / len(list_action), i)
            writer.add_scalar('action_count/1',
                              action_1_count / len(list_action), i)
            writer.add_scalar('action_count/2',
                              action_2_count / len(list_action), i)
            writer.add_scalar('action_count/3',
                              action_3_count / len(list_action), i)
            list_action = []

            writer.add_scalar('obs_state0/o00', O_np_val[0][0], i)
            writer.add_scalar('obs_state1/o01', O_np_val[0][1], i)
            writer.add_scalar('obs_state2/o02', O_np_val[0][2], i)
            writer.add_scalar('obs_state0/o10', O_np_val[1][0], i)
            writer.add_scalar('obs_state1/o11', O_np_val[1][1], i)
            writer.add_scalar('obs_state2/o12', O_np_val[1][2], i)
            writer.add_scalar('obs_state0/o20', O_np_val[2][0], i)
            writer.add_scalar('obs_state1/o21', O_np_val[2][1], i)
            writer.add_scalar('obs_state2/o22', O_np_val[2][2], i)

            print(
                'losses at iteration %d: total %.3f, cross %.3f, shannon %.3f'
                % (i, avg_loss, avg_loss_cross, avg_loss_entropy))
            print('memory size at iteration %d: %d' %
                  (i, len(memory.obs_memory)))

        if (i + 1) % N_SAVING_PERIOD == 0:
            f_name = setting['name']
            dyn_autoencoder.save_the_model(i, f_name)
            dqn_agent.save_the_model(i, f_name)

    video_writer1.close()
Example #7
from dqn_agent import DQN_Agent
from tqdm import tqdm

import networkx as nx
import numpy as np
import pylab as plt
import pandas as pd

input_dim = 1
output_dim = 9
exp_replay_size = 256

agent = DQN_Agent(seed=0,
                  layer_sizes=[input_dim, 16, output_dim],
                  lr=1e-3,
                  sync_freq=5,
                  exp_replay_size=exp_replay_size)

# Main training loop
losses_list, reward_list, episode_len_list, epsilon_list = [], [], [], []
episodes = 10000
epsilon = 1

# Initialize the graph
edge_list = [(0, 2), (0, 1), (0, 3), (2, 4), (5, 6), (7, 4), (0, 6), (5, 3),
             (3, 7), (0, 8)]
goal = 7
SIZE_MATRIX = 9

G = nx.Graph()
G.add_edges_from(edge_list)
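
The environment dynamics for this shortest-path task are not included in the snippet. The sketch below illustrates one plausible way a single step on the graph G could be defined; the reward values and the step-function name are assumptions made for illustration only.

def graph_step(G, state, action, goal):
    # An action proposes the index of the next node; only moves along existing
    # edges of G are allowed, and reaching the goal node ends the episode.
    if G.has_edge(state, action):
        next_state = action
        reward = 10 if next_state == goal else -1   # assumed reward shaping
        done = next_state == goal
    else:
        next_state = state                          # invalid moves leave the agent in place
        reward = -5
        done = False
    return next_state, reward, done
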
Example #8
import gym
from tqdm import tqdm
from time import sleep
from dqn_agent import DQN_Agent

env = gym.make('CartPole-v0')

input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n
exp_replay_size = 256
agent = DQN_Agent(seed=1423,
                  layer_sizes=[input_dim, 64, output_dim],
                  lr=1e-3,
                  sync_freq=5,
                  exp_replay_size=exp_replay_size)
agent.load_pretrained_model("cartpole-dqn.pth")

reward_arr = []

for i in tqdm(range(100)):
    obs, done, rew = env.reset(), False, 0
    while not done:
        A = agent.get_action(obs, env.action_space.n, epsilon=0)

        obs, reward, done, info = env.step(A.item())
        rew += reward
        sleep(0.01)
        env.render()

    reward_arr.append(rew)
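
The get_action call above acts greedily when epsilon=0. As a reference, here is a minimal sketch of the typical epsilon-greedy logic over a PyTorch Q-network; the free-function form and the q_net name are assumptions, since the agent's actual method is not shown here.

import torch

def get_action(q_net, state, action_space_len, epsilon):
    # Compute Q-values for the current state without tracking gradients.
    with torch.no_grad():
        q_values = q_net(torch.from_numpy(state).float())
    # Explore with probability epsilon, otherwise pick the greedy action.
    if torch.rand(1).item() < epsilon:
        return torch.randint(0, action_space_len, (1,))
    return torch.argmax(q_values)
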
Example #9
def demo5_ComparePolicies(setting, env):

    n_sample = 2048

    # Vehicle to generate observation mask
    vehicle = Vehicle(n_time_windows=64, grid_size=(64,64), planner_type='Default')
    # Trainer and Estimator
    dyn_autoencoder = DynamicAutoEncoder(SETTING, grid_size = (env.map_width, env.map_height), n_state=3, n_obs=3, encoding_dim=4, gru_hidden_dim=4)

    ### DQN agent  
    dqn_agent = DQN_Agent(state_size=4, action_size=4, replay_memory_size=1000, batch_size=64, gamma=0.99, learning_rate=0.01, target_tau=0.01, update_rate=1, seed=0)

    # Train Data Buffer
    memory = SingleTrajectoryBuffer(N_MEMORY_SIZE)
    
    # Video Writer
    '''
    video_f_name = 'UsePlanner'+ '_' + setting['name'] + '_' + setting['policy_type'] + '.avi'
    video_writer1 = ImageStreamWriter(video_f_name, FPS, image_size=(1200,820))
    '''

    # Train Iteration Logger

    writer = SummaryWriter()

    # Concatenate the settings into a single text entry for logging.
    setting_text = ''
    for k,v in setting.items():
        setting_text += k
        setting_text += ':'
        setting_text += str(v)
        setting_text += '\t'
    writer.add_text('setting', setting_text)

    ########################################
    ### Interacting with the Environment ###
    ########################################

    ### Loss Monitors ###
    list_rewards = []
    list_new_fire_count = []
    list_action = []
    list_loss = []

    ### Filling the Data Buffer ###
    for i in tqdm.tqdm(range(N_TRAIN_WAIT)):         
        map_visit_mask, img_resized =  vehicle.full_mask()
        mask_obs, obs, state, reward, info = env.step(map_visit_mask)
        memory.add(mask_obs.detach().long(), state.detach().long(), map_visit_mask.detach().long())

    mask_obs, obs, state = env.reset()
    state_est_grid = dyn_autoencoder.u_k

    for i in tqdm.tqdm(range(N_TOTAL_TIME_STEPS)):

        # Determine an epsilon-greedy action from the current state.
        h_k = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()
        epsilon = 0.1
        action = dqn_agent.act(h_k, epsilon)
          
        
        ### Collect Data from the Env. ###
        # Plan a trajectory
        policy_type = setting['policy_type']
        if policy_type == 'Default':
            map_visit_mask, img_resized = vehicle.plan_a_trajectory(state_est_grid, n_sample, action)  

        elif policy_type == 'Random':
            action = 777
            map_visit_mask, img_resized = vehicle.generate_a_random_trajectory()

        elif policy_type == 'Act0':
            action = 0
            map_visit_mask, img_resized = vehicle.plan_a_trajectory(state_est_grid, n_sample, action)

        elif policy_type == 'Act1':
            action = 1
            map_visit_mask, img_resized = vehicle.plan_a_trajectory(state_est_grid, n_sample, action)

        elif policy_type == 'Act2':
            action = 2
            map_visit_mask, img_resized = vehicle.plan_a_trajectory(state_est_grid, n_sample, action)

        else:
            action = 3
            map_visit_mask, img_resized = vehicle.plan_a_trajectory(state_est_grid, n_sample, action)

        list_action.append(action)
        

        # Collect the masked observation
        mask_obs, obs, state, reward, info = env.step(map_visit_mask)
        memory.add(mask_obs.detach().long(), state.detach().long(), map_visit_mask.detach().long())

        ### Run the Estimator ###
        state_est_grid = dyn_autoencoder.step(mask_obs, map_visit_mask)
        h_kp1 = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()

        list_rewards.append(reward)
        list_new_fire_count.append(info['new_fire_count'])

        
        update = True
        #### Update the reinforcement learning agent and Dyn Auto Enc ###
        if policy_type != 'Random':
            dqn_agent.step(h_k, action, reward, h_kp1, False, update)
            loss_val, loss_val_cross, loss_val_ent, O_np_val =  dyn_autoencoder.update(memory, N_TRAIN_BATCH, N_TRAIN_WINDOW, update)
            list_loss.append(loss_val)


        ################################
        ### Rendering and Save Video ###
        ################################        
        img_env   = env.output_image()
        img_agent = dyn_autoencoder.output_image(state_est_grid)

        # State Est
        #blank = np.zeros((400, 200, 3))
        img_top = img_env  #np.concatenate((blank, img_env[:,:800], blank), axis=1)
        blank = np.zeros((20, 1200, 3))
        img_top = np.concatenate((img_top, blank), axis=0)
        img_top = (img_top*255).astype('uint8')

        img_state_est_grid_uint8 = (img_agent*255).astype('uint8')
        backtorgb = cv2.cvtColor(img_state_est_grid_uint8, cv2.COLOR_GRAY2RGB)
        img_bayes_uint8 = np.concatenate((img_top, backtorgb), axis=0) #<-- to be saved
        render('Dynamic Auto Encoder', img_bayes_uint8, 1)

        # Save video #
        #video_writer1.write_image_frame(img_bayes_uint8)

        if i%N_LOGGING_PERIOD == 0:

            avg_reward = np.mean(np.array(list_rewards))
            list_rewards = []
            writer.add_scalar('perform/rewards', avg_reward, i)

            avg_new_fire_count = max(np.mean(np.array(list_new_fire_count)), 1) # to avoid division by zero
            list_new_fire_count = []
            writer.add_scalar('perform/new_fire_counts', avg_new_fire_count, i)
            writer.add_scalar('perform/pc_coverd_new_fire', avg_reward/avg_new_fire_count, i)

            if policy_type != 'Random':

                avg_loss = np.mean(np.array(list_loss))
                list_loss = []
                writer.add_scalar('dynautoenc/loss', avg_loss, i)

                action_0_count = list_action.count(0)
                action_1_count = list_action.count(1)
                action_2_count = list_action.count(2)
                action_3_count = list_action.count(3)

                writer.add_scalar('action_count/0', action_0_count/len(list_action), i)
                writer.add_scalar('action_count/1', action_1_count/len(list_action), i)
                writer.add_scalar('action_count/2', action_2_count/len(list_action), i)
                writer.add_scalar('action_count/3', action_3_count/len(list_action), i)
                list_action = []

                writer.add_scalar('obs_state0/o00', O_np_val[0][0], i)
                writer.add_scalar('obs_state1/o01', O_np_val[0][1], i)
                writer.add_scalar('obs_state2/o02', O_np_val[0][2], i)
                writer.add_scalar('obs_state0/o10', O_np_val[1][0], i)
                writer.add_scalar('obs_state1/o11', O_np_val[1][1], i)
                writer.add_scalar('obs_state2/o12', O_np_val[1][2], i)
                writer.add_scalar('obs_state0/o20', O_np_val[2][0], i)
                writer.add_scalar('obs_state1/o21', O_np_val[2][1], i)
                writer.add_scalar('obs_state2/o22', O_np_val[2][2], i)
Example #10
def train(fullcover, name, setting):

    n_sample = 20

    # Environment
    env = FireEnvironment(64, 64)

    # Vehicle to generate observation mask
    vehicle = Vehicle(n_time_windows=1000, grid_size=(64,64), planner_type=setting['planner_type'])

    # Trainer and Estimator
    dyn_autoencoder = DynamicAutoEncoder(SETTING, grid_size = (env.map_width, env.map_height), n_state=3, n_obs=3, encoding_dim=16, gru_hidden_dim=16)

    # Train Data Buffer
    memory = SingleTrajectoryBuffer(N_MEMORY_SIZE)

    ### DQN agent
    dqn_agent = DQN_Agent(state_size=16, action_size=4, replay_memory_size=1000, batch_size=64, gamma=0.99, learning_rate=0.01, target_tau=0.01, update_rate=1, seed=0)

    # Train Iteration Logger
    from torch.utils.tensorboard import SummaryWriter
    writer = SummaryWriter()

    # Concatenate the settings into a single text entry for logging.
    setting_text = ''
    for k,v in setting.items():
        setting_text += k
        setting_text += str(v)
        setting_text += '\t'
    writer.add_text('setting', setting_text)


    ########################################
    ### Interacting with the Environment ###
    ########################################
    mask_obs, obs, state = env.reset()
    map_visit_mask, img_resized = vehicle.full_mask()
    state_est_grid = dyn_autoencoder.u_k

    ### Loss Monitors ###
    list_loss = []
    list_cross_entropy_loss = []
    list_entropy_loss = []
    list_rewards = []
    list_count_fire_visit = []
    list_count_all_fire = []
    list_action = []

    ### Filling the Data Buffer ###
    for i in tqdm.tqdm(range(N_TRAIN_WAIT)):
        # No action has been chosen yet during this warm-up phase, so fill the
        # buffer from full-coverage observations regardless of the planner setting.
        map_visit_mask, img_resized = vehicle.full_mask()

        mask_obs, obs, state, reward = env.step(map_visit_mask)
        memory.add(mask_obs, state, map_visit_mask)
        


    for i in tqdm.tqdm(range(N_TOTAL_TIME_STEPS)):

        # Determine an epsilon-greedy action from the current state.
        h_k = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()
        epsilon = 0.1
        action = dqn_agent.act(h_k, epsilon)
        list_action.append(action)    

        ### Collect Data from the Env. ###
        if fullcover:
            map_visit_mask, img_resized = vehicle.full_mask()
        else:
            map_visit_mask, img_resized = vehicle.plan_a_trajectory(state_est_grid, n_sample, action)
            
        
        mask_obs, obs, state, reward = env.step(map_visit_mask)
        memory.add(mask_obs, state, map_visit_mask)

        ### Run the Estimator ###
        state_est_grid = dyn_autoencoder.step(mask_obs, map_visit_mask)
        h_kp1 = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()

        #### Update the reinforcement learning agent ###
        dqn_agent.step(h_k, action, reward, h_kp1, done=False)

        list_rewards.append(reward)
        fire_count = (torch.sum(state[2])).item()
        fire_visit = (torch.sum(mask_obs.permute(2,0,1) * state[2].unsqueeze(0))).item()

        if fire_count < 1:
            print('no fire')
        else:
            list_count_fire_visit.append(fire_visit)
            list_count_all_fire.append(fire_count)

        ### Render the Env. and the Est. ###
        if i % N_RENDER_PERIOD == 0:
            img_env   = env.output_image()
            img_state_est_grid = dyn_autoencoder.output_image(state_est_grid)
            
            render('env', img_env, 1)
            render('img_state_est_grid', img_state_est_grid, 1)            


        ### Training ###
        loss_val, loss_val_cross, loss_val_ent, O_np_val =  dyn_autoencoder.update(memory, N_TRAIN_BATCH, N_TRAIN_WINDOW)
        list_loss.append(loss_val)
        list_cross_entropy_loss.append(loss_val_cross)
        list_entropy_loss.append(loss_val_ent)

        if i%N_LOGGING_PERIOD == 0:
            avg_loss = np.mean(np.array(list_loss))
            list_loss = []
            writer.add_scalar('dynautoenc/loss', avg_loss, i)

            avg_loss_cross = np.mean(np.array(list_cross_entropy_loss))
            list_cross_entropy_loss = []
            writer.add_scalar('dynautoenc/crossentropy', avg_loss_cross, i)

            avg_loss_entropy = np.mean(np.array(list_entropy_loss))
            list_entropy_loss = []
            writer.add_scalar('dynautoenc/shannonentropy', avg_loss_entropy, i)

            avg_reward = np.mean(np.array(list_rewards))
            list_rewards = []
            writer.add_scalar('perform/rewards', avg_reward, i)

            avg_count_fire_visit = np.mean(np.array(list_count_fire_visit))
            list_count_fire_visit = []
            writer.add_scalar('perform/avg_count_fire_visit', avg_count_fire_visit, i)

            avg_count_all_fire = np.mean(np.array(list_count_all_fire))
            list_count_all_fire = []
            writer.add_scalar('perform/avg_count_all_fire', avg_count_all_fire, i)


            action_0_count = list_action.count(0)
            action_1_count = list_action.count(1)
            action_2_count = list_action.count(2)
            action_3_count = list_action.count(3)
            list_action = []

            if setting['planner_type'] == 'Default':
                writer.add_scalar('action_count/0', action_0_count, i)
                writer.add_scalar('action_count/1', action_1_count, i)
                writer.add_scalar('action_count/2', action_2_count, i)
                writer.add_scalar('action_count/3', action_3_count, i)


            writer.add_scalar('obs_state0/o00', O_np_val[0][0], i)
            writer.add_scalar('obs_state1/o01', O_np_val[0][1], i)
            writer.add_scalar('obs_state2/o02', O_np_val[0][2], i)
            writer.add_scalar('obs_state0/o10', O_np_val[1][0], i)
            writer.add_scalar('obs_state1/o11', O_np_val[1][1], i)
            writer.add_scalar('obs_state2/o12', O_np_val[1][2], i)
            writer.add_scalar('obs_state0/o20', O_np_val[2][0], i)
            writer.add_scalar('obs_state1/o21', O_np_val[2][1], i)
            writer.add_scalar('obs_state2/o22', O_np_val[2][2], i)

            print('losses at iteration %d: total %.3f, cross %.3f, shannon %.3f' % (i, avg_loss, avg_loss_cross, avg_loss_entropy))
            print('memory size at iteration %d: %d' % (i, len(memory.obs_memory)))

        if (i+1)%N_SAVING_PERIOD==0:
            f_name = name
            dyn_autoencoder.save_the_model(i, f_name)
Example #11
from tqdm import tqdm
from time import sleep
from dqn_agent import DQN_Agent

import numpy as np

input_dim = 1
output_dim = 9
exp_replay_size = 256

agent = DQN_Agent(seed=1423,
                  layer_sizes=[input_dim, 16, output_dim],
                  lr=1e-3,
                  sync_freq=5,
                  exp_replay_size=exp_replay_size)
agent.load_pretrained_model("shortest-path-dqn.pth")

goal = 7

obs = 4
done = False
steps = [obs]

while not obs == goal:
    A = agent.get_action(np.array([obs]), 9, epsilon=0)
    print(str(obs) + ' -> ' + str(A.item()))

    obs = A.item()
    steps.append(A.item())

print(steps)
Example #12
def main():
    env = make(game='SonicTheHedgehog2-Genesis', state='EmeraldHillZone.Act1')
    
    # Parameters for observation image size processing.
    img_rows = 128
    img_cols = 128          
    img_stack = 4        

    action_size = 8         # 8 valid button combinations
    
    # Inputs to the agent's prediction network will have the following shape.
    input_size = (img_rows, img_cols, img_stack)
    
    # File paths
    stat_path = '../statistics/dqn'
    model_path = '../models/dqn'

    # Prioritized Experience Replay.
    if (PER_AGENT):
        print('PER agent')
        stat_path += '_PER'
        model_path+= '_PER'
        dqn_agent = DQN_PER_Agent(input_size, action_size)
    elif(DIST_AGENT):
        stat_path += '_DIST'
        model_path+= '_DIST'
        dqn_agent = DistributionalDQN(input_size, action_size)
    else:
        dqn_agent = DQN_Agent(input_size, action_size)

    
    # Use the Noisy Dueling Network.
    if (NOISY):
        stat_path += '_noisy_dueling'
        model_path += '_noisy_dueling'
        print('NOISY Dueling agent')
        dqn_agent.main_model = Networks.noisy_dueling_dqn(input_size, action_size, dqn_agent.main_lr)
        dqn_agent.target_model = Networks.noisy_dueling_dqn(input_size, action_size, dqn_agent.target_lr)
        dqn_agent.noisy = True
    # Use the normal dueling network.
    elif(DUELING and DIST_AGENT):
        stat_path += '_dueling'
        model_path += '_dueling'
        print('Dueling distributional')
        dqn_agent.main_model = Networks.dueling_C51(input_size, action_size, dqn_agent.main_lr)
        dqn_agent.target_model = Networks.dueling_C51(input_size, action_size, dqn_agent.target_lr)
    elif (DUELING):
        stat_path += '_dueling'
        model_path += '_dueling'
        print('Dueling agent')
        dqn_agent.main_model = Networks.dueling_dqn(input_size, action_size, dqn_agent.main_lr)
        dqn_agent.target_model = Networks.dueling_dqn(input_size, action_size, dqn_agent.target_lr)
    elif(DIST_AGENT):
        dqn_agent.main_model = Networks.C51(input_size, action_size, dqn_agent.main_lr)
        dqn_agent.target_model = Networks.C51(input_size, action_size, dqn_agent.target_lr)
    # Normal DQN.
    else:
        dqn_agent.main_model = Networks.dqn(input_size, action_size, dqn_agent.main_lr)
        dqn_agent.target_model = Networks.dqn(input_size, action_size, dqn_agent.target_lr)
    
    # Append correct suffix and filetype to paths.
    stat_path += '_stats.csv'
    main_model_path = model_path + '_main.h5'
    target_model_path = model_path + '_target.h5'

    # Load previous models.
    if (LOAD_MODELS):
        dqn_agent.load_models(main_model_path, target_model_path)

    # Modify the starting epsilon value.
    if (EPSILON == START):
        dqn_agent.epsilon = dqn_agent.initial_epsilon
    elif (EPSILON == MIDDLE):
        dqn_agent.epsilon = ((dqn_agent.initial_epsilon - dqn_agent.final_epsilon) / 2)
    else:
        dqn_agent.epsilon = dqn_agent.final_epsilon

    # One episode is 4500 steps if the zone is not completed:
    # 5 minutes of gameplay at 1/15th of a second per step (4 frames at 60 Hz).
    total_timestep = 0              # Total number of timesteps over all episodes.
    for episode in range(EPISODES):
        done = False
        reward_sum = 0          # Cumulative reward within the episode.
        timestep = 0            # Track timesteps within the episode.
        first_obs  = env.reset()

        # Experiences are a stack of the img_stack most recent frames to provide
        # temporal information. Initialize this sequence to the first
        # observation stacked 4 times.
        processed = preprocess_obs(first_obs, size=(img_rows, img_cols))
        # (img_rows, img_cols, img_stack)
        exp_stack = np.stack(([processed]*img_stack), axis = 2)
        # Expand dimensions so multiple exp_stacks can be submitted as a batch:
        # (1, img_rows, img_cols, img_stack).
        exp_stack = np.expand_dims(exp_stack, axis=0)  # 1x128x128x4
        
        # Punish the agent for not moving forward
        prev_state = {}
        steps_stuck = 0

        # Continue until the end of the zone is reached or 4500 timesteps have 
        # passed.
        while not done:
                # Predict an action to take based on the most recent
                # experience. 
                # 
                # Note that the first dimension 
                # (1, img_rows, img_cols, img_stack) is ignored by the
                # network here as it represents a batch size of 1.
                act_idx, action = dqn_agent.act(exp_stack)
                obs, reward, done, info = env.step(action)
                # env.render()
                
                # Punish the agent for standing still for too long.
                if (prev_state == info):
                    steps_stuck += 1
                else:
                    steps_stuck = 0
                prev_state = info

                # Position based reward does not include stagnation punishment.
                reward_sum += reward      
                if (steps_stuck > 20):
                    reward -= 1
                
                # Track various events
                timestep += 1
                total_timestep += 1

                obs = preprocess_obs(obs, size=(img_rows, img_cols))
                
                # Create a 1st dimension for stacking experiences and a 4th for 
                # stacking img_stack frames.
                obs = np.reshape(obs, (1, img_rows, img_cols, 1))
                
                # Append the new observation to the front of the stack and remove
                # the oldest (4th) frame.
                exp_stack_new = np.append(obs, exp_stack[:, :, :, :3], axis=3)

                # Save the experience: <state, action, reward, next_state, done>. 
                dqn_agent.save_memory(exp_stack, act_idx, reward, exp_stack_new, done)
                exp_stack = exp_stack_new
                
                # Skip training updates and epsilon decrements during the observation phase.
                if (total_timestep >= dqn_agent.observation_timesteps):
                     
                    # Update the target model with the main model's weights.
                    if ((total_timestep % dqn_agent.update_target_freq) == 0):
                        dqn_agent.update_target_model()

                    # Train the agent on saved experiences.
                    if ((total_timestep % dqn_agent.timestep_per_train) == 0):
                            dqn_agent.replay_update()
                            dqn_agent.save_models(main_model_path, target_model_path)
                        
                    if (dqn_agent.epsilon > dqn_agent.final_epsilon):
                        # Decrease epsilon by a fraction of the range so that it anneals
                        # from its initial to its final value over "exploration_timesteps" steps.
                        dec = ((dqn_agent.initial_epsilon - dqn_agent.final_epsilon) / dqn_agent.exploration_timesteps)
                        dqn_agent.epsilon -= dec

                # print(info)
                print("Episode:", episode, " Timestep:", timestep, " Action:", act_idx, " Episode Reward Sum:", reward_sum, " Epsilon:", dqn_agent.epsilon)
        
        # Append the episode's cumulative reward and timestep count to the stats file.
        with open(stat_path, "a") as stats_fd:
            reward_str = "Episode Cumulative Reward: " + str(reward_sum) + ", Episode Timesteps: " + str(timestep) + ",\n"
            stats_fd.write(reward_str)
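
Since each appended line follows the simple "Episode Cumulative Reward: <r>, Episode Timesteps: <t>," pattern written above, a small sketch of how the rewards could be read back for plotting might look like the following; the helper name is hypothetical and not part of the original code.

def read_episode_rewards(stat_path):
    # Parse the per-episode cumulative rewards out of the appended stats lines.
    rewards = []
    with open(stat_path) as stats_fd:
        for line in stats_fd:
            first_field = line.split(",")[0]            # "Episode Cumulative Reward: <r>"
            rewards.append(float(first_field.split(":")[1]))
    return rewards
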
Example #13
from dqn_agent import DQN_Agent
import gym
from tqdm import tqdm

env = gym.make('CartPole-v0')
input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n
exp_replay_size = 256
agent = DQN_Agent(seed=1423,
                  layer_sizes=[input_dim, 64, output_dim],
                  lr=1e-3,
                  sync_freq=5,
                  exp_replay_size=exp_replay_size)

# Main training loop
losses_list, reward_list, episode_len_list, epsilon_list = [], [], [], []
episodes = 10000
epsilon = 1

# Initialize the experience replay buffer.
index = 0
for i in range(exp_replay_size):
    obs = env.reset()
    done = False
    while not done:
        A = agent.get_action(obs, env.action_space.n, epsilon=1)
        obs_next, reward, done, _ = env.step(A.item())
        agent.collect_experience([obs, A.item(), reward, obs_next])
        obs = obs_next
        index += 1
        if index > exp_replay_size: