Example #1
class TestDQNAgent(unittest.TestCase):
    def setUp(self):
        self.state_size = 3
        self.action_size = 5
        fc = nn.Sequential(nn.Linear(self.state_size, 5), nn.ReLU(),
                           nn.Linear(5, 7), nn.ReLU(), nn.Linear(7, 9),
                           nn.ReLU(), nn.Linear(9, self.action_size))
        self.main_model = QNetwork(name="my_network", fc=fc)
        self.target_model = QNetwork(name="my_network", fc=fc)
        self.agent = DQNAgent(main_model=self.main_model,
                              target_network=self.target_model,
                              memory=WeightedReplayBuffer(buffer_size=12,
                                                          batch_size=3))
        self.eps_greediness = 0.01

    def test_allruns(self):
        """ No explosions? """
        # act
        state_value = [random()] * self.agent.state_size
        self.agent.act(state=state_value, eps=self.eps_greediness)

        agent_learned = False
        while not agent_learned:  # I want to force a learning step.
            agent_learned = self.agent.step(
                state=[random()] * self.agent.state_size,
                action=np.random.randint(self.agent.action_size),
                reward=random(),
                next_state=[random()] * self.agent.state_size,
                done=random() > 0.75)
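
# A minimal, self-contained sketch of the epsilon-greedy rule that
# agent.act(state, eps) exercises above. The real DQNAgent.act() is not shown
# in this example; the helper below is illustrative only.
import random

def epsilon_greedy(q_values, eps):
    """With probability eps pick a random action index, otherwise the greedy one."""
    if random.random() < eps:
        return random.randrange(len(q_values))
    return max(range(len(q_values)), key=lambda a: q_values[a])

# e.g. epsilon_greedy([0.1, 0.7, 0.2], eps=0.01) returns 1 about 99% of the time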
Example #2
def main(args):
    with open(args.param, "r") as f:
        config = json.load(f)

    env = gym.make('Freeway-v0')
    env.seed(args.seed)
    env = FrameStack(env, config)

    print('State shape: ', env.observation_space.shape)
    print('Number of actions: ', env.action_space.n)
    agent = DQNAgent(state_size=200,
                     action_size=env.action_space.n,
                     config=config)
    #agent_r.load("models-28_11_2020_22:25:27/2000-")
    env = gym.wrappers.Monitor(env,
                               "./vid",
                               video_callable=lambda episode_id: True,
                               force=True)
    #agent.qnetwork_local.load_state_dict(torch.load('checkpoint-score80.47156817885116_epi_125.pth'))
    agent.qnetwork_local.load_state_dict(
        torch.load('search_results/models/eval-{}/_q_net.pth'.format(
            args.agent)))
    agent.encoder.load_state_dict(
        torch.load('search_results/models/eval-{}/_encoder.pth'.format(
            args.agent)))
    n_episodes = 1
    max_t = 3000
    eps = 0
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)

            next_state, reward, done, _ = env.step(action)
            score += reward
            time.sleep(0.01)
            state = next_state
            env.render()
            if done:
                break
        print("Episode {}  Reward {} Steps {}".format(i_episode, score, t))
    env.close()  # close once after all episodes
Example #3
def train_speed_agent(coach):
    """
    takes caoch(lstm) to modify the
    target reward function for the 
    agent
    
    """
    score = 0
    coaching_score_keep = []
    coaching_episode_keep = []
    env = gym.make('CartPole-v0')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    coaching = DQNAgent(len(env.reset()), env.action_space.n)
    done = False
    batch_size = 32
    index = 0
    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            #env.render()
            action = coaching.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            index = index + 1
            coaching.remember(state, action, reward, next_state, done, index)
            score = score + reward
            success = determine_sucess(done, score)
            coaching.lstm_data(state, action, reward, next_state, done,
                               success)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    e, EPISODES, time, coaching.epsilon))
                coaching_score_keep.append(score)
                coaching_episode_keep.append(e)
                score = 0
                break
        if len(coaching.memory) > batch_size:
            coaching.replay(batch_size, coach)
    return coaching, coaching_score_keep, coaching_episode_keep
Example #4
def train_expert():
    """
    craetes an agent that is trained to an optimal policy
    and captures all values required to train lstm
    """
    agent_score_keep = []
    agent_episode_keep = []
    score = 0
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(len(env.reset()), env.action_space.n)
    # agent.load("./save/cartpole-dqn.h5")
    done = False
    batch_size = 32
    index = 0
    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            #                env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            a_reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            index = index + 1
            agent.remember(state, action, a_reward, next_state, done, index)
            score = score + reward
            success = determine_sucess(done, score)
            agent.lstm_data(state, action, reward, next_state, done, success)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    e, EPISODES, score, agent.epsilon))
                agent_score_keep.append(score)
                agent_episode_keep.append(e)
                score = 0
                break
        if len(agent.memory) > batch_size:
            agent.replay(batch_size, None)
    return agent, agent_score_keep, agent_episode_keep
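
# train_expert() and train_speed_agent() above both rely on the
# remember()/replay() experience-replay pattern of DQNAgent. A minimal sketch
# of that pattern (illustrative only; not the project's implementation):
import random
from collections import deque

class MinimalReplayMemory:
    def __init__(self, maxlen=2000):
        self.memory = deque(maxlen=maxlen)

    def remember(self, state, action, reward, next_state, done):
        # Store one transition; old transitions fall off the left end.
        self.memory.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniformly sample a minibatch for a replay/learning step.
        return random.sample(self.memory, batch_size)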
                     input_shape=[len(obs)],
                     policy=policy,
                     obs_processer=obs_processer)

    agent.compile()

    result = []
    nb_episodes = 1000
    for episode in range(nb_episodes):
        agent.reset()
        observation = env.reset()
        observation = deepcopy(observation)
        agent.observe(observation)
        done = False
        while not done:
            action = deepcopy(agent.act())
            observation, reward, done, info = env.step(action)
            observation = deepcopy(observation)
            agent.observe(observation, reward, done)
            if done:
                break

        agent.training = False
        observation = env.reset()
        agent.observe(observation)
        done = False
        step = 0
        while not done:
            # env.render()  # display
            step += 1
            action = agent.act()
crt_num_episodes = 0
crt_ep_reward = 0.0
total_rewards = 0.0
render = False

i = 0
while agent.config['episodes_left']:
  obs = np.array(env.reset())
  
  done = False
  while not done:
    if render:
      env.render()

    action = agent.act(obs)
    new_obs, reward, done, _ = env.step(action)
    new_obs = np.array(new_obs)
    agent.remember(obs, action, reward, new_obs, done)
    
    # fname = get_img_name(agent.config, action)
    # matplotlib.image.imsave(fname, obs[:, :, 3])
    
    obs = new_obs
    crt_ep_reward += reward

    i += 1
    if len(agent.memory.buffer) >= 500:
      loss = agent.train()
      if i % 100 == 0:
        print('loss:', loss)
Example #7
    print('No weights found from previous learning session. Unable to proceed.')
    exit(-1)
return_history = []

for episodes in range(1, NUM_EPISODES + 1):
    # Reset the environment
    state = env.reset()
    # This reshape is needed to keep compatibility with Keras
    state = np.reshape(state, [1, state_size])
    # Cumulative reward is the return since the beginning of the episode
    cumulative_reward = 0.0
    for time in range(1, 500):
        # Render the environment for visualization
        env.render()
        # Select action
        action = agent.act(state)
        # Take action, observe reward and new state
        next_state, reward, done, _ = env.step(action)
        # Reshaping to keep compatibility with Keras
        next_state = np.reshape(next_state, [1, state_size])
        # Apply reward engineering to keep compatibility with how training was done
        reward = reward_engineering_mountain_car(state[0], action, reward, next_state[0], done)
        state = next_state
        # Accumulate reward
        cumulative_reward = agent.gamma * cumulative_reward + reward
        if done:
            print("episode: {}/{}, time: {}, score: {:.6}, epsilon: {:.3}"
                  .format(episodes, NUM_EPISODES, time, cumulative_reward, agent.epsilon))
            break
    return_history.append(cumulative_reward)
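
# reward_engineering_mountain_car() is not shown in this snippet. Purely as an
# illustration of reward shaping for MountainCar (not the source's function),
# one common idea is to add credit for position gain and for building speed:
def shaped_reward_sketch(state, action, reward, next_state, done):
    # state / next_state are [position, velocity]; coefficients are placeholders
    position_gain = next_state[0] - state[0]
    speed = abs(next_state[1])
    return reward + 10.0 * position_gain + 5.0 * speed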
Example #8
File: dqn_trainer.py  Project: lucms/DQN
class DQNTrainer:
    """
    Trainer for a Deep Q-Network agent on a given environment. Currently only supports epsilon-greedy exploration.

    Parameters
    ----------
    env : gym.Env
        Environment in which the training may occur.
    log_frequency : int, optional
        Frequency, in timesteps, to log training information. (Default is 1000)
    exploration : dict, optional
        Exploration algorithm to use in the training. Expects a dict in any of the following formats.
        1. For linear epsilon decay:
            {'algorithm': 'epsilon_greedy',
            'decay': 'linear',
            'initial_epsilon': 1.0,
            'final_epsilon': 0.01,
            'decay_timesteps': 1000}
        2. For exponential epsilon decay:
            {'algorithm': 'epsilon_greedy',
            'decay': 'exponential',
            'initial_epsilon': 1.0,
            'epsilon_decay': 0.995}
        (Default is the linear-decay dict shown in format 1.)
    **kwargs
        Optional keyword arguments for the DQNAgent's hyperparameters.

    Attributes
    ----------
    agent : DQNAgent
        Agent to be trained by the DQNTrainer on given environment.
    env : gym.Env
        Training environment for the agent.
    log_frequency : int
        Frequency, in timesteps, to log training information. (Default is 1000)
    exploration_config : dict
        Necessary parameters for setting the exploration algorithm, including which one to use.
    update_explo_param : function
        Updates the exploration parameter when called.


    """

    def __init__(self, env: gym.Env,
                 log_frequency=1000,
                 exploration=None,
                 **kwargs):

        self.log_frequency = log_frequency
        self.agent = DQNAgent(action_dim=env.action_space.n, state_dim=env.observation_space.shape[0], **kwargs)
        self.env = env

        # Avoid mutable argument
        if exploration is None:
            exploration = {'algorithm': 'epsilon_greedy',
                           'decay': 'linear',
                           'initial_epsilon': 1.0,
                           'final_epsilon': 0.01,
                           'decay_timesteps': 1000}
        self.exploration_config = exploration

        # Parse the exploration dict to make the update_explo_param function
        if self.exploration_config['algorithm'] == 'epsilon_greedy':
            if self.exploration_config['decay'] == 'linear':
                update_term = (self.exploration_config['initial_epsilon'] - self.exploration_config[
                    'final_epsilon']) / self.exploration_config['decay_timesteps']
                self.update_explo_param = (lambda epsilon: epsilon - update_term if epsilon > self.exploration_config[
                    'final_epsilon'] else epsilon)

            elif self.exploration_config['decay'] == 'exponential':
                self.update_explo_param = (lambda epsilon: epsilon * self.exploration_config['epsilon_decay'])
        else:
            raise NotImplementedError

    def train(self, num_timesteps=100000, render=False):
        """
        Perform the training loop for num_timesteps duration.

        Parameters
        ----------
        num_timesteps : int, optional
            Number of timesteps to perform during the training session. (Default is 100000)
        render : bool, optional
            Whether to render the environment or not. (Default is False)
        """

        # Initialize log metrics
        episode_rewards = []
        episode_lengths = []
        episode_losses = []
        num_episodes = 0
        episode_reward = 0
        episode_length = 0
        episode_loss = 0

        # Get initial state and set initial epsilon
        state = self.env.reset()
        epsilon = self.exploration_config['initial_epsilon']

        # Perform the training loop num_timesteps times
        for timestep in range(num_timesteps):
            # Get agent action and perform a step on the environment
            action = self.agent.act(state, epsilon)
            if render:
                self.env.render()
            next_state, reward, done, info = self.env.step(action)

            # Add S,A,R,S',d transition to the agent's buffer
            self.agent.replay_buffer.add_transition((state, action, reward, next_state, done))

            # Optimize the agent, if it has enough experiences stored
            if timestep > self.agent.batch_size:
                loss = self.agent.optimize(timestep)
            else:
                loss = 0

            # Update the epsilon value and log metrics
            epsilon = self.update_explo_param(epsilon)
            episode_reward += reward
            episode_loss += loss
            episode_length += 1

            # Reset the environment if the episode has ended, logging more information
            if done:
                num_episodes += 1
                episode_rewards.append(episode_reward)
                episode_losses.append(episode_loss)
                episode_lengths.append(episode_length)

                episode_reward = 0
                episode_loss = 0
                episode_length = 0

                state = self.env.reset()

            else:
                state = next_state

            # Report the log metrics every log_frequency timesteps
            if timestep % self.log_frequency == 0:
                print('Num episodes: ', num_episodes)
                print('Num timesteps: ', timestep)
                print('Mean episode reward: ', sum(episode_rewards[-20:]) / min(20, max(1, len(episode_rewards))))
                print('Mean episode loss: ', sum(episode_losses[-20:]) / min(20, max(1, len(episode_losses))))
                print('Mean episode length: ', sum(episode_lengths[-20:]) / min(20, max(1, len(episode_lengths))))
                print('\n')
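
# The exploration dict documented in DQNTrainer's docstring defines two
# epsilon schedules. A self-contained sketch of the two update rules the
# trainer builds in __init__ (values are the documented defaults; this is an
# illustration, not the trainer itself):
def linear_epsilon_update(epsilon, initial=1.0, final=0.01, decay_timesteps=1000):
    step = (initial - final) / decay_timesteps
    return epsilon - step if epsilon > final else epsilon

def exponential_epsilon_update(epsilon, epsilon_decay=0.995):
    return epsilon * epsilon_decay

eps = 1.0
for _ in range(5):
    eps = linear_epsilon_update(eps)  # drops by ~0.00099 per call until it reaches final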
Example #9
        for phase in range(7, 8):
            env.number_of_grids = phase + 3
            agent.epsilon = 1.0
            agent.memory = deque(maxlen=2000)
            phase_scores = deque(maxlen=5)
            for e in range(EPISODES):
                done = False
                state = env.reset()
                # env.seed(0)
                state = np.reshape(state, [1, state_size])
                score = 0
                # while not done:
                for _ in range(500):
                    # env.render()
                    possible_actions = env.get_possible_actions()
                    action = agent.act(state, possible_actions)
                    # if action not in possible_actions:
                    #     reward = -1*99999
                    #     env.done = True
                    #     done = True
                    # else:
                    next_state, reward, done, _ = env.step(action)

                    score += reward

                    next_state = np.reshape(next_state, [1, state_size])
                    agent.remember(state, action, reward, next_state, done)
                    state = next_state

                    if done:
                        print("episode: {}/{}, score: {}, e: {:.2}".format(
def train_agent(env, config):
    """
    Args:
    """

    # The encoder CNN converts the [1, 3, 84, 84] observation into a [1, 200] feature vector
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    torch.manual_seed(config["seed"])
    np.random.seed(config["seed"])
    #pathname = str(args.locexp) + "/" + str(args.env_name) + '_agent_' + str(args.policy)
    #pathname += "_batch_size_" + str(args.batch_size) + "_lr_act_" + str(args.lr_actor)
    #pathname += "_lr_critc_" + str(args.lr_critic) + "_lr_decoder_"
    pathname = dt_string
    tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
    agent = DQNAgent(state_size=200,
                     action_size=env.action_space.n,
                     config=config)
    writer = SummaryWriter(tensorboard_name)
    print("action_size {}".format(env.action_space.n))
    # eval_policy(env, agent, writer, 0, config)
    memory = ReplayBuffer((3, config["size"], config["size"]),
                          (1, ), config["expert_buffer_size"],
                          int(config["image_pad"]), config["device"])
    if config["create_buffer"]:
        create_buffer(env, memory, config)
        memory.load_memory("/export/leiningc/" + config["buffer_path"])
    else:
        print("load Buffer")
        memory.load_memory("/export/leiningc/" + config["buffer_path"])
        print("Buffer size {}".format(memory.idx))
    eps = config["eps_start"]
    eps_end = config["eps_end"]
    eps_decay = config["eps_decay"]
    scores_window = deque(maxlen=100)
    scores = []
    t0 = time.time()
    for i_episode in range(config["train_episodes"]):
        obs = env.reset()
        score = 0
        for t in range(config["max_t"]):
            action = agent.act(obs, eps)
            # action = env.action_space.sample()
            next_obs, reward, done_no_max, _ = env.step(action)
            done = done_no_max
            if t + 1 == config["max_t"]:
                print("t ", t)
                done = 0
            memory.add(obs, action, reward, next_obs, done, done_no_max)
            agent.step(memory, writer)
            obs = next_obs
            eps = max(eps_end, eps_decay * eps)  # decrease epsilon
            score += reward
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        ave_score = np.mean(scores_window)
        writer.add_scalar("ave_score", ave_score, i_episode)
        writer.add_scalar("episode_score", score, i_episode)
        print(
            '\rEpisode {} score {} \tAverage Score: {:.2f}  eps: {:.2f} time: {}'
            .format(i_episode, score, np.mean(scores_window), eps,
                    time_format(time.time() - t0)),
            end="")
        if i_episode % config["eval"] == 0:
            eval_policy(env, agent, writer, i_episode, config)
            agent.save(
                str(config["locexp"]) + "/models/eval-{}/".format(i_episode))
            print(
                'Episode {} Average Score: {:.2f}  eps: {:.2f} time: {}'.
                format(i_episode, np.mean(scores_window), eps,
                       time_format(time.time() - t0)), )
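
# train_agent() reads the keys below from its config dict. An illustrative
# config covering those keys (values are placeholders, not the project's
# defaults):
example_config = {
    "seed": 0,
    "locexp": "results",
    "size": 84,                    # observation height/width used by the replay buffer
    "expert_buffer_size": 100000,
    "image_pad": 4,
    "device": "cuda",
    "create_buffer": False,
    "buffer_path": "expert_buffer",
    "eps_start": 1.0,
    "eps_end": 0.01,
    "eps_decay": 0.995,
    "train_episodes": 1000,
    "max_t": 1000,
    "eval": 10,
}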
Example #11
def main(model=None, mode='train', start_episode=0):
    my_xml = '''<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
    <Mission xmlns="http://ProjectMalmo.microsoft.com" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
      <About>
        <Summary>Hill Descent.</Summary>
      </About>
      <ModSettings>
        <MsPerTick>20</MsPerTick>
      </ModSettings>
      <ServerSection>

        <ServerInitialConditions>

            <Time><StartTime>1</StartTime></Time>
        </ServerInitialConditions>
        <ServerHandlers>

          <DefaultWorldGenerator seed="-999595225643433963" forceReset="false" destroyAfterUse="false" />

          <ServerQuitFromTimeUp timeLimitMs="100000000"/>
          <ServerQuitWhenAnyAgentFinishes/>
        </ServerHandlers>
      </ServerSection>
      <AgentSection mode="Survival">
        <Name>Bob</Name>
        <AgentStart>
          <Placement x="28.5" y="87" z="330.5" pitch="-90" yaw="0"/>
        </AgentStart>
        <AgentHandlers>
          <DiscreteMovementCommands/>
          <MissionQuitCommands quitDescription="done"/>
          <ChatCommands/>
          <ObservationFromFullStats/>
          <ObservationFromGrid>
              <Grid name="sight">
                  <min x="{}" y="{}" z="{}"/>
                  <max x="{}" y="{}" z="{}"/>
              </Grid>
              <Grid name="feet">
                  <min x="0" y="-1" z="0"/>
                  <max x="0" y="-1" z="0"/>
              </Grid>
          </ObservationFromGrid>
          <AgentQuitFromTouchingBlockType>
              <Block type="cobblestone" />
          </AgentQuitFromTouchingBlockType>
        </AgentHandlers>
      </AgentSection>
    </Mission>

    '''.format(-(grid_width - 1) // 2, -grid_height, -(grid_width - 1) // 2,
               (grid_width - 1) // 2, grid_height, (grid_width - 1) // 2)

    batch_size = 100
    agent = DQNAgent(state_size, action_size, learning_rate, discount_rate,
                     epsilon, epsilon_min, epsilon_decay)
    if model != None:
        agent.load(model)
        if mode == 'test':
            agent.epsilon = 0.0
        print('loaded model: {}'.format(model))
    else:
        clear_csv('./data/results.csv')
        clear_csv('./data/moves.csv')

    my_client_pool = MalmoPython.ClientPool()
    my_client_pool.add(MalmoPython.ClientInfo("127.0.0.1", 10001))
    agent_host = MalmoPython.AgentHost()

    for e in range(start_episode + 1, episodes + 1):
        my_mission = MalmoPython.MissionSpec(my_xml, True)
        my_mission_record = MalmoPython.MissionRecordSpec()
        my_mission.requestVideo(800, 500)
        my_mission.setViewpoint(2)
        print("Waiting for the mission to start", end=' ')
        agent_host.startMission(
            my_mission,
            my_mission_record,
        )
        world_state = agent_host.getWorldState()
        while not world_state.has_mission_begun:
            print(".", end="")
            time.sleep(0.1)
            world_state = agent_host.getWorldState()
            for error in world_state.errors:
                print("Error:", error.text)
        print()
        agent_host.sendCommand('chat /kill @e[type=Chicken]')
        agent_host.sendCommand('chat /kill @e[type=Pig]')
        agent_host.sendCommand('chat /kill @e[type=Cow]')
        moves = 0
        episode_reward = 0

        while world_state.is_mission_running:
            world_state = agent_host.getWorldState()
            if world_state.number_of_observations_since_last_state > 0:
                try:
                    obvsText = world_state.observations[-1].text
                    data = json.loads(obvsText)
                except:
                    print("Error when getting state")
                    continue

                state = get_state(data)

                prev_x = data.get(u'XPos', 0)
                prev_y = data.get(u'YPos', 0)
                prev_z = data.get(u'ZPos', 0)

                useful_state = [state[2], state[6], state[7], state[8], \
                    state[10], state[11], state[13], \
                    state[14], state[16], state[17], \
                    state[18], state[22]]

                action = agent.act(useful_state)

                if ((action == 0 and state[grid_center - grid_width] == 0)
                        or (action == 1 and state[grid_center + 1] == 0) or
                    (action == 2 and state[grid_center + grid_width] == 0)
                        or (action == 3 and state[grid_center - 1] == 0)):
                    agent_host.sendCommand(jump_directions[action])
                else:
                    agent_host.sendCommand(directions[action])
                time.sleep(0.25)
                #print("North:", state[grid_center - grid_width], \
                #      "  East:", state[grid_center + 1], \
                #      "  South:", state[grid_center + grid_width], \
                #      "  West:", state[grid_center - 1])

                try:
                    world_state = wait_world_state(agent_host, world_state)
                    obvsText = world_state.observations[-1].text
                    data = json.loads(obvsText)
                except:
                    print("Error when getting state")
                    continue

                current_x = data.get(u'XPos', 0)
                current_y = data.get(u'YPos', 0)
                current_z = data.get(u'ZPos', 0)
                damage_taken = calculate_damage(prev_y, current_y)
                next_state = get_state(data)

                useful_next_state = [next_state[2], next_state[6], next_state[7], \
                    next_state[8], next_state[10], next_state[11], \
                    next_state[13], next_state[14], next_state[16], \
                    next_state[17], next_state[18], next_state[22]]

                # print("previous and current y", prev_y, current_y)
                # print("damage taken", damage_taken)
                #print("X:", prev_x, current_x, "\n", \
                #      "Y:", prev_y, current_y, "\n", \
                #      "Z:", prev_z, current_z, "\n")
                reward = 2 * (
                    prev_y - current_y
                ) - 50 * damage_taken - 1 if prev_x != current_x or prev_y != current_y or prev_z != current_z else -1000
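                # Worked example of the rule above (illustrative numbers):
                # descending 3 blocks with no damage while the position changed
                # gives reward = 2 * 3 - 50 * 0 - 1 = 5; not moving at all gives -1000.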
                episode_reward += reward
                done = True if current_y <= goal_height or not world_state.is_mission_running or data[
                    'Life'] <= 0 else False

                agent.remember(useful_state, action, reward, useful_next_state,
                               done)
                if ((action == 0 and state[grid_center - grid_width] == 0)
                        or (action == 1 and state[grid_center + 1] == 0) or
                    (action == 2 and state[grid_center + grid_width] == 0)
                        or (action == 3 and state[grid_center - 1] == 0)):
                    print(
                        'episode {}/{}, action: {}, reward: {}, e: {:.2}, move: {}, done: {}'
                        .format(e, episodes, jump_directions[action], reward,
                                agent.epsilon, moves, done))
                else:
                    print(
                        'episode {}/{}, action: {}, reward: {}, e: {:.2}, move: {}, done: {}'
                        .format(e, episodes, directions[action], reward,
                                agent.epsilon, moves, done))
                moves += 1

                if mode == 'train' or model == None:
                    write_to_csv('./data/moves.csv',
                                 [e, current_x, current_y, current_z, reward])

                if e > batch_size:
                    agent.replay(batch_size)

                if done or moves > max_moves:
                    agent_host.sendCommand("quit")

        if (mode == 'train'
                or model == None) and (e in checkpoints
                                       or agent.epsilon <= epsilon_min):
            print('saving model at episode {}'.format(e))
            agent.save('./models/model_{}'.format(e))
            if agent.epsilon <= epsilon_min:
                break

        time.sleep(1)
        # my_mission.forceWorldReset()
        if mode == 'train' or model == None:
            write_to_csv('./data/results.csv',
                         [e, episode_reward, moves,
                          int(episode_reward > 0)])
Example #12
class Executor:
    def __init__(self, config, env, eval_env):
        self.config = config
        self.env = env
        self.eval_env = eval_env

        self.stats = None

        self.student_agent = None

        self.evaluation_dir = None
        self.save_videos_path = None

        self.steps_reward = 0.0
        self.steps_error_in = 0.0
        self.steps_error_out = 0.0

        self.episode_duration = 0
        self.episode_reward = 0.0
        self.episode_error_in = 0.0
        self.episode_error_out = 0.0

        self.episode_visited = set()

        self.obs_images = None
        self.tr_info = None

        # ==============================================================================================================

        self.process = None
        self.run_id = None

        self.scripts_dir = None
        self.local_workspace_dir = None

        self.runs_local_dir = None
        self.summaries_dir = None
        self.checkpoints_dir = None
        self.copy_scripts_dir = None
        self.videos_dir = None

        self.save_summary_path = None
        self.save_model_path = None
        self.save_scripts_path = None
        self.save_videos_path = None

        self.plots_subdirs = None
        self.save_plots_paths = None

        self.session = None
        self.summary_writer = None
        self.saver = None

        self.teacher_agent = None

        self.rnd_rm = None

        self.rnd_uncertainty_c = None
        self.rnd_uncertainty_d = None

        self.action_advising_enabled = None
        self.action_advising_budget = None
        self.action_advising_method = None
        self.action_advising_rm_th = self.config['action_advising_rm_th']
        self.action_advising_check_rm = None

        # RND Observation normalization
        self.obs_running_mean = None
        self.obs_running_std = None
        self.obs_norm_n = 0
        self.obs_norm_max_n = 5000 if self.config['env_type'] == 1 else 1000

        self.data_collection_period = 500 if self.config[
            'env_type'] == 1 else 100  # Frames
        self.data_collection_step = 0
        self.data_rnd_uncertainty = None
        self.data_rnd_rm = None

        # Online counters
        self.rm_obs_counter_all = None
        self.rm_obs_counter_teacher = None
        self.rm_tr_counter_all = None
        self.rm_tr_counter_teacher = None

        # Snapshots
        self.data_rm_obs_counter_all = None
        self.data_rm_obs_counter_teacher = None
        self.data_rm_tr_counter_all = None
        self.data_rm_tr_counter_teacher = None

    # ------------------------------------------------------------------------------------------------------------------

    def render(self, env):
        if self.config['env_type'] == 0:
            return env.render()
        elif self.config['env_type'] == 1:
            return env.render_state()

    # ------------------------------------------------------------------------------------------------------------------

    def run(self):
        os.environ['PYTHONHASHSEED'] = str(self.config['seed'])
        random.seed(self.config['seed'])
        np.random.seed(self.config['seed'])
        tf.set_random_seed(self.config['seed'])

        self.run_id = self.config['run_id'] if self.config['run_id'] is not None \
            else strftime("%Y%m%d-%H%M%S", localtime()) + '-' + str(self.config['process_index'])
        self.seed_id = str(self.config['seed'])

        print('Run ID: {}'.format(self.run_id))

        self.scripts_dir = os.path.dirname(os.path.abspath(__file__))
        self.local_workspace_dir = os.path.join(
            str(pathlib.Path(self.scripts_dir).parent.parent.parent.parent))

        print('{} (Scripts directory)'.format(self.scripts_dir))
        print('{} (Local Workspace directory)'.format(
            self.local_workspace_dir))

        self.runs_local_dir = os.path.join(self.local_workspace_dir, 'Runs')
        os.makedirs(self.runs_local_dir, exist_ok=True)

        self.summaries_dir = os.path.join(self.runs_local_dir, 'Summaries')
        os.makedirs(self.summaries_dir, exist_ok=True)

        self.checkpoints_dir = os.path.join(self.runs_local_dir, 'Checkpoints')
        os.makedirs(self.checkpoints_dir, exist_ok=True)

        self.copy_scripts_dir = os.path.join(self.runs_local_dir, 'Scripts')
        os.makedirs(self.copy_scripts_dir, exist_ok=True)

        self.videos_dir = os.path.join(self.runs_local_dir, 'Videos')
        os.makedirs(self.videos_dir, exist_ok=True)

        self.plots_dir = os.path.join(self.runs_local_dir, 'Plots')
        os.makedirs(self.plots_dir, exist_ok=True)

        self.data_dir = os.path.join(self.runs_local_dir, 'Data')
        os.makedirs(self.data_dir, exist_ok=True)

        # --------------------------------------------------------------------------------------------------------------

        self.save_summary_path = os.path.join(self.summaries_dir, self.run_id,
                                              self.seed_id)
        self.save_model_path = os.path.join(self.checkpoints_dir, self.run_id,
                                            self.seed_id)
        self.save_scripts_path = os.path.join(self.copy_scripts_dir,
                                              self.run_id, self.seed_id)
        self.save_videos_path = os.path.join(self.videos_dir, self.run_id,
                                             self.seed_id)
        self.save_data_path = os.path.join(self.data_dir, self.run_id,
                                           self.seed_id)

        self.plots_subdirs = []
        self.plots_subdirs.append(os.path.join(self.plots_dir,
                                               'TD-Error-All'))  # 0
        self.plots_subdirs.append(
            os.path.join(self.plots_dir, 'State-Uncertainty'))  # 1
        self.plots_subdirs.append(os.path.join(self.plots_dir,
                                               'Combined'))  # 2
        self.plots_subdirs.append(os.path.join(self.plots_dir, 'ER'))  # 3
        for graphs_subdir in self.plots_subdirs:
            os.makedirs(graphs_subdir, exist_ok=True)

        self.save_plots_paths = [
            os.path.join(plots_subdir, self.run_id, self.seed_id)
            for plots_subdir in self.plots_subdirs
        ]

        for save_plots_path in self.save_plots_paths:
            os.makedirs(save_plots_path, exist_ok=True)

        if self.config['save_models']:
            os.makedirs(self.save_model_path, exist_ok=True)
        os.makedirs(self.save_videos_path, exist_ok=True)
        os.makedirs(self.save_data_path, exist_ok=True)

        self.copy_scripts(self.save_scripts_path)

        if self.config['use_gpu']:
            print('Using GPU.')
            session_config = tf.ConfigProto(intra_op_parallelism_threads=1,
                                            inter_op_parallelism_threads=1)
        else:
            print('Not using GPU.')
            session_config = tf.ConfigProto(intra_op_parallelism_threads=1,
                                            inter_op_parallelism_threads=1,
                                            allow_soft_placement=True,
                                            device_count={
                                                'CPU': 1,
                                                'GPU': 0
                                            })

        self.session = tf.InteractiveSession(graph=tf.get_default_graph(),
                                             config=session_config)
        self.summary_writer = tf.summary.FileWriter(self.save_summary_path,
                                                    self.session.graph)

        # --------------------------------------------------------------------------------------------------------------

        # Experiment setup format: abc (0: No Advising)
        # a: Action advising method
        # -- 1: Early advising
        # -- 2: Uniformly random advising
        # -- 3: Uncertainty based advising (RND)
        # -- 4: Uncertainty based bootstrapped DQN
        # b: Replay memory checking (True/False)
        # c: Budget

        if self.config['experiment_setup'] == 0:  # Self exploration
            self.action_advising_enabled = False
            self.action_advising_check_rm = False
            self.action_advising_budget = 0

        else:
            es_1 = self.config['experiment_setup']
            es_2 = es_1 % 100
            es_3 = es_2 % 10

            action_advising_budgets = {
                0: 500,
                1: 1000,
                2: 2500,
                3: 5000,
                4: 10000,
                5: 25000,
                6: 50000,
                7: 100000
            }

            self.action_advising_enabled = True
            self.action_advising_method = es_1 // 100
            self.action_advising_check_rm = es_2 // 10
            self.action_advising_budget = action_advising_budgets[es_3]
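            # Worked example of the 'abc' encoding above (illustrative):
            # experiment_setup = 312 -> es_2 = 312 % 100 = 12, es_3 = 12 % 10 = 2,
            # so action_advising_method = 312 // 100 = 3 (RND uncertainty),
            # action_advising_check_rm = 12 // 10 = 1, budget = action_advising_budgets[2] = 2500.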

        # --------------------------------------------------------------------------------------------------------------

        # Config to be passed to agents
        if self.config['env_type'] == 0:
            self.config['env_obs_dims'] = self.env.obs_space.shape
            self.config['env_n_actions'] = self.env.action_space.n
        elif self.config['env_type'] == 1:
            self.config['env_obs_dims'] = self.env.state_shape()
            self.config['env_n_actions'] = self.env.num_actions()

        student_agent_name = self.run_id.replace('-', '') + '0' + '_' + str(
            self.config['seed'])

        self.student_agent = DQNAgent(student_agent_name, self.config,
                                      self.session, 'task')
        self.config['student_model_name'] = self.student_agent.name
        print('Student agent name: {}'.format(self.student_agent.name))

        self.save_config(self.config,
                         os.path.join(self.save_summary_path, 'config.txt'))

        if self.config[
                'env_type'] == 1 and self.config['experiment_setup'] != 0:
            self.teacher_agent = DQNAgent(
                self.config['expert_agent_id'].replace("-", "") + '0_' +
                self.config['expert_agent_seed'], self.config, self.session,
                'task')
            print('Expert agent name: {}'.format(self.teacher_agent.name))

        # --------------------------------------------------------------------------------------------------------------

        # Initialize RND:
        if self.action_advising_check_rm == 2:  # RM checking with RND
            self.rnd_rm = RND(student_agent_name + '_RM', self.config,
                              self.session,
                              self.config['rm_rnd_learning_rate'],
                              self.config['rm_rnd_adam_epsilon'])
            self.student_agent.rnd_rm = self.rnd_rm

        # --------------------------------------------------------------------------------------------------------------

        if self.config['env_type'] == 0:
            n_data_points = int(self.config['n_training_frames'] /
                                self.data_collection_period) + 1

            self.data_rnd_uncertainty = \
                np.zeros((self.config['env_obs_dims'][0], self.config['env_obs_dims'][1], n_data_points), dtype=np.float32)
            self.data_rnd_rm = \
                np.zeros((self.config['env_obs_dims'][0], self.config['env_obs_dims'][1], n_data_points), dtype=np.float32)

            # Online counters
            self.rm_obs_counter_all = np.zeros(self.env.n_states, dtype=int)
            self.rm_obs_counter_teacher = np.zeros(self.env.n_states,
                                                   dtype=int)
            self.rm_tr_counter_all = np.zeros(self.env.n_transitions,
                                              dtype=int)
            self.rm_tr_counter_teacher = np.zeros(self.env.n_transitions,
                                                  dtype=int)

            # Snapshots
            self.data_rm_obs_counter_all = np.zeros(
                (self.env.n_states, n_data_points), dtype=int)
            self.data_rm_obs_counter_teacher = np.zeros(
                (self.env.n_states, n_data_points), dtype=int)
            self.data_rm_tr_counter_all = np.zeros(
                (self.env.n_transitions, n_data_points), dtype=int)
            self.data_rm_tr_counter_teacher = np.zeros(
                (self.env.n_transitions, n_data_points), dtype=int)

        # --------------------------------------------------------------------------------------------------------------

        total_parameters = 0
        for variable in tf.trainable_variables():
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        print('Number of parameters: {}'.format(total_parameters))

        self.saver = tf.train.Saver()
        self.session.run(tf.global_variables_initializer())

        self.stats = Statistics(self.summary_writer, self.session)

        # Restore
        if self.config['experiment_setup'] != 0:
            if self.config['env_type'] == 1:
                self.teacher_agent.restore(
                    self.checkpoints_dir, self.config['expert_agent_id'] +
                    '/' + self.config['expert_agent_seed'],
                    self.config['expert_agent_checkpoint'])

        if not self.config['save_models']:
            tf.get_default_graph().finalize()

        reward_is_seen = False

        if self.stats.n_env_steps % self.config['evaluation_period'] == 0:
            eval_score = self.evaluate()
            print('Evaluation @ {} | {}'.format(self.stats.n_env_steps,
                                                eval_score))

        render = self.stats.n_episodes % self.config[
            'visualization_period'] == 0

        if render:
            self.obs_images = []
            self.tr_info = []

        obs = None
        if self.config['env_type'] == 0:
            obs = self.env.reset()
        elif self.config['env_type'] == 1:
            self.env.reset()
            obs = self.env.state().astype(dtype=np.float32)

        if render:
            self.obs_images.append(self.render(self.env))

        if self.config['env_type'] == 0:
            self.record_data()

        while True:
            if self.obs_norm_n < self.obs_norm_max_n:
                obs_mean = obs.mean(axis=(0, 1))
                obs_std = obs.std(axis=(0, 1))
                if self.obs_norm_n == 0:
                    self.obs_running_mean = obs_mean
                    self.obs_running_std = obs_std
                else:
                    self.obs_running_mean = \
                        self.obs_running_mean + (obs_mean - self.obs_running_mean)/(self.obs_norm_n + 1)
                    self.obs_running_std = \
                        self.obs_running_std + (obs_std - self.obs_running_std) / (self.obs_norm_n + 1)
                self.obs_norm_n += 1

            if self.obs_norm_n == self.obs_norm_max_n:
                print(self.obs_running_mean)
                print(self.obs_running_std)
                self.obs_norm_n += 1
            #

            state_id = self.env.state_id_dict[(self.env.state.agent_pos[0], self.env.state.agent_pos[1])] \
                if self.config['env_type'] == 0 else None

            # ----------------------------------------------------------------------------------------------------------
            # Action Advising

            get_action_advice = False
            if self.action_advising_enabled and self.action_advising_budget > 0:
                if self.action_advising_method == 1:
                    get_action_advice = True
                elif self.action_advising_method == 2:
                    if random.random() < 0.5:
                        get_action_advice = True

            # Second-factor check for RM
            if get_action_advice and self.action_advising_check_rm != 0:
                if self.action_advising_check_rm == 1:
                    if self.rm_obs_counter_teacher[
                            state_id] >= self.action_advising_rm_th:
                        get_action_advice = False
                elif self.action_advising_check_rm == 2:
                    sparsity = self.rnd_rm.get_error(obs, normalize=True)
                    if sparsity >= self.action_advising_rm_th:
                        pass
                    else:
                        get_action_advice = False

            if get_action_advice:
                self.action_advising_budget -= 1
                self.stats.advices_taken += 1
                self.stats.advices_taken_cumulative += 1

                if self.config['env_type'] == 0:
                    action = self.env.optimal_action()
                elif self.config['env_type'] == 1:
                    action = self.teacher_agent.greedy_action(obs,
                                                              evaluation=True)

                source = 1
            else:
                action = self.student_agent.act(obs, evaluation=False)
                source = 0

            # ----------------------------------------------------------------------------------------------------------

            transition_id = self.env.transition_id_dict[(self.env.state.agent_pos[0], self.env.state.agent_pos[1], action)] \
                if self.config['env_type'] == 0 else None

            obs_next, reward, done = None, None, None
            if self.config['env_type'] == 0:
                obs_next, reward, done = self.env.step(action)
            elif self.config['env_type'] == 1:
                reward, done = self.env.act(action)
                obs_next = self.env.state().astype(dtype=np.float32)

            td_error = self.student_agent.get_td_error(obs, action, reward,
                                                       obs_next, done)

            if render:
                self.obs_images.append(self.render(self.env))

            self.episode_error_in += td_error
            self.episode_reward += reward
            self.episode_duration += 1

            self.steps_error_in += td_error
            self.steps_reward += reward
            self.stats.n_env_steps += 1

            if reward > 0 and reward_is_seen is False:
                reward_is_seen = True
                print(">>> Reward is seen at ", self.stats.n_episodes, "|",
                      self.episode_duration)

            if source == 1:
                if self.action_advising_check_rm == 2:
                    self.rnd_rm.train_model(obs,
                                            loss_id=0,
                                            is_batch=False,
                                            normalize=True)

            if self.config['env_type'] == 0:
                self.rm_obs_counter_all[state_id] += 1
                self.rm_tr_counter_all[transition_id] += 1
                if source == 1:
                    self.rm_obs_counter_teacher[state_id] += 1
                    self.rm_tr_counter_teacher[transition_id] += 1

            # ----------------------------------------------------------------------------------------------------------
            # Dropped data from RM

            old_transition = self.student_agent.feedback_observe(
                obs, action, reward, obs_next, done, source, state_id)

            if old_transition is not None:
                if self.action_advising_check_rm == 2 and old_transition[
                        5] == 1:
                    self.rnd_rm.train_model(old_transition[0],
                                            loss_id=1,
                                            is_batch=False,
                                            normalize=True)

                if self.config['env_type'] == 0:
                    old_state_id = old_transition[6]
                    old_action = old_transition[1]
                    old_agent_pos = self.env.agent_pos_dict[old_state_id]
                    old_transition_id = self.env.transition_id_dict[(
                        old_agent_pos[0], old_agent_pos[1], old_action)]

                    self.rm_obs_counter_all[old_state_id] -= 1
                    self.rm_tr_counter_all[old_transition_id] -= 1

                    if old_transition[5] == 1:
                        self.rm_obs_counter_teacher[old_state_id] -= 1
                        self.rm_tr_counter_teacher[old_transition_id] -= 1

            # ----------------------------------------------------------------------------------------------------------

            td_error_batch, loss = self.student_agent.feedback_learn()
            td_error_batch_sum = np.sum(td_error_batch)

            self.episode_error_out += td_error_batch_sum
            self.steps_error_out += td_error_batch_sum

            self.stats.loss += loss
            obs = obs_next

            if self.config[
                    'env_type'] == 0 and self.stats.n_env_steps % self.data_collection_period == 0:
                self.record_data()

            if done:
                self.action_advising_countdown = 0

                self.stats.n_episodes += 1
                self.stats.episode_reward_auc += np.trapz(
                    [self.stats.episode_reward_last, self.episode_reward])
                self.stats.episode_reward_last = self.episode_reward

                self.stats.update_summary_episode(
                    self.episode_reward, self.stats.episode_reward_auc,
                    self.episode_duration, self.episode_error_in,
                    self.episode_error_out)

                print(
                    'ER: {:.1f} ({}) (error: {:.3f}) @ {} frames - {}'.format(
                        self.episode_reward, self.stats.n_episodes,
                        self.episode_error_in, self.stats.n_env_steps,
                        self.stats.advices_taken_cumulative))

                if render:
                    self.write_video(
                        self.obs_images, '{}_{}'.format(
                            str(self.stats.n_episodes - 1),
                            str(self.stats.n_env_steps -
                                self.episode_duration)))
                    self.obs_images.clear()
                    self.tr_info.clear()

                self.episode_duration = 0
                self.episode_reward = 0.0
                self.episode_error_in = 0.0
                self.episode_error_out = 0.0

                render = self.stats.n_episodes % self.config[
                    'visualization_period'] == 0

                obs = None
                if self.config['env_type'] == 0:
                    obs = self.env.reset()
                elif self.config['env_type'] == 1:
                    self.env.reset()
                    obs = self.env.state().astype(dtype=np.float32)

                if render:
                    self.obs_images.append(self.render(self.env))

            # Per N steps summary update
            if self.stats.n_env_steps % self.stats.n_steps_per_update == 0:
                self.stats.steps_reward_auc += np.trapz(
                    [self.stats.steps_reward_last, self.steps_reward])
                self.stats.steps_reward_last = self.steps_reward
                self.stats.epsilon = self.student_agent.epsilon

                self.stats.update_summary_steps(self.steps_reward,
                                                self.stats.steps_reward_auc,
                                                self.steps_error_in,
                                                self.steps_error_out)

                self.stats.advices_taken = 0.0
                self.stats.exploration_steps_taken = 0
                self.steps_reward = 0.0
                self.steps_error_in = 0.0
                self.steps_error_out = 0.0

            if self.stats.n_env_steps % self.config['evaluation_period'] == 0:
                evaluation_score = self.evaluate()
                print('Evaluation ({}): {}'.format(self.stats.n_episodes,
                                                   evaluation_score))

            if self.config[
                    'save_models'] and self.stats.n_env_steps % self.config[
                        'model_save_period'] == 0:
                model_path = os.path.join(os.path.join(self.save_model_path),
                                          'model-{}.ckpt').format(
                                              self.stats.n_env_steps)
                print('[{}] Saving model... {}'.format(self.stats.n_env_steps,
                                                       model_path))
                self.saver.save(self.session, model_path)

            if self.stats.n_env_steps >= self.config['n_training_frames']:
                if self.config['save_models']:
                    model_path = os.path.join(
                        os.path.join(self.save_model_path),
                        'model-{}.ckpt').format(self.stats.n_env_steps)
                    print('[{}] Saving model... {}'.format(
                        self.stats.n_env_steps, model_path))
                    self.saver.save(self.session, model_path)
                break

        print('Env steps: {}'.format(self.stats.n_env_steps))

        if self.config['env_type'] == 0:
            self.save_data()

        self.session.close()

    def write_video(self, images, filename):
        v_w = np.shape(images[0])[0]
        v_h = np.shape(images[0])[1]
        filename_full = os.path.join(self.save_videos_path, str(filename))
        video = cv2.VideoWriter(filename_full + '.avi',
                                cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 20,
                                (v_h, v_w))
        for image in images:
            video.write(image)
        video.release()

    def copy_scripts(self, target_directory):
        if not os.path.exists(target_directory):
            os.makedirs(target_directory)
        files = glob.iglob(os.path.join(self.scripts_dir, '*.py'))
        for file in files:
            if os.path.isfile(file):
                shutil.copy2(file, target_directory)

    def save_config(self, config, filepath):
        with open(filepath, "w") as fo:
            for k, v in config.items():
                fo.write(str(k) + '>> ' + str(v) + '\n')

    def evaluate(self):
        eval_render = self.stats.n_evaluations % self.config[
            'evaluation_visualization_period'] == 0

        eval_total_reward = 0.0
        eval_duration = 0

        if self.config['env_type'] == 0 or self.config['env_type'] == 1:
            self.eval_env.set_random_state(self.config['env_evaluation_seed'])

        for i_eval_trial in range(self.config['n_evaluation_trials']):
            eval_obs_images = []

            eval_obs = None
            if self.config['env_type'] == 0:
                eval_obs = self.eval_env.reset()
            elif self.config['env_type'] == 1:
                self.eval_env.reset()
                eval_obs = self.eval_env.state().astype(dtype=np.float32)

            eval_episode_reward = 0.0
            eval_episode_duration = 0

            while True:
                if eval_render:
                    eval_obs_images.append(self.render(self.eval_env))

                eval_action = self.student_agent.greedy_action(eval_obs,
                                                               evaluation=True)

                eval_obs_next, eval_reward, eval_done = None, None, None
                if self.config['env_type'] == 0:
                    eval_obs_next, eval_reward, eval_done = self.eval_env.step(
                        eval_action)
                elif self.config['env_type'] == 1:
                    eval_reward, eval_done = self.eval_env.act(eval_action)
                    eval_obs_next = self.eval_env.state().astype(
                        dtype=np.float32)

                eval_episode_reward += eval_reward
                eval_duration += 1
                eval_episode_duration += 1
                eval_obs = eval_obs_next

                if eval_done:

                    if self.config['env_type'] == 0:
                        if eval_episode_reward == 1.0:
                            eval_episode_reward = 1.0 - (
                                eval_episode_duration - 24) / 76.0

                    if eval_render:
                        eval_obs_images.append(self.render(self.eval_env))
                        self.write_video(
                            eval_obs_images,
                            'E_{}_{}'.format(str(self.stats.n_episodes),
                                             str(self.stats.n_env_steps)))
                        eval_obs_images.clear()
                        eval_render = False
                    eval_total_reward += eval_episode_reward

                    break

        eval_mean_reward = eval_total_reward / float(
            self.config['n_evaluation_trials'])

        # Running area under the evaluation-reward curve, accumulated one unit-width
        # trapezoid (i.e. the mean of the previous and current evaluation) at a time.
        self.stats.evaluation_reward_auc += np.trapz(
            [self.stats.evaluation_reward_last, eval_mean_reward])
        self.stats.evaluation_reward_last = eval_mean_reward

        self.stats.n_evaluations += 1

        self.stats.update_summary_evaluation(eval_mean_reward, eval_duration,
                                             self.stats.evaluation_reward_auc)

        return eval_mean_reward

    # ------------------------------------------------------------------------------------------------------------------

    def record_data(self):
        # Grid
        for n in range(len(self.env.passage_positions[0])):
            y = self.env.passage_positions[0][n]
            x = self.env.passage_positions[1][n]
            obs = self.env.generate_obs((y, x))
            if self.action_advising_method == 3:
                self.data_rnd_uncertainty[y, x, self.data_collection_step] = \
                    self.student_agent.get_uncertainty(obs)
            if self.rnd_rm is not None:
                self.data_rnd_rm[y, x, self.data_collection_step] = \
                    self.rnd_rm.get_error(obs, normalize=True)

        self.data_rm_obs_counter_all[:, self.data_collection_step] = \
            self.rm_obs_counter_all.copy()
        self.data_rm_obs_counter_teacher[:, self.data_collection_step] = \
            self.rm_obs_counter_teacher.copy()
        self.data_rm_tr_counter_all[:, self.data_collection_step] = \
            self.rm_tr_counter_all.copy()
        self.data_rm_tr_counter_teacher[:, self.data_collection_step] = \
            self.rm_tr_counter_teacher.copy()

        self.data_collection_step += 1

    # ------------------------------------------------------------------------------------------------------------------

    def save_data(self):
        np.save(os.path.join(self.save_data_path, 'RND_Uncertainty.npy'),
                self.data_rnd_uncertainty)
        np.save(os.path.join(self.save_data_path, 'RND_RM.npy'),
                self.data_rnd_rm)
        np.save(os.path.join(self.save_data_path, 'RM_Obs_All.npy'),
                self.data_rm_obs_counter_all)
        np.save(os.path.join(self.save_data_path, 'RM_Obs_Teacher.npy'),
                self.data_rm_obs_counter_teacher)
        np.save(os.path.join(self.save_data_path, 'RM_TR_All.npy'),
                self.data_rm_tr_counter_all)
        np.save(os.path.join(self.save_data_path, 'RM_TR_Teacher.npy'),
                self.data_rm_tr_counter_teacher)
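
The get_error(obs, normalize=True) call above belongs to an RND (Random Network Distillation) model that is not part of this snippet. A rough sketch of the usual computation, assuming a fixed random target network and a separately trained predictor network (the names RNDModel, target_net, predictor_net, and running_max_error are all assumptions, not the author's code):

import torch
import torch.nn.functional as F


class RNDModel:
    def __init__(self, target_net, predictor_net):
        # The target network stays fixed and random; only the predictor is trained elsewhere.
        self.target_net = target_net
        self.predictor_net = predictor_net
        self.running_max_error = 1e-8  # crude normalizer for get_error

    def get_error(self, obs, normalize=False):
        # Prediction error of the predictor against the fixed target acts as a novelty signal.
        with torch.no_grad():
            x = torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0)
            error = F.mse_loss(self.predictor_net(x), self.target_net(x)).item()
        if normalize:
            self.running_max_error = max(self.running_max_error, error)
            return error / self.running_max_error
        return error
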
env = gym.make('LunarLander-v2')
env.seed(0)

print('State shape: ', env.observation_space.shape)
print('Number of actions: ', env.action_space.n)
agent = DQNAgent(state_size=8, action_size=4, seed=0)

agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))
memory = ReplayBuffer((8, ), (1, ), 20000, 'cuda')
n_episodes = 40
max_t = 500
eps = 0
for i_episode in range(1, n_episodes + 1):
    state = env.reset()
    score = 0
    for t in range(max_t):
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        score += reward
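        # 'done' is passed twice here, presumably because this ReplayBuffer stores both a
        # terminal flag and a separate "done because of a step limit" flag.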
        memory.add(state, action, reward, next_state, done, done)
        state = next_state
        # env.render()
        if done:
            print("Episode {}  Reward {}".format(i_episode, score))
            break

mkdir("", "expert_policy")
print("save memory ...")
memory.save_memory("expert_policy")
print("... memory saved")
Example #14
0
def run():
    # environment name
    env = gym.make('LunarLander-v2')
    plt.figure()

    all_scores = []
    all_losses = []
    all_t = []

    agent = DQNAgent(
        # LunarLander-v2 observations: x/y position, x/y velocity, lander angle,
        # angular velocity, and two boolean leg-contact flags (8 values in total)
        env.observation_space.shape[0],
        env.action_space.n,
        args
    )
    is_end = False

    for e in range(args.episodes):
        s_t0 = env.reset()
        reward_total = 0
        episode_loss = []
        is_win = False
        for t in range(args.max_steps):
            if args.is_render and len(all_scores):  # and all_scores[-1] > 0:
                # if e % 10 == 0 and all_scores[-1] > 0:
                env.render()
            a_t0 = agent.act(s_t0)
            s_t1, r_t1, is_end, _ = env.step(a_t0)

            reward_total += r_t1

            if t == args.max_steps - 1:
                r_t1 = -100
                is_end = True

            agent.replay_memory.push(
                (s_t0, a_t0, r_t1, s_t1, is_end)
            )
            s_t0 = s_t1

            if len(agent.replay_memory) > args.batch_size:
                loss = agent.replay()
                episode_loss.append(loss)

            if is_end:
                all_scores.append(reward_total)
                all_losses.append(np.mean(episode_loss))
                '''
                A final-step reward of +100 indicates a successful landing:
                https://github.com/openai/gym/blob/master/gym/envs/box2d/lunar_lander.py#L381
                '''
                if r_t1 >= 100:
                    is_win = True
                break

        all_t.append(t)
        metrics_episode = {
            'loss': all_losses[-1],
            'score': reward_total,
            't': t,
            'e': agent.epsilon,
            'is_win': is_win
        }

        if args.is_csv is True:
            CsvUtils.add_hparams(
                sequence_dir=os.path.join('.', args.sequence_name),
                sequence_name=args.sequence_name,
                run_name=args.run_name,
                args_dict=args.__dict__,
                metrics_dict=metrics_episode,
                global_step=e
            )
        else:
            logging.info(f'episode: {e}/{args.episodes} {metrics_episode}')
            print(f'episode: {e}/{args.episodes} ', metrics_episode)

        if e % 100 == 0 and not args.is_inference:
            # save logs, graphics and weights during training
            plt.clf()

            plt.subplot(3, 1, 1)
            plt.ylabel('Score')
            plt.plot(all_scores)

            plt.subplot(3, 1, 2)
            plt.ylabel('Loss')
            plt.plot(all_losses)

            plt.subplot(3, 1, 3)
            plt.ylabel('Steps')
            plt.plot(all_t)

            plt.xlabel('Episode')
            plt.savefig(os.path.join(seq_run_name, f'plt-{e}.png'))
            # Save a CPU copy of the weights without moving the live model off its device.
            torch.save({k: v.cpu() for k, v in agent.q_model.state_dict().items()},
                       os.path.join(seq_run_name, f'model-{e}.pt'))
    env.close()
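
Example #14's learning happens entirely inside agent.replay(), which is not shown. A minimal sketch of a standard single-network DQN update under that interface (the attribute names q_model, replay_memory, batch_size, gamma, and optimizer are guesses, and the real agent may additionally use a separate target network):

import numpy as np
import torch
import torch.nn.functional as F


def replay(self):
    # Sample a mini-batch of (s, a, r, s', done) tuples; a sample() method on the
    # replay memory is assumed here.
    batch = self.replay_memory.sample(self.batch_size)
    s, a, r, s_next, done = zip(*batch)

    s = torch.as_tensor(np.asarray(s), dtype=torch.float32)
    a = torch.as_tensor(a, dtype=torch.int64).unsqueeze(1)
    r = torch.as_tensor(r, dtype=torch.float32)
    s_next = torch.as_tensor(np.asarray(s_next), dtype=torch.float32)
    done = torch.as_tensor(done, dtype=torch.float32)

    # Q(s, a) for the actions actually taken.
    q_sa = self.q_model(s).gather(1, a).squeeze(1)

    # One-step TD target r + gamma * max_a' Q(s', a'), cut off at terminal states.
    with torch.no_grad():
        q_next = self.q_model(s_next).max(dim=1).values
        target = r + self.gamma * (1.0 - done) * q_next

    loss = F.mse_loss(q_sa, target)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return loss.item()
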
Example #15
0
def train(conf: dict) -> dict:

    env = gym.make(**conf['env'])
    env.seed(conf['seed'])

    conf['action_size'] = env.action_space.n
    conf['device'] = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")
    module, model_to_use = conf['model_to_use']
    model = getattr(globals()[module], model_to_use)
    conf['model'] = model
    crop_params = conf['preprocess']['exclude']
    n_episodes = conf['n_episodes']
    scores = []
    epsilons = []
    scores_window = deque(maxlen=20)
    eps = conf['eps_start']
    # Evaluate the agent based on the mean of the Q values on the fixed set
    # of states
    fixed_states = collect_fixed_set_of_states(conf, env)
    average_action_values = []

    agent = DQNAgent(**conf)
    agent_hps = np.inf

    for i_episode in range(1, n_episodes + 1):

        state = stack_frames(None, env.reset(), crop_params, True)
        score = 0
        epsilons.append(eps)
        eps = decay_epsilon(conf, i_episode)

        while True:
            # env.render()
            action = agent.act(state, eps)
            next_state, reward, done, info = env.step(action)

            if reward == 0.0 and not done:
                reward += -0.01

            if agent_hps == np.inf:
                agent_hps = info['ale.lives']

            elif info['ale.lives'] < agent_hps:
                reward += -50.0
                agent_hps += -1

            score += reward
            next_state = stack_frames(state, next_state, crop_params, False)
            agent.step(state, action, reward, next_state, done)
            state = next_state

            if done:
                break

        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        avg_av = agent.evaluate_on_fixed_set(fixed_states)
        average_action_values.append(avg_av)

        print(f'Episode {i_episode}\tAverage Score: '
              f'{round(np.mean(scores_window), 4)}\tEpsilon: {round(eps, 4)}\t'
              f'Average Q value: {round(avg_av, 4)}')

        if i_episode % conf['save_every'] == 0 and i_episode > 0:
            print(f'Saving model at iteration: {i_episode}')
            save_model(conf, agent)

    env.close()

    return {
        'scores': scores,
        'epsilons': epsilons,
        'avg_action_values': average_action_values
    }
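
Example #15 tracks the mean Q value over a fixed, held-out set of states as a training metric, in the spirit of the evaluation protocol from the original DQN paper, but neither collect_fixed_set_of_states nor evaluate_on_fixed_set is shown. A rough sketch of the idea (the attribute names qnetwork_local and device are guesses borrowed from the other examples, and the real helper presumably applies the same frame preprocessing as training):

import numpy as np
import torch


def collect_fixed_set_of_states(conf, env, n_states=100):
    # Roll out a random policy once and keep the first n_states observations;
    # conf is accepted only to match the call in Example #15.
    states = []
    obs = env.reset()
    while len(states) < n_states:
        states.append(obs)
        obs, _, done, _ = env.step(env.action_space.sample())
        if done:
            obs = env.reset()
    return np.stack(states)


def evaluate_on_fixed_set(self, fixed_states):
    # Average of max_a Q(s, a) over the fixed states; this should trend upward
    # as learning progresses, independently of the noisy episode scores.
    with torch.no_grad():
        states = torch.as_tensor(fixed_states, dtype=torch.float32, device=self.device)
        q_values = self.qnetwork_local(states)
    return q_values.max(dim=1).values.mean().item()
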
Example #16
0
def run():
    game = ple.games.flappybird.FlappyBird()
    # game = ple.games.snake.Snake(width=512, height=512)
    # game = ple.games.pong.Pong(width=512, height=512)
    p = ple.PLE(game, fps=30, display_screen=args.is_render)
    p.init()

    plt.figure()

    all_scores = []
    all_losses = []
    all_t = []

    agent = DQNAgent(len(p.getGameState()), len(p.getActionSet()), args)
    is_end = p.game_over()

    for e in range(args.episodes):
        p.reset_game()
        s_t0 = np.asarray(list(p.getGameState().values()), dtype=np.float32)
        reward_total = 0
        pipes = 0
        episode_loss = []
        for t in range(args.max_steps):
            a_t0_idx = agent.act(s_t0)
            a_t0 = p.getActionSet()[a_t0_idx]
            r_t1 = p.act(a_t0)
            is_end = p.game_over()
            s_t1 = np.asarray(list(p.getGameState().values()), dtype=np.float32)

            reward_total += r_t1

            '''
            from /PyGame-Learning-Environment/ple/games/base/pygamewrapper.py
            self.rewards = {
            "positive": 1.0,
            "negative": -1.0,
            "tick": 0,
            "loss": -5.0,
            "win": 5.0
            }
            '''
            if r_t1 == 1.0:
                pipes += 1

            if t == args.max_steps - 1:
                r_t1 = -100
                is_end = True

            agent.replay_memory.push(
                (s_t0, a_t0_idx, r_t1, s_t1, is_end)
            )
            s_t0 = s_t1

            if len(agent.replay_memory) > args.batch_size:
                loss = agent.replay()
                episode_loss.append(loss)

            if is_end:
                all_scores.append(reward_total)
                all_losses.append(np.mean(episode_loss))
                break

        all_t.append(t)

        metrics_episode = {
            'loss': all_losses[-1],
            'score': reward_total,
            't': t,
            'e': agent.epsilon,
            'pipes': pipes
        }

        if args.is_csv is True:
            CsvUtils.add_hparams(
                sequence_dir=os.path.join('.', args.sequence_name),
                sequence_name=args.sequence_name,
                run_name=args.run_name,
                args_dict=args.__dict__,
                metrics_dict=metrics_episode,
                global_step=e
            )
        else:
            logging.info(f'episode: {e}/{args.episodes} {metrics_episode}')
            print(f'episode: {e}/{args.episodes} ', metrics_episode)

        if e % 100 == 0 and not args.is_inference:
            # save logs, graphics and weights during training
            plt.clf()

            plt.subplot(3, 1, 1)
            plt.ylabel('Score')
            plt.plot(all_scores)

            plt.subplot(3, 1, 2)
            plt.ylabel('Loss')
            plt.plot(all_losses)

            plt.subplot(3, 1, 3)
            plt.ylabel('Steps')
            plt.plot(all_t)

            plt.xlabel('Episode')
            plt.savefig(os.path.join(seq_run_name, f'plt-{e}.png'))
            # Save a CPU copy of the weights without moving the live model off its device.
            torch.save({k: v.cpu() for k, v in agent.q_model.state_dict().items()},
                       os.path.join(seq_run_name, f'model-{e}.pt'))
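
Examples #14 and #16 both call agent.act(s_t0) and log agent.epsilon, which suggests an epsilon-greedy policy with a decay schedule inside the agent. A minimal sketch of such an act method (action_size, epsilon_min, and epsilon_decay are assumed attribute names):

import random

import numpy as np
import torch


def act(self, state):
    # With probability epsilon explore with a random action index, otherwise exploit.
    if random.random() < self.epsilon:
        action = random.randrange(self.action_size)
    else:
        with torch.no_grad():
            state_t = torch.as_tensor(np.asarray(state), dtype=torch.float32).unsqueeze(0)
            action = int(self.q_model(state_t).argmax(dim=1).item())
    # Decay exploration over time, but never below a floor.
    self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
    return action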