Example #1
  def __init__(self, environment, replay_memory, deep_q_network, args):
    self.env = environment
    self.mem = replay_memory
    self.net = deep_q_network
    self.buf = StateBuffer(args)
    self.num_actions = self.env.numActions()
    self.random_starts = args.random_starts
    self.history_length = args.history_length

    self.exploration_rate_start = args.exploration_rate_start
    self.exploration_rate_end = args.exploration_rate_end
    self.exploration_decay_steps = args.exploration_decay_steps
    self.exploration_rate_test = args.exploration_rate_test
    self.total_train_steps = args.start_epoch * args.train_steps

    self.train_frequency = args.train_frequency
    self.train_repeat = args.train_repeat
    self.target_steps = args.target_steps

    self.callback = None
Example #2
import random

from state_buffer import StateBuffer  # assumed import path for StateBuffer


class Agent:
	def __init__(self, environment, replay_memory, deep_q_network, args):
		self.env = environment
		self.mem = replay_memory
		self.net = deep_q_network
		self.buf = StateBuffer(args)
		self.num_actions = self.env.numActions()
		print(self.num_actions)
		self.random_starts = args.random_starts
		self.history_length = args.history_length

		self.exploration_rate_start = args.exploration_rate_start
		self.exploration_rate_end = args.exploration_rate_end
		self.exploration_decay_steps = args.exploration_decay_steps
		self.exploration_rate_test = args.exploration_rate_test
		self.total_train_steps = args.start_epoch * args.train_steps

		self.train_frequency = args.train_frequency
		self.train_repeat = args.train_repeat

		self.callback = None

	def _restartRandom(self):
		self.env.restart()
		tries = 3
		# perform random number of dummy actions to produce more stochastic games
		while tries:
			try:
				for i in range(random.randint(self.history_length, self.random_starts) + 1):
					reward = self.env.act(0)
					screen = self.env.getScreen()
					terminal = self.env.isTerminal()
					# assert not terminal, "terminal state occurred during random initialization"
					# add dummy states to buffer
					tries = 0
					self.buf.add(screen)
			except Exception as e:
				print(e)
				tries -= 1
				if tries <= -1:
					assert not terminal, "terminal state occurred during random initialization"
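
All of these examples lean on a `StateBuffer` that is never shown. Below is a minimal sketch of what such a buffer could look like, assuming it keeps the last `history_length` screens and hands them back batched with the current state first; the `screen_height`, `screen_width` and `batch_size` fields of `args` are assumptions, not the actual implementation.

import numpy as np

class StateBuffer:
    # Hypothetical stand-in for the StateBuffer used in these examples.
    def __init__(self, args):
        self.history_length = args.history_length
        self.dims = (args.screen_height, args.screen_width)
        self.batch_size = args.batch_size
        self.buffer = np.zeros((self.batch_size, self.history_length) + self.dims, dtype=np.uint8)

    def add(self, screen):
        # drop the oldest screen and append the newest one to the current state (slot 0)
        self.buffer[0, :-1] = self.buffer[0, 1:]
        self.buffer[0, -1] = screen

    def getStateMinibatch(self):
        # first item is the current state; the rest is zero padding so the
        # network can always be called with a full minibatch
        return self.buffer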
Example #3
  def __init__(self, environment, replay_memory, deep_q_network, args):
    self.env = environment
    self.mem = replay_memory
    self.net = deep_q_network
    self.buf = StateBuffer(args)
    self.num_actions = self.env.numActions()
    self.random_starts = args.random_starts
    self.history_length = args.history_length

    self.exploration_rate_start = args.exploration_rate_start
    self.exploration_rate_end = args.exploration_rate_end
    self.exploration_decay_steps = args.exploration_decay_steps
    self.exploration_rate_test = args.exploration_rate_test
    self.total_train_steps = args.start_epoch * args.train_steps

    self.train_frequency = args.train_frequency
    self.train_repeat = args.train_repeat

    self.callback = None
Example #4
import logging
import random

import numpy as np

from state_buffer import StateBuffer  # assumed import path for StateBuffer

logger = logging.getLogger(__name__)  # assumed logger setup


class Agent:
  def __init__(self, environment, replay_memory, deep_q_network, args):
    self.env = environment
    self.mem = replay_memory
    self.net = deep_q_network
    self.buf = StateBuffer(args)
    self.num_actions = self.env.numActions()
    self.random_starts = args.random_starts
    self.history_length = args.history_length

    self.exploration_rate_start = args.exploration_rate_start
    self.exploration_rate_end = args.exploration_rate_end
    self.exploration_decay_steps = args.exploration_decay_steps
    self.exploration_rate_test = args.exploration_rate_test
    self.total_train_steps = args.start_epoch * args.train_steps

    self.train_frequency = args.train_frequency
    self.train_repeat = args.train_repeat
    self.target_steps = args.target_steps

    self.callback = None

  def _restartRandom(self):
    self.env.restart()
    # perform random number of dummy actions to produce more stochastic games
    for i in range(random.randint(self.history_length, self.random_starts) + 1):
      reward = self.env.act(0)
      terminal = self.env.isTerminal()
      if terminal:
          self.env.restart()
      screen = self.env.getScreen()
      # add dummy states to buffer
      self.buf.add(screen)

  def _explorationRate(self):
    # calculate decaying exploration rate
    if self.total_train_steps < self.exploration_decay_steps:
      return self.exploration_rate_start - self.total_train_steps * (self.exploration_rate_start - self.exploration_rate_end) / self.exploration_decay_steps
    else:
      return self.exploration_rate_end

  def step(self, exploration_rate):
    # exploration rate determines the probability of random moves
    if random.random() < exploration_rate:
      action = random.randrange(self.num_actions)
      logger.debug("Random action = %d" % action)
    else:
      # otherwise choose action with highest Q-value
      state = self.buf.getStateMinibatch()
      # for convenience getStateMinibatch() returns minibatch
      # where first item is the current state
      qvalues = self.net.predict(state)
      assert len(qvalues[0]) == self.num_actions
      # choose highest Q-value of first state
      action = np.argmax(qvalues[0])
      logger.debug("Predicted action = %d" % action)

    # perform the action
    reward = self.env.act(action)
    screen = self.env.getScreen()
    terminal = self.env.isTerminal()

    # print reward
    if reward != 0:
      logger.debug("Reward: %d" % reward)

    # add screen to buffer
    self.buf.add(screen)

    # restart the game if over
    if terminal:
      logger.debug("Terminal state, restarting")
      self._restartRandom()

    # call callback to record statistics
    if self.callback:
      self.callback.on_step(action, reward, terminal, screen, exploration_rate)

    return action, reward, screen, terminal

  def play_random(self, random_steps):
    #call env.restart first so that env.reset is called before step.
    self.env.restart()
    # play given number of steps
    for i in range(random_steps):
      # use exploration rate 1 = completely random
      action, reward, screen, terminal = self.step(1)
      self.mem.add(action, reward, screen, terminal)

  def train(self, train_steps, epoch = 0):
    # do not do restart here, continue from testing
    #self._restartRandom()
    # play given number of steps
    for i in range(train_steps):
      # perform game step
      action, reward, screen, terminal = self.step(self._explorationRate())
      self.mem.add(action, reward, screen, terminal)
      # Update target network every target_steps steps
      if self.target_steps and i % self.target_steps == 0:
        self.net.update_target_network()
      # train after every train_frequency steps
      if self.mem.count > self.mem.batch_size and i % self.train_frequency == 0:
        # train for train_repeat times
        for j in range(self.train_repeat):
          # sample minibatch
          minibatch = self.mem.getMinibatch()
          # train the network
          self.net.train(minibatch, epoch)
      # increase number of training steps for epsilon decay
      self.total_train_steps += 1

  def test(self, test_steps, epoch = 0):
    # just make sure there is history_length screens to form a state
    self._restartRandom()
    # play given number of steps
    for i in range(test_steps):
      # perform game step
      self.step(self.exploration_rate_test)

  def play(self, num_games):
    # just make sure there is history_length screens to form a state
    self._restartRandom()
    for i in range(num_games):
      # play until terminal state
      terminal = False
      while not terminal:
        action, reward, screen, terminal = self.step(self.exploration_rate_test)
        # add experiences to replay memory for visualization
        self.mem.add(action, reward, screen, terminal)
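
Example #4 additionally syncs a target network every `target_steps` steps via `self.net.update_target_network()`. A minimal sketch of such a hard update is shown below; the `SimpleDQN` class and its `params`/`target_params` attributes are illustrative assumptions, not the network used above.

import numpy as np

class SimpleDQN:
    # Illustrative only: online and target parameters are plain NumPy arrays here.
    def __init__(self, num_params=4, size=8):
        self.params = [np.random.randn(size) for _ in range(num_params)]
        self.target_params = [p.copy() for p in self.params]

    def update_target_network(self):
        # hard update: copy every online parameter into the target network
        self.target_params = [p.copy() for p in self.params]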
Example #5
import logging
import random

import numpy as np

from state_buffer import StateBuffer  # assumed import path for StateBuffer

logger = logging.getLogger(__name__)  # assumed logger setup


class Agent:
  def __init__(self, environment, replay_memory, deep_q_network, args):
    self.env = environment
    self.mem = replay_memory
    self.net = deep_q_network
    self.buf = StateBuffer(args)
    self.num_actions = self.env.numActions()
    self.random_starts = args.random_starts
    self.history_length = args.history_length

    self.exploration_rate_start = args.exploration_rate_start
    self.exploration_rate_end = args.exploration_rate_end
    self.exploration_decay_steps = args.exploration_decay_steps
    self.exploration_rate_test = args.exploration_rate_test
    self.total_train_steps = args.start_epoch * args.train_steps

    self.train_frequency = args.train_frequency
    self.train_repeat = args.train_repeat

    self.callback = None

  def _restartRandom(self):
    self.env.restart()
    # perform random number of dummy actions to produce more stochastic games
    for i in range(random.randint(self.history_length, self.random_starts) + 1):
      reward = self.env.act(0)
      screen = self.env.getScreen()
      terminal = self.env.isTerminal()
      assert not terminal, "terminal state occurred during random initialization"
      # add dummy states to buffer
      self.buf.add(screen)

  def _explorationRate(self):
    # calculate decaying exploration rate
    if self.total_train_steps < self.exploration_decay_steps:
      return self.exploration_rate_start - self.total_train_steps * (self.exploration_rate_start - self.exploration_rate_end) / self.exploration_decay_steps
    else:
      return self.exploration_rate_end

  def step(self, exploration_rate):
    # exploration rate determines the probability of random moves
    if random.random() < exploration_rate:
      action = random.randrange(self.num_actions)
      logger.debug("Random action = %d" % action)
    else:
      # otherwise choose action with highest Q-value
      state = self.buf.getStateMinibatch()
      # for convenience getStateMinibatch() returns minibatch
      # where first item is the current state
      qvalues = self.net.predict(state)
      assert len(qvalues[0]) == self.num_actions
      # choose highest Q-value of first state
      action = np.argmax(qvalues[0])
      logger.debug("Predicted action = %d" % action)

    # perform the action
    reward = self.env.act(action)
    screen = self.env.getScreen()
    terminal = self.env.isTerminal()

    # print reward
    if reward != 0:
      logger.debug("Reward: %d" % reward)

    # add screen to buffer
    self.buf.add(screen)

    # restart the game if over
    if terminal:
      logger.debug("Terminal state, restarting")
      self._restartRandom()

    # call callback to record statistics
    if self.callback:
      self.callback.on_step(action, reward, terminal, screen, exploration_rate)

    return action, reward, screen, terminal

  def play_random(self, random_steps):
    # play given number of steps
    for i in range(random_steps):
      # use exploration rate 1 = completely random
      self.step(1)

  def train(self, train_steps, epoch = 0):
    # do not do restart here, continue from testing
    #self._restartRandom()
    # play given number of steps
    for i in range(train_steps):
      # perform game step
      action, reward, screen, terminal = self.step(self._explorationRate())
      self.mem.add(action, reward, screen, terminal)
      # train after every train_frequency steps
      if self.mem.count > self.mem.batch_size and i % self.train_frequency == 0:
        # train for train_repeat times
        for j in range(self.train_repeat):
          #logger.info("i=%d, j=%d, mem.count=%d" % (i, j, self.mem.count))
          # sample minibatch
          minibatch = self.mem.getMinibatch()
          # train the network
          self.net.train(minibatch, epoch)
      # increase number of training steps for epsilon decay
      self.total_train_steps += 1

  def test(self, test_steps, epoch = 0):
    # just make sure there is history_length screens to form a state
    self._restartRandom()
    # play given number of steps
    for i in range(test_steps):
      # perform game step
      self.step(self.exploration_rate_test)

  def play(self, num_games):
    # just make sure there is history_length screens to form a state
    self._restartRandom()
    for i in range(num_games):
      # play until terminal state
      terminal = False
      while not terminal:
        action, reward, screen, terminal = self.step(self.exploration_rate_test)
        # add experiences to replay memory for visualization
        self.mem.add(action, reward, screen, terminal)
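
The `_explorationRate()` method above anneals epsilon linearly from `exploration_rate_start` to `exploration_rate_end` over `exploration_decay_steps` steps. A small self-contained check of that schedule follows; the numeric constants are assumptions for illustration only.

exploration_rate_start = 1.0
exploration_rate_end = 0.1
exploration_decay_steps = 1000000

def exploration_rate(total_train_steps):
    # same linear decay as Agent._explorationRate(), using the constants above
    if total_train_steps < exploration_decay_steps:
        return exploration_rate_start - total_train_steps * \
            (exploration_rate_start - exploration_rate_end) / exploration_decay_steps
    return exploration_rate_end

for steps in (0, 250000, 500000, 1000000, 2000000):
    print(steps, round(exploration_rate(steps), 3))  # 1.0, 0.775, 0.55, 0.1, 0.1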
Example #6
 def train(self, num_run=1, restore=False):
     memory = None
     start_episode = 0
     start_updates = 0
     start_run = 0
     start_total_numsteps = 0
     start_running_episode_reward = 0
     start_running_episode_reward_100 = 0
     start_rewards = []
     start_last_episode_steps = 0
     start_episode_reward = 0
     start_episode_steps = 0
     start_timing = 0
     start_total_timing = 0
     
     # Restore Phase
     if restore:
         # TODO: Not tested deeply yet
         with open(self.folder + "memory.pkl", "rb") as pickle_out:
             memory = ReplayMemory(self.replay_size, self.seed)
             memory.load(pickle_out)
         with open(self.folder + "context.json", "r+") as pickle_out:
             (start_episode, start_run, start_updates, start_total_numsteps, start_running_episode_reward,
              start_running_episode_reward_100, start_last_episode_steps, start_episode_reward, start_episode_steps,
              start_timing, start_total_timing) = json.load(pickle_out)
         with open(self.folder + "rewards.pkl", "rb") as pickle_out:
             start_rewards = pickle.load(pickle_out)
         self.restore_model()
         self.logger.important("Load completed!")
     
     in_ts = time.time()
     
     # Start of the iteration on runs
     for i_run in range(start_run, num_run):
         
         # Break the loop if the phase "Save'n'Close" is triggered
         if self.env.is_save_and_close():
             break
         
         self.logger.important(f"START TRAINING RUN {i_run}")
         
         # Set Seed for repeatability
         torch.manual_seed(self.seed + i_run)
         np.random.seed(self.seed + i_run)
         self.env.seed(self.seed + i_run)
         self.env.action_space.np_random.seed(self.seed + i_run)
         
         # Setup TensorboardX
         writer_train = SummaryWriter(log_dir=self.folder + 'run_' + str(i_run) + '/train')
         writer_learn = SummaryWriter(log_dir=self.folder + 'run_' + str(i_run) + '/learn')
         writer_test = SummaryWriter(log_dir=self.folder + 'run_' + str(i_run) + '/test')
         
         # Setup Replay Memory: create new memory if is not the restore case
         if not restore:
             memory = ReplayMemory(self.replay_size, self.seed)
         # Create a backup memory for Forget-Phase
         backup_memory = copy.deepcopy(memory)
         
         # TRAINING LOOP
         # All these variables must be backed up and restored
         updates = start_updates
         total_numsteps = start_total_numsteps
         running_episode_reward = start_running_episode_reward
         running_episode_reward_100 = start_running_episode_reward_100
         rewards = start_rewards
         i_episode = start_episode
         last_episode_steps = start_last_episode_steps
         episode_reward = start_episode_reward
         episode_steps = start_episode_steps
         timing = start_timing
         total_timing = start_total_timing
         updates_episode = 0
         episode_images = list()
         
         '''
             LOOP: Episode
         '''
         while True:
             
             # Stop the robot
             self.env.stop_all_motors()
             
             # Wait for the human to leave the command
             while self.env.is_human_controlled():
                 pass
             
             # Let's forget (if it is the case)
             if self.env.is_forget_enabled():
                 # print('forget')
                 i_episode -= 1
                 print(len(memory))
                 # Restore Nets
                 self.restore_model()
                 self.env.reset_forget()
                 # Restore Memory
                 memory = copy.deepcopy(backup_memory)
                 print(len(memory))
                 # memory.forget_last(last_episode_steps)
                 self.logger.info("Last Episode Forgotten")
             elif i_episode != start_episode:
                 # LEARNING AND PRINTING PHASE
                 ep_print = i_episode - 1
                 last_episode_steps = episode_steps
                 if self.pics:
                     for i, image in enumerate(episode_images):
                         writer_train.add_image('episode_{}'
                                                .format(str(ep_print)), image.unsqueeze(0),
                                                i)
                 
                 if len(memory) > self.min_replay_size and ep_print > self.warm_up_episodes:
                     updates = self.learning_phase((last_episode_steps // 10) * 10 + 10, memory, updates,
                                                   writer_learn)
                 self.print_nets(writer_train, ep_print)
                 rewards.append(episode_reward)
                 running_episode_reward += (episode_reward - running_episode_reward) / (ep_print + 1)
                 if len(rewards) < 100:
                     running_episode_reward_100 = running_episode_reward
                 else:
                     last_100 = rewards[-100:]
                     running_episode_reward_100 = np.array(last_100).mean()
                 
                 writer_train.add_scalar('reward/train', episode_reward, ep_print)
                 writer_train.add_scalar('reward/steps', last_episode_steps, ep_print)
                 writer_train.add_scalar('reward/running_mean', running_episode_reward, ep_print)
                 writer_train.add_scalar('reward/running_mean_last_100', running_episode_reward_100, ep_print)
                 self.logger.info("Ep. {}/{}, t {}, r_t {}, 100_mean {}, time_spent {}s | {}s "
                                  .format(ep_print, self.num_episode, episode_steps, round(episode_reward, 2),
                                          round(running_episode_reward_100, 2), round(timing, 2),
                                          str(datetime.timedelta(seconds=total_timing))))
             
             # Security Wall, useful for longer training Phase
             while self.env.is_human_controlled():
                 pass
             
             # Let's test (if it is the case)
             if i_episode % self.eval_every == 0 and self.eval and i_episode != 0 and not restore:
                 # print('test')
                 self.test_phase(writer_test, i_run, updates)
                 # Wait for the human to leave the command
                 while self.env.is_human_controlled():
                     pass
             
             # TODO: HP Checkpoint and check correctness of checkpoint restoring
             if i_episode % self.eval_every == 0 and i_episode != 0 and not restore:
                 self.logger.important("Saving context...")
                 self.logger.info("To restart from here set this flag: --restore " + self.folder)
                 # Save Replay, net weights, hp, i_episode and i_run
                 with open(self.folder + "memory.pkl", "wb") as pickle_out:
                     memory.dump(pickle_out)
                 with open(self.folder + "context.json", "w+") as pickle_out:
                     json.dump((i_episode, i_run, updates, total_numsteps, running_episode_reward,
                                running_episode_reward_100, last_episode_steps, episode_reward, episode_steps,
                                timing, total_timing), pickle_out)
                 with open(self.folder + "rewards.pkl", "wb") as pickle_out:
                     pickle.dump(rewards, pickle_out)
                 self.backup_model()
                 if os.path.exists(self.folder[:-1] + "_bak" + self.folder[-1:]):
                     shutil.rmtree(self.folder[:-1] + "_bak" + self.folder[-1:])
                 print(self.folder[:-1] + "_bak" + self.folder[-1:])
                 shutil.copytree(self.folder, self.folder[:-1] + "_bak" + self.folder[-1:])
                 self.logger.important("Save completed!")
             
             # Limit of episode/run reached. Let's start a new RUN
             if i_episode > self.num_episode:
                 break
             
             # Backup NNs and memory (useful in case of Forget Phase)
             self.backup_model()
             backup_memory = copy.deepcopy(memory)
             
             # Setup the episode
             self.logger.important(f"START EPISODE {i_episode}")
             ts = time.time()
             episode_reward = episode_steps = 0
             done = False
             info = {'undo': False}
             state = self.env.reset()
             state_buffer = None
             
             # If you use CNNs, the use of StateBuffer is enabled (see doc).
             if self.pics:
                 state_buffer = StateBuffer(self.state_buffer_size, state)
                 state = state_buffer.get_state()
                 episode_images = list()
             updates_episode = 0
             
             # Start of the episode
             while not done:
                 if self.pics:
                     episode_images.append(state_buffer.get_tensor()[0])
                 
                 if i_episode < self.warm_up_episodes or len(memory) < self.min_replay_size:
                     # Warm_up phase -> Completely random choice of an action
                     action = self.env.action_space.sample()
                 else:
                     # Training phase -> Action sampled from policy
                     action = self.select_action(state)
                 
                 assert action.shape == self.env.action_space.shape
                 assert action is not None
                 writer_train.add_histogram('action_speed/episode_{}'
                                            .format(str(i_episode)), torch.tensor(action[0]), episode_steps)
                 writer_train.add_histogram('action_turn/episode_{}'
                                            .format(str(i_episode)), torch.tensor(action[1]), episode_steps)
                 
                 # Make the action
                 next_state, reward, done, info = self.env.step(action)
                 
                 # Save the step
                 if self.pics:
                     state_buffer.push(next_state)
                     next_state = state_buffer.get_state()
                 episode_steps += 1
                 total_numsteps += 1
                 episode_reward += reward
                 mask = 1 if done else float(not done)  # note: this expression always evaluates to 1; the conventional terminal mask is float(not done)
                 
                 # Push the transition in the memory only if n steps is greater than 5
                 # print('push')
                 if episode_steps > 5:
                     memory.push(state, action, reward, next_state, mask)
                 state = next_state
             print("Memory {}/{}".format(len(memory), self.replay_size))
             timing = time.time() - ts
             total_timing = time.time() - in_ts
             start_episode = 0
             i_episode += 1
             # Disable restore phase after the restored run
             restore = False
Example #7
import logging
import random

import numpy as np

from state_buffer import StateBuffer  # assumed import path for StateBuffer

logger = logging.getLogger(__name__)  # assumed logger setup


class GymAgent(object):
    def __init__(self,
                 env,  # e.g. gym.make('Breakout-v0')
                 net,
                 replay_memory,
                 exploration_strategy,
                 args):
        self.env = env
        self.net = net
        self.mem = replay_memory
        self.exploration_strategy = exploration_strategy
        self.buf = StateBuffer(args)
        self.history_length = args.history_length
        self.random_starts = args.random_starts  # used by _restart_random/train/test/play below
        #self.exploration_train_strategy = exploration_strategy.args.exploration_train_strategy
        #self.exploration_test_strategy = exploration_strategy.args.exploration_test_strategy
        self.train_net_frequency = args.train_net_frequency
        self.train_net_repeat = args.train_net_repeat

    def _restart_random(self):
        self.env.reset()
        # perform random number of dummy actions to produce more stochastic games
        for t in range(
                random.randint(self.history_length, self.random_starts) + 1):
            self.mem.action = self.env.action_space.sample()
            self.mem.observation, self.mem.reward, self.mem.done, self.mem.info = self.env.step(
                self.mem.action)
            assert not self.mem.done, "done state occurred during random initialization"

            # add dummy states to buffer
            # TODO: to be merged into replay_memory (self.mem) here: self.buf.add(self.mem.observation)

    def act(self, exploration_strategy):
        # FOR BASE AGENT, perhaps use: raise NotImplementedError
        callbacks.on_act_begin()
        # determine whether to explore
        action = exploration_strategy()
        if action is not None:
            logger.debug("Explore action = {}".format(action))
        else:
            # otherwise choose action with highest Q-value
            state = self.buf.getStateMinibatch()
            # for convenience getStateMinibatch() returns minibatch
            # where first item is the current state
            qvalues = self.net.predict(state)
            assert len(qvalues[0]) == self.env.action_space.n
            # choose highest Q-value of first state
            action = np.argmax(qvalues[0])
            logger.debug("Predicted action = {}".format(action))
        # perform the action, and update replay_memory
        self.mem.action = action
        self.mem.observation, self.mem.reward, self.mem.done, self.mem.info = self.env.step(
            self.mem.action)
        # add screen to buffer
        #self.buf.add(observation)
        # restart the game if over
        if self.mem.done:
            self._restart_random()
        # call callback to log progress
        # TODO: move this into the agent callback (use e.g. buf.observations[last index] rather than the locals above):
##        act_logs = {}
##        act_logs['observation'] = observation
##        act_logs['done'] = done
##        act_logs['reward'] = reward
##        act_logs['t'] = t
        self.callback.on_act_end(action)
        #see statistics vs monitor
        return self.mem.action, self.mem.observation, self.mem.reward, self.mem.done, self.mem.info

    def train(self, train_steps, episode=0):
        # TODO: check this; in particular we do not necessarily have 4 states ready for the convnet here
        # do not do restart here, continue from testing
        #self._restart_random()
        # play given number of steps
        for t in range(train_steps):
            # update agent replay memory regarding t
            self.mem.t = t
            # perform game step
            self.act(self.exploration_train_strategy)
            # train after every train_frequency steps
            if self.mem.count > self.mem.batch_size and t % self.train_net_frequency == 0:
                # train for train_repeat times
                for j in range(self.train_net_repeat):
                    # sample minibatch
                    minibatch = self.mem.getMinibatch()
                    # train the network
                    self.net.train(minibatch, episode)
            # restart the game if over
            if self.mem.done:
                # just make sure there is history_length screens to form a state
                # perform random number of dummy actions to produce more stochastic games
                if t < random.randint(self.history_length,
                                      self.random_starts) + 1:
                    self.act(self.exploration_strategy.play_random)

    def test(self, test_steps, episode=0):
        # play given number of steps
        for t in range(test_steps):
            # update agent replay memory regarding t
            # check if we trained
            if t == 0:
                test_start_t = self.mem.t
                # reset environment
                self.env.reset()
            self.mem.t = test_start_t + t
            # just make sure there is history_length screens to form a state
            # perform random number of dummy actions to produce more stochastic games
            if t < random.randint(self.history_length, self.random_starts) + 1:
                self.act(self.exploration_strategy.play_random)
            # perform game step
            self.act(self.exploration_test_strategy)

    def play(self, num_games):
        for t in range(num_games):
            # just make sure there is history_length screens to form a state
            # perform random number of dummy actions to produce more stochastic games
            if t < random.randint(self.history_length, self.random_starts) + 1:
                self.act(self.exploration_strategy.play_random)
            # play until terminal state
            while not self.mem.done:
                self.act(self.exploration_test_strategy)
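
Example #7 treats `exploration_strategy` as a callable that either returns a random action (explore) or `None` (exploit). A minimal sketch of such a strategy object is shown below; the class name and constructor arguments are assumptions for illustration only.

import random

class EpsilonGreedyStrategy:
    # Hypothetical shape of the exploration_strategy passed to GymAgent above.
    def __init__(self, epsilon, num_actions):
        self.epsilon = epsilon
        self.num_actions = num_actions

    def __call__(self):
        # return a random action with probability epsilon, otherwise None so
        # the agent falls back to the greedy Q-value action
        if random.random() < self.epsilon:
            return random.randrange(self.num_actions)
        return None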
Example #8
    def train(self, num_run=1):
        in_ts = time.time()
        for i_run in range(num_run):
            self.logger.important(f"START TRAINING RUN {i_run}")
            # Make the environment

            # Set Seed for repeatability
            torch.manual_seed(self.seed + i_run)
            np.random.seed(self.seed + i_run)
            self.env.seed(self.seed + i_run)
            self.env.action_space.np_random.seed(self.seed + i_run)

            # Setup TensorboardX
            writer_train = SummaryWriter(log_dir='runs/' + self.folder +
                                         'run_' + str(i_run) + '/train')
            writer_test = SummaryWriter(log_dir='runs/' + self.folder +
                                        'run_' + str(i_run) + '/test')

            # Setup Replay Memory
            memory = ReplayMemory(self.replay_size)

            # TRAINING LOOP
            total_numsteps = updates = running_episode_reward = running_episode_reward_100 = 0
            rewards = []
            i_episode = 0
            last_episode_steps = 0
            while True:
                self.env.stop_all_motors()
                while self.env.is_human_controlled():
                    continue
                if self.env.is_forget_enabled():
                    self.restore_model()
                    memory.forget_last(last_episode_steps)
                    i_episode -= 1
                    self.logger.info("Last Episode Forgotten")
                if self.env.is_test_phase():
                    self.test_phase(i_run, i_episode, writer_test)
                    continue
                if i_episode > self.num_episode:
                    break
                self.backup_model()
                self.logger.important(f"START EPISODE {i_episode}")
                ts = time.time()
                episode_reward = episode_steps = 0
                done = False
                info = {'undo': False}
                state = self.env.reset()
                state_buffer = None
                if self.pics:
                    state_buffer = StateBuffer(self.state_buffer_size, state)
                    state = state_buffer.get_state()

                critic_1_loss_acc = critic_2_loss_acc = policy_loss_acc = ent_loss_acc = alpha_acc = 0

                while not done:
                    if self.pics:
                        writer_train.add_image(
                            'episode_{}'.format(str(i_episode)),
                            state_buffer.get_tensor(), episode_steps)
                    if len(memory) < self.warm_up_steps:
                        action = self.env.action_space.sample()
                    else:
                        action = self.select_action(
                            state)  # Sample action from policy
                        if len(memory) > self.batch_size:
                            # Number of updates per step in environment
                            for i in range(self.updates_per_step):
                                # Update parameters of all the networks
                                critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = self.update_parameters(
                                    memory, self.batch_size, updates)

                                critic_1_loss_acc += critic_1_loss
                                critic_2_loss_acc += critic_2_loss
                                policy_loss_acc += policy_loss
                                ent_loss_acc += ent_loss
                                alpha_acc += alpha
                                updates += 1

                    next_state, reward, done, info = self.env.step(
                        action)  # Step
                    if self.pics:
                        state_buffer.push(next_state)
                        next_state = state_buffer.get_state()
                    episode_steps += 1
                    total_numsteps += 1
                    episode_reward += reward
                    mask = 1 if done else float(not done)  # note: this expression always evaluates to 1; the conventional terminal mask is float(not done)
                    memory.push(state, action, reward, next_state,
                                mask)  # Append transition to memory

                    state = next_state
                last_episode_steps = episode_steps
                i_episode += 1

                rewards.append(episode_reward)
                running_episode_reward += (episode_reward -
                                           running_episode_reward) / i_episode
                if len(rewards) < 100:
                    running_episode_reward_100 = running_episode_reward
                else:
                    last_100 = rewards[-100:]
                    running_episode_reward_100 = np.array(last_100).mean()
                writer_train.add_scalar('loss/critic_1',
                                        critic_1_loss_acc / episode_steps,
                                        i_episode)
                writer_train.add_scalar('loss/critic_2',
                                        critic_2_loss_acc / episode_steps,
                                        i_episode)
                writer_train.add_scalar('loss/policy',
                                        policy_loss_acc / episode_steps,
                                        i_episode)
                writer_train.add_scalar('loss/entropy_loss',
                                        ent_loss_acc / episode_steps,
                                        i_episode)
                writer_train.add_scalar('entropy_temperature/alpha',
                                        alpha_acc / episode_steps, i_episode)
                writer_train.add_scalar('reward/train', episode_reward,
                                        i_episode)
                writer_train.add_scalar('reward/running_mean',
                                        running_episode_reward, i_episode)
                writer_train.add_scalar('reward/running_mean_last_100',
                                        running_episode_reward_100, i_episode)
                self.logger.info(
                    "Ep. {}/{}, t {}, r_t {}, 100_mean {}, time_spent {}s | {}s "
                    .format(
                        i_episode, self.num_episode, episode_steps,
                        round(episode_reward, 2),
                        round(running_episode_reward_100, 2),
                        round(time.time() - ts, 2),
                        str(datetime.timedelta(seconds=time.time() - in_ts))))
            self.env.close()
Example #9
        # Setup Replay Memory
        memory = ReplayMemory(args.replay_size)

        # TRAINING LOOP
        total_numsteps = updates = running_episode_reward = running_episode_reward_100 = 0
        rewards = []

        for i_episode in itertools.count(1):
            print(updates)
            ts = time.time()
            episode_reward = episode_steps = 0
            done = False
            state = env.reset()
            if cnn:
                state_buffer = StateBuffer(args.state_buffer_size, state)
                state = state_buffer.get_state()

            critic_1_loss_acc = critic_2_loss_acc = policy_loss_acc = ent_loss_acc = alpha_acc = 0

            while not done:
                # if cnn:
                #     writer_train.add_images('episode_{}'.format(str(i_episode)), state_buffer.get_tensor(), episode_steps)
                if i_episode < args.warm_up_episode:
                    action = env.action_space.sample()  # Sample random action
                else:
                    action = agent.select_action(
                        state)  # Sample action from policy

                next_state, reward, done, _ = env.step(action)  # Step
                env.render()