def __init__(self, environment, replay_memory, deep_q_network, args):
    """Wire up the agent's collaborators and cache hyperparameters from args.

    Args:
        environment: game environment; must expose numActions().
        replay_memory: experience-replay store, kept as self.mem.
        deep_q_network: Q-network used for action selection and training.
        args: parsed-options object providing the hyperparameters read below.
    """
    # Main collaborators.
    self.env = environment
    self.mem = replay_memory
    self.net = deep_q_network
    self.buf = StateBuffer(args)  # short-term screen buffer (project class, defined elsewhere)
    self.num_actions = self.env.numActions()
    # Episode-restart and exploration settings.
    self.random_starts = args.random_starts
    self.history_length = args.history_length
    self.exploration_rate_start = args.exploration_rate_start
    self.exploration_rate_end = args.exploration_rate_end
    self.exploration_decay_steps = args.exploration_decay_steps
    self.exploration_rate_test = args.exploration_rate_test
    # Epsilon decay resumes from the step count implied by start_epoch.
    self.total_train_steps = args.start_epoch * args.train_steps
    # Training cadence.
    self.train_frequency = args.train_frequency
    self.train_repeat = args.train_repeat
    self.target_steps = args.target_steps  # target-network sync period; 0/None presumably disables it -- confirm
    self.callback = None  # optional statistics hook, assigned by the owner after construction
class Agent:
    """DQN game agent: couples an environment, replay memory and Q-network.

    Fixed relative to the previous revision:
      * Python 3 syntax (``except Exception as e`` instead of the Python-2-only
        ``except Exception, e``; ``range`` instead of ``xrange``).
      * ``terminal`` could be referenced before assignment in the final assert
        when the very first dummy action raised; it is initialised up front now.
      * The retry counter could go negative (``while tries:`` is truthy at -1),
        looping forever; retries are now explicitly bounded.
      * Removed a stray debugging ``print`` of the action count.
    """

    def __init__(self, environment, replay_memory, deep_q_network, args):
        """Store collaborators and cache hyperparameters from args."""
        self.env = environment
        self.mem = replay_memory
        self.net = deep_q_network
        self.buf = StateBuffer(args)  # short-term screen buffer (project class, defined elsewhere)
        self.num_actions = self.env.numActions()
        self.random_starts = args.random_starts
        self.history_length = args.history_length
        self.exploration_rate_start = args.exploration_rate_start
        self.exploration_rate_end = args.exploration_rate_end
        self.exploration_decay_steps = args.exploration_decay_steps
        self.exploration_rate_test = args.exploration_rate_test
        # Epsilon decay resumes from the step count implied by start_epoch.
        self.total_train_steps = args.start_epoch * args.train_steps
        self.train_frequency = args.train_frequency
        self.train_repeat = args.train_repeat
        self.callback = None  # optional statistics hook, set externally

    def _restartRandom(self, max_tries=3):
        """Restart the game and take a random number of dummy actions so that
        consecutive games do not start from identical states.

        Each produced screen is pushed into the state buffer so a full history
        is available for the first real step.  The whole dummy-action sequence
        is retried up to ``max_tries`` times if the environment raises.
        """
        self.env.restart()
        terminal = False
        for _attempt in range(max_tries):
            try:
                # +1 so at least history_length screens end up in the buffer.
                for _ in range(random.randint(self.history_length, self.random_starts) + 1):
                    self.env.act(0)  # action 0 is assumed to be a no-op -- TODO confirm
                    screen = self.env.getScreen()
                    terminal = self.env.isTerminal()
                    # add dummy states to buffer
                    self.buf.add(screen)
                return
            except Exception as e:
                # Best-effort retry: report the failure and run the sequence again.
                print(e)
        # Every attempt failed; keep the original sanity check on the last state seen.
        assert not terminal, "terminal state occurred during random initialization"
def __init__(self, environment, replay_memory, deep_q_network, args):
    """Initialise the agent with its environment, replay memory and network.

    Args:
        environment: game environment exposing numActions().
        replay_memory: experience-replay store (self.mem).
        deep_q_network: Q-network (self.net).
        args: options object supplying every hyperparameter read below.
    """
    self.env = environment
    self.mem = replay_memory
    self.net = deep_q_network
    self.buf = StateBuffer(args)  # screen history buffer (project class, defined elsewhere)
    self.num_actions = self.env.numActions()
    # Parameters controlling random game restarts.
    self.random_starts = args.random_starts
    self.history_length = args.history_length
    # Epsilon-greedy exploration schedule.
    self.exploration_rate_start = args.exploration_rate_start
    self.exploration_rate_end = args.exploration_rate_end
    self.exploration_decay_steps = args.exploration_decay_steps
    self.exploration_rate_test = args.exploration_rate_test
    # Continue the decay schedule from a previous epoch, if any.
    self.total_train_steps = args.start_epoch * args.train_steps
    self.train_frequency = args.train_frequency
    self.train_repeat = args.train_repeat
    self.target_steps = args.target_steps  # steps between target-network syncs -- 0/None presumably disables; confirm
    self.callback = None  # statistics callback; owner assigns it after construction
class Agent:
    """DQN game agent (variant with target-network updates).

    Glues together the environment, the replay memory, the short-term state
    buffer and the Q-network: chooses actions epsilon-greedily, feeds
    transitions into replay memory, and drives training/testing loops.
    """

    def __init__(self, environment, replay_memory, deep_q_network, args):
        """Store collaborators and cache hyperparameters from args."""
        self.env = environment
        self.mem = replay_memory
        self.net = deep_q_network
        self.buf = StateBuffer(args)  # screen history buffer (project class, defined elsewhere)
        self.num_actions = self.env.numActions()
        self.random_starts = args.random_starts
        self.history_length = args.history_length
        self.exploration_rate_start = args.exploration_rate_start
        self.exploration_rate_end = args.exploration_rate_end
        self.exploration_decay_steps = args.exploration_decay_steps
        self.exploration_rate_test = args.exploration_rate_test
        # Epsilon decay resumes from the step count implied by start_epoch.
        self.total_train_steps = args.start_epoch * args.train_steps
        self.train_frequency = args.train_frequency
        self.train_repeat = args.train_repeat
        self.target_steps = args.target_steps  # target-network sync period used in train()
        self.callback = None  # optional statistics hook, set externally

    def _restartRandom(self):
        """Restart the game and take a random number of dummy (no-op) actions,
        pushing each screen into the state buffer, so games start stochastically
        and a full screen history exists before the first real step."""
        self.env.restart()
        # perform random number of dummy actions to produce more stochastic games
        for i in range(random.randint(self.history_length, self.random_starts) + 1):
            reward = self.env.act(0)  # action 0 is presumably a no-op -- confirm
            terminal = self.env.isTerminal()
            # unlike older revisions, a terminal state here just restarts again
            if terminal:
                self.env.restart()
            screen = self.env.getScreen()
            # add dummy states to buffer
            self.buf.add(screen)

    def _explorationRate(self):
        """Return the current epsilon: linear decay from start to end over
        exploration_decay_steps, constant at the end value afterwards."""
        # calculate decaying exploration rate
        if self.total_train_steps < self.exploration_decay_steps:
            return self.exploration_rate_start - self.total_train_steps * (self.exploration_rate_start - self.exploration_rate_end) / self.exploration_decay_steps
        else:
            return self.exploration_rate_end

    def step(self, exploration_rate):
        """Perform one epsilon-greedy game step.

        With probability exploration_rate a random action is taken; otherwise
        the Q-network's argmax action.  The resulting screen is buffered, the
        game is restarted on terminal states, and the optional callback is
        notified.  Returns (action, reward, screen, terminal).
        """
        # exploration rate determines the probability of random moves
        if random.random() < exploration_rate:
            action = random.randrange(self.num_actions)
            logger.debug("Random action = %d" % action)
        else:
            # otherwise choose action with highest Q-value
            state = self.buf.getStateMinibatch()
            # for convenience getStateMinibatch() returns minibatch
            # where first item is the current state
            qvalues = self.net.predict(state)
            assert len(qvalues[0]) == self.num_actions
            # choose highest Q-value of first state
            action = np.argmax(qvalues[0])
            logger.debug("Predicted action = %d" % action)
        # perform the action
        reward = self.env.act(action)
        screen = self.env.getScreen()
        terminal = self.env.isTerminal()
        # print reward
        if reward != 0:
            logger.debug("Reward: %d" % reward)
        # add screen to buffer
        self.buf.add(screen)
        # restart the game if over
        if terminal:
            logger.debug("Terminal state, restarting")
            self._restartRandom()
        # call callback to record statistics
        if self.callback:
            self.callback.on_step(action, reward, terminal, screen, exploration_rate)
        return action, reward, screen, terminal

    def play_random(self, random_steps):
        """Fill the replay memory with random_steps completely random steps."""
        # call env.restart first so that env.reset is called before step.
        self.env.restart()
        # play given number of steps
        for i in range(random_steps):
            # use exploration rate 1 = completely random
            action, reward, screen, terminal = self.step(1)
            self.mem.add(action, reward, screen, terminal)

    def train(self, train_steps, epoch = 0):
        """Run train_steps environment steps with decaying epsilon, storing
        transitions, syncing the target network every target_steps steps and
        training the Q-network every train_frequency steps."""
        # do not do restart here, continue from testing
        #self._restartRandom()
        # play given number of steps
        for i in range(train_steps):
            # perform game step
            action, reward, screen, terminal = self.step(self._explorationRate())
            self.mem.add(action, reward, screen, terminal)
            # Update target network every target_steps steps
            if self.target_steps and i % self.target_steps == 0:
                self.net.update_target_network()
            # train after every train_frequency steps
            if self.mem.count > self.mem.batch_size and i % self.train_frequency == 0:
                # train for train_repeat times
                for j in range(self.train_repeat):
                    # sample minibatch
                    minibatch = self.mem.getMinibatch()
                    # train the network
                    self.net.train(minibatch, epoch)
            # increase number of training steps for epsilon decay
            self.total_train_steps += 1

    def test(self, test_steps, epoch = 0):
        """Run test_steps evaluation steps at the fixed test exploration rate."""
        # just make sure there is history_length screens to form a state
        self._restartRandom()
        # play given number of steps
        for i in range(test_steps):
            # perform game step
            self.step(self.exploration_rate_test)

    def play(self, num_games):
        """Play num_games full games greedily (test epsilon), recording the
        experiences into replay memory for later visualization."""
        # just make sure there is history_length screens to form a state
        self._restartRandom()
        for i in range(num_games):
            # play until terminal state
            terminal = False
            while not terminal:
                action, reward, screen, terminal = self.step(self.exploration_rate_test)
                # add experiences to replay memory for visualization
                self.mem.add(action, reward, screen, terminal)
class Agent:
    """DQN game agent (variant without target network).

    Fixed relative to the previous revision: Python 3 compatible syntax —
    ``range`` replaces ``xrange`` and ``!=`` replaces the removed ``<>``
    operator.  Behaviour is otherwise unchanged.
    """

    def __init__(self, environment, replay_memory, deep_q_network, args):
        """Store collaborators and cache hyperparameters from args."""
        self.env = environment
        self.mem = replay_memory
        self.net = deep_q_network
        self.buf = StateBuffer(args)  # screen history buffer (project class, defined elsewhere)
        self.num_actions = self.env.numActions()
        self.random_starts = args.random_starts
        self.history_length = args.history_length
        self.exploration_rate_start = args.exploration_rate_start
        self.exploration_rate_end = args.exploration_rate_end
        self.exploration_decay_steps = args.exploration_decay_steps
        self.exploration_rate_test = args.exploration_rate_test
        # Epsilon decay resumes from the step count implied by start_epoch.
        self.total_train_steps = args.start_epoch * args.train_steps
        self.train_frequency = args.train_frequency
        self.train_repeat = args.train_repeat
        self.callback = None  # optional statistics hook, set externally

    def _restartRandom(self):
        """Restart the game and take a random number of dummy actions so games
        start stochastically and the state buffer holds a full history."""
        self.env.restart()
        # perform random number of dummy actions to produce more stochastic games
        for i in range(random.randint(self.history_length, self.random_starts) + 1):
            reward = self.env.act(0)  # action 0 is presumably a no-op -- confirm
            screen = self.env.getScreen()
            terminal = self.env.isTerminal()
            assert not terminal, "terminal state occurred during random initialization"
            # add dummy states to buffer
            self.buf.add(screen)

    def _explorationRate(self):
        """Return the current epsilon: linear decay from start to end over
        exploration_decay_steps, constant at the end value afterwards."""
        # calculate decaying exploration rate
        if self.total_train_steps < self.exploration_decay_steps:
            return self.exploration_rate_start - self.total_train_steps * (self.exploration_rate_start - self.exploration_rate_end) / self.exploration_decay_steps
        else:
            return self.exploration_rate_end

    def step(self, exploration_rate):
        """Perform one epsilon-greedy game step and return
        (action, reward, screen, terminal)."""
        # exploration rate determines the probability of random moves
        if random.random() < exploration_rate:
            action = random.randrange(self.num_actions)
            logger.debug("Random action = %d" % action)
        else:
            # otherwise choose action with highest Q-value
            state = self.buf.getStateMinibatch()
            # for convenience getStateMinibatch() returns minibatch
            # where first item is the current state
            qvalues = self.net.predict(state)
            assert len(qvalues[0]) == self.num_actions
            # choose highest Q-value of first state
            action = np.argmax(qvalues[0])
            logger.debug("Predicted action = %d" % action)
        # perform the action
        reward = self.env.act(action)
        screen = self.env.getScreen()
        terminal = self.env.isTerminal()
        # print reward
        if reward != 0:
            logger.debug("Reward: %d" % reward)
        # add screen to buffer
        self.buf.add(screen)
        # restart the game if over
        if terminal:
            logger.debug("Terminal state, restarting")
            self._restartRandom()
        # call callback to record statistics
        if self.callback:
            self.callback.on_step(action, reward, terminal, screen, exploration_rate)
        return action, reward, screen, terminal

    def play_random(self, random_steps):
        """Take random_steps completely random steps (exploration rate 1)."""
        # play given number of steps
        for i in range(random_steps):
            # use exploration rate 1 = completely random
            self.step(1)

    def train(self, train_steps, epoch = 0):
        """Run train_steps steps with decaying epsilon, storing transitions and
        training the network every train_frequency steps."""
        # do not do restart here, continue from testing
        #self._restartRandom()
        # play given number of steps
        for i in range(train_steps):
            # perform game step
            action, reward, screen, terminal = self.step(self._explorationRate())
            self.mem.add(action, reward, screen, terminal)
            # train after every train_frequency steps
            if self.mem.count > self.mem.batch_size and i % self.train_frequency == 0:
                # train for train_repeat times
                for j in range(self.train_repeat):
                    #logger.info("i=%d, j=%d, mem.count=%d" % (i, j, self.mem.count))
                    # sample minibatch
                    minibatch = self.mem.getMinibatch()
                    # train the network
                    self.net.train(minibatch, epoch)
            # increase number of training steps for epsilon decay
            self.total_train_steps += 1

    def test(self, test_steps, epoch = 0):
        """Run test_steps evaluation steps at the fixed test exploration rate."""
        # just make sure there is history_length screens to form a state
        self._restartRandom()
        # play given number of steps
        for i in range(test_steps):
            # perform game step
            self.step(self.exploration_rate_test)

    def play(self, num_games):
        """Play num_games full games greedily, recording experiences into
        replay memory for visualization."""
        # just make sure there is history_length screens to form a state
        self._restartRandom()
        for i in range(num_games):
            # play until terminal state
            terminal = False
            while not terminal:
                action, reward, screen, terminal = self.step(self.exploration_rate_test)
                # add experiences to replay memory for visualization
                self.mem.add(action, reward, screen, terminal)
def train(self, num_run=1, restore=False):
    """Run the full training procedure for `num_run` independent runs.

    Each run reseeds everything, opens its own TensorboardX writers and plays
    episodes until self.num_episode is exceeded.  Between episodes the method
    honours several operator-driven "phases" signalled by the environment:
    human control (wait), forget (roll nets + memory back one episode),
    save-and-close (abort all runs) and periodic test/checkpoint phases.

    Args:
        num_run: number of independent training runs to execute.
        restore: resume from the checkpoint files in self.folder
                 (memory.pkl / context.json / rewards.pkl) written by the
                 checkpointing block below.  TODO: noted as not deeply tested.
    """
    memory = None
    # Defaults for all restorable loop state; overwritten in the restore phase.
    start_episode = 0
    start_updates = 0
    start_run = 0
    start_total_numsteps = 0
    start_running_episode_reward = 0
    start_running_episode_reward_100 = 0
    start_rewards = []
    start_last_episode_steps = 0
    start_episode_reward = 0
    start_episode_steps = 0
    start_timing = 0
    start_total_timing = 0
    # Restore Phase
    if restore:
        # TODO: Not tested deeply yet
        with open(self.folder + "memory.pkl", "rb") as pickle_out:
            memory = ReplayMemory(self.replay_size, self.seed)
            memory.load(pickle_out)
        with open(self.folder + "context.json", "r+") as pickle_out:
            # Tuple order must match the json.dump in the checkpoint block below.
            (start_episode, start_run, start_updates, start_total_numsteps,
             start_running_episode_reward, start_running_episode_reward_100,
             start_last_episode_steps, start_episode_reward, start_episode_steps,
             start_timing, start_total_timing) = json.load(pickle_out)
        with open(self.folder + "rewards.pkl", "rb") as pickle_out:
            start_rewards = pickle.load(pickle_out)
        self.restore_model()
        self.logger.important("Load completed!")
    in_ts = time.time()
    # Start of the iteration on runs
    for i_run in range(start_run, num_run):
        # Break the loop if the phase "Save'n'Close" is triggered
        if self.env.is_save_and_close():
            break
        self.logger.important(f"START TRAINING RUN {i_run}")
        # Set Seed for repeatability
        torch.manual_seed(self.seed + i_run)
        np.random.seed(self.seed + i_run)
        self.env.seed(self.seed + i_run)
        self.env.action_space.np_random.seed(self.seed + i_run)
        # Setup TensorboardX
        writer_train = SummaryWriter(log_dir=self.folder + 'run_' + str(i_run) + '/train')
        writer_learn = SummaryWriter(log_dir=self.folder + 'run_' + str(i_run) + '/learn')
        writer_test = SummaryWriter(log_dir=self.folder + 'run_' + str(i_run) + '/test')
        # Setup Replay Memory: create new memory if is not the restore case
        if not restore:
            memory = ReplayMemory(self.replay_size, self.seed)
        # Create a backup memory for Forget-Phase
        backup_memory = copy.deepcopy(memory)
        # TRAINING LOOP
        # All these variables must be backed up and restored
        updates = start_updates
        total_numsteps = start_total_numsteps
        running_episode_reward = start_running_episode_reward
        running_episode_reward_100 = start_running_episode_reward_100
        rewards = start_rewards
        i_episode = start_episode
        last_episode_steps = start_last_episode_steps
        episode_reward = start_episode_reward
        episode_steps = start_episode_steps
        timing = start_timing
        total_timing = start_total_timing
        updates_episode = 0
        episode_images = list()
        ''' LOOP: Episode '''
        while True:
            # Stop the robot
            self.env.stop_all_motors()
            # Wait for the human to leave the command
            while self.env.is_human_controlled():
                pass
            # Let's forget (if it is the case)
            if self.env.is_forget_enabled():
                # print('forget')
                i_episode -= 1
                print(len(memory))
                # Restore Nets
                self.restore_model()
                self.env.reset_forget()
                # Restore Memory
                memory = copy.deepcopy(backup_memory)
                print(len(memory))
                # memory.forget_last(last_episode_steps)
                self.logger.info("Last Episode Forgotten")
            elif i_episode != start_episode:
                # LEARNING AND PRINTING PHASE
                # (skipped on the very first iteration after a start/restore,
                # when no episode has been completed yet)
                ep_print = i_episode - 1
                last_episode_steps = episode_steps
                if self.pics:
                    for i, image in enumerate(episode_images):
                        writer_train.add_image('episode_{}'
                                               .format(str(ep_print)), image.unsqueeze(0), i)
                if len(memory) > self.min_replay_size and ep_print > self.warm_up_episodes:
                    # Learn for a step budget rounded up to the next multiple of 10.
                    updates = self.learning_phase((last_episode_steps // 10) * 10 + 10,
                                                  memory, updates, writer_learn)
                self.print_nets(writer_train, ep_print)
                rewards.append(episode_reward)
                # Incremental running mean over all episodes so far.
                running_episode_reward += (episode_reward - running_episode_reward) / (ep_print + 1)
                if len(rewards) < 100:
                    running_episode_reward_100 = running_episode_reward
                else:
                    last_100 = rewards[-100:]
                    running_episode_reward_100 = np.array(last_100).mean()
                writer_train.add_scalar('reward/train', episode_reward, ep_print)
                writer_train.add_scalar('reward/steps', last_episode_steps, ep_print)
                writer_train.add_scalar('reward/running_mean', running_episode_reward, ep_print)
                writer_train.add_scalar('reward/running_mean_last_100', running_episode_reward_100, ep_print)
                self.logger.info("Ep. {}/{}, t {}, r_t {}, 100_mean {}, time_spent {}s | {}s "
                                 .format(ep_print, self.num_episode, episode_steps,
                                         round(episode_reward, 2),
                                         round(running_episode_reward_100, 2),
                                         round(timing, 2),
                                         str(datetime.timedelta(seconds=total_timing))))
            # Security Wall, useful for longer training Phase
            while self.env.is_human_controlled():
                pass
            # Let's test (if it is the case)
            if i_episode % self.eval_every == 0 and self.eval and i_episode != 0 and not restore:
                # print('test')
                self.test_phase(writer_test, i_run, updates)
                # Wait for the human to leave the command
                while self.env.is_human_controlled():
                    pass
            # TODO: HP Checkpoint and check correctness of checkpoint restoring
            if i_episode % self.eval_every == 0 and i_episode != 0 and not restore:
                self.logger.important("Saving context...")
                self.logger.info("To restart from here set this flag: --restore " + self.folder)
                # Save Replay, net weights, hp, i_episode and i_run
                with open(self.folder + "memory.pkl", "wb") as pickle_out:
                    memory.dump(pickle_out)
                with open(self.folder + "context.json", "w+") as pickle_out:
                    json.dump((i_episode, i_run, updates, total_numsteps,
                               running_episode_reward, running_episode_reward_100,
                               last_episode_steps, episode_reward, episode_steps,
                               timing, total_timing), pickle_out)
                with open(self.folder + "rewards.pkl", "wb") as pickle_out:
                    pickle.dump(rewards, pickle_out)
                self.backup_model()
                # Mirror the whole folder into a *_bak sibling directory.
                if os.path.exists(self.folder[:-1] + "_bak" + self.folder[-1:]):
                    shutil.rmtree(self.folder[:-1] + "_bak" + self.folder[-1:])
                print(self.folder[:-1] + "_bak" + self.folder[-1:])
                shutil.copytree(self.folder, self.folder[:-1] + "_bak" + self.folder[-1:])
                self.logger.important("Save completed!")
            # Limit of episode/run reached. Let's start a new RUN
            if i_episode > self.num_episode:
                break
            # Backup NNs and memory (useful in case of Forget Phase)
            self.backup_model()
            backup_memory = copy.deepcopy(memory)
            # Setup the episode
            self.logger.important(f"START EPISODE {i_episode}")
            ts = time.time()
            episode_reward = episode_steps = 0
            done = False
            info = {'undo': False}
            state = self.env.reset()
            state_buffer = None
            # If you use CNNs, the use of StateBuffer is enabled (see doc).
            if self.pics:
                state_buffer = StateBuffer(self.state_buffer_size, state)
                state = state_buffer.get_state()
            episode_images = list()
            updates_episode = 0
            # Start of the episode
            while not done:
                if self.pics:
                    episode_images.append(state_buffer.get_tensor()[0])
                if i_episode < self.warm_up_episodes or len(memory) < self.min_replay_size:
                    # Warm_up phase -> Completely random choice of an action
                    action = self.env.action_space.sample()
                else:
                    # Training phase -> Action sampled from policy
                    action = self.select_action(state)
                assert action.shape == self.env.action_space.shape
                assert action is not None
                # action[0]/action[1] are logged as speed/turn -- assumes a
                # 2-component continuous action; confirm against the env.
                writer_train.add_histogram('action_speed/episode_{}'
                                           .format(str(i_episode)), torch.tensor(action[0]), episode_steps)
                writer_train.add_histogram('action_turn/episode_{}'
                                           .format(str(i_episode)), torch.tensor(action[1]), episode_steps)
                # Make the action
                next_state, reward, done, info = self.env.step(action)
                # Save the step
                if self.pics:
                    state_buffer.push(next_state)
                    next_state = state_buffer.get_state()
                episode_steps += 1
                total_numsteps += 1
                episode_reward += reward
                mask = 1 if done else float(not done)
                # Push the transition in the memory only if n steps is greater than 5
                # print('push')
                if episode_steps > 5:
                    memory.push(state, action, reward, next_state, mask)
                state = next_state
                print("Memory {}/{}".format(len(memory), self.replay_size))
            timing = time.time() - ts
            total_timing = time.time() - in_ts
            # Clear the restore markers once an episode has completed.
            start_episode = 0
            i_episode += 1
            # Disable restore phase after the restored run
            # NOTE(review): dedent reconstructed from a collapsed source --
            # confirm this executes per-episode rather than per-run.
            restore = False
class GymAgent(object):
    """Prototype DQN-style agent for OpenAI Gym environments.

    Fixes relative to the previous revision (which did not parse):
      * ``__init__`` placed a non-default parameter after a default one, and
        the default itself (``Breakout - v0``) was a bare expression; the
        default was dropped — pass e.g. ``gym.make('Breakout-v0')``.
      * ``exporation_strategy`` typo: the attribute is now spelled
        ``exploration_strategy`` and bound to the constructor argument.
      * ``self.random_starts`` was read by several methods but never set.
      * Bare names ``observation``/``done``/``act`` referred to values held on
        ``self.mem``; they are now read from there, and the assert in
        ``_restart_random`` checks ``self.mem.done`` (``self.env.done`` does
        not exist on Gym envs).
      * ``train`` consulted ``self.train_frequency``, which was never set; it
        now uses ``self.train_net_frequency`` as initialised here.
      * ``play`` called ``act`` with an extra positional argument.
      * Python 3 ``range`` instead of ``xrange``.

    NOTE(review): ``self.exploration_train_strategy`` /
    ``self.exploration_test_strategy`` and ``self.callback`` are still never
    initialised here — see the TODOs below.
    """

    def __init__(self, env, net, replay_memory, exploration_strategy, args):
        """Store collaborators and cache hyperparameters from args."""
        self.env = env
        self.net = net
        self.mem = replay_memory
        self.exploration_strategy = exploration_strategy
        self.buf = StateBuffer(args)  # screen/state history buffer (project class, defined elsewhere)
        self.history_length = args.history_length
        self.random_starts = args.random_starts  # read by _restart_random/train/test/play
        # TODO: decide where the train/test strategies come from; the original
        # draft had them commented out:
        #self.exploration_train_strategy = exploration_strategy.args.exploration_train_strategy
        #self.exploration_test_strategy = exploration_strategy.args.exploration_test_strategy
        self.train_net_frequency = args.train_net_frequency
        self.train_net_repeat = args.train_net_repeat

    def _restart_random(self):
        """Reset the environment and take a random number of random actions so
        games do not all start identically; each step is recorded on self.mem
        and its observation pushed into the state buffer."""
        self.env.reset()
        # perform random number of dummy actions to produce more stochastic games
        for t in range(random.randint(self.history_length, self.random_starts) + 1):
            self.mem.action = self.env.action_space.sample()
            self.mem.observation, self.mem.reward, self.mem.done, self.mem.info = self.env.step(
                self.mem.action)
            assert not self.mem.done, "done state occurred during random initialization"
            # add dummy states to buffer
            self.buf.add(self.mem.observation)

    def act(self, exploration_strategy):
        """Choose and perform one action.

        The strategy callable may return an exploratory action (truthy) or a
        falsy value, in which case the Q-network's argmax action is used.  The
        transition is recorded on self.mem and the game restarted when done.
        Returns (action, observation, reward, done, info).
        """
        # FOR BASE AGENT, perhaps use: raise NotImplementedError
        callbacks.on_act_begin()  # NOTE(review): module-level `callbacks` must be provided elsewhere
        # determine whether to explore
        action = exploration_strategy()
        if action:
            logger.debug("Explore action = {}".format(action))
        else:
            # otherwise choose action with highest Q-value
            state = self.buf.getStateMinibatch()
            # for convenience getStateMinibatch() returns minibatch
            # where first item is the current state
            qvalues = self.net.predict(state)
            assert len(qvalues[0]) == self.env.action_space.n
            # choose highest Q-value of first state
            action = np.argmax(qvalues[0])
            logger.debug("Predicted action = {}".format(action))
        # perform the action, and update replay_memory
        self.mem.action = action
        self.mem.observation, self.mem.reward, self.mem.done, self.mem.info = self.env.step(
            self.mem.action)
        # restart the game if over
        if self.mem.done:
            self._restart_random()
        # call callback to log progress
        # TODO: self.callback is never initialised in __init__ -- wire it up
        self.callback.on_act_end(action)
        return action, self.mem.observation, self.mem.reward, self.mem.done, self.mem.info

    def train(self, train_steps, episode=0):
        """Run train_steps environment steps, training the network every
        train_net_frequency steps once a minibatch is available."""
        # do not do restart here, continue from testing
        #self._restart_random()
        # play given number of steps
        for t in range(train_steps):
            # update agent replay memory regarding t
            self.mem.t = t
            # perform game step
            # TODO: exploration_train_strategy is never initialised (see __init__)
            self.act(self.exploration_train_strategy)
            # train after every train_net_frequency steps
            if self.mem.count > self.mem.batch_size and t % self.train_net_frequency == 0:
                # train for train_net_repeat times
                for j in range(self.train_net_repeat):
                    # sample minibatch
                    minibatch = self.mem.getMinibatch()
                    # train the network
                    self.net.train(minibatch, episode)
            # restart the game if over
            if self.mem.done:
                # just make sure there is history_length screens to form a state:
                # perform random number of dummy actions to produce more stochastic games
                if t < random.randint(self.history_length, self.random_starts) + 1:
                    self.act(self.exploration_strategy.play_random)

    def test(self, test_steps, episode=0):
        """Run test_steps evaluation steps with the test exploration strategy."""
        # play given number of steps
        for t in range(test_steps):
            if t == 0:
                # remember where training left mem.t so it keeps increasing
                test_start_t = self.mem.t
                # reset environment
                self.env.reset()
            # update agent replay memory regarding t
            self.mem.t = test_start_t + t
            # just make sure there is history_length screens to form a state:
            # perform random number of dummy actions to produce more stochastic games
            if t < random.randint(self.history_length, self.random_starts) + 1:
                self.act(self.exploration_strategy.play_random)
            # perform game step
            # TODO: exploration_test_strategy is never initialised (see __init__)
            self.act(self.exploration_test_strategy)

    def play(self, num_games):
        """Play num_games full games with the test exploration strategy."""
        for t in range(num_games):
            # just make sure there is history_length screens to form a state:
            # perform random number of dummy actions to produce more stochastic games
            if t < random.randint(self.history_length, self.random_starts) + 1:
                self.act(self.exploration_strategy.play_random)
            # play until terminal state
            while not self.mem.done:
                self.act(self.exploration_test_strategy)
def train(self, num_run=1):
    """Run `num_run` independent SAC training runs on self.env.

    Each run reseeds torch/numpy/the environment, opens TensorboardX writers
    under runs/<folder>/run_<i>, builds a fresh replay memory and plays
    episodes until self.num_episode is exceeded.  Between episodes the
    environment can signal operator-driven phases: human control (busy-wait),
    forget (roll the model and memory back one episode) and test.
    """
    in_ts = time.time()
    for i_run in range(num_run):
        self.logger.important(f"START TRAINING RUN {i_run}")
        # Make the environment
        # Set Seed for repeatability
        torch.manual_seed(self.seed + i_run)
        np.random.seed(self.seed + i_run)
        self.env.seed(self.seed + i_run)
        self.env.action_space.np_random.seed(self.seed + i_run)
        # Setup TensorboardX
        writer_train = SummaryWriter(log_dir='runs/' + self.folder + 'run_' + str(i_run) + '/train')
        writer_test = SummaryWriter(log_dir='runs/' + self.folder + 'run_' + str(i_run) + '/test')
        # Setup Replay Memory
        memory = ReplayMemory(self.replay_size)
        # TRAINING LOOP
        total_numsteps = updates = running_episode_reward = running_episode_reward_100 = 0
        rewards = []
        i_episode = 0
        last_episode_steps = 0
        while True:
            # Stop the robot between episodes and wait out human control.
            self.env.stop_all_motors()
            while self.env.is_human_controlled():
                continue
            # Forget phase: drop the last episode from nets and memory.
            if self.env.is_forget_enabled():
                self.restore_model()
                memory.forget_last(last_episode_steps)
                i_episode -= 1
                self.logger.info("Last Episode Forgotten")
            # Test phase: evaluate, then go back to waiting.
            if self.env.is_test_phase():
                self.test_phase(i_run, i_episode, writer_test)
                continue
            # Episode budget for this run exhausted.
            if i_episode > self.num_episode:
                break
            # Snapshot the model so a later Forget phase can roll back.
            self.backup_model()
            self.logger.important(f"START EPISODE {i_episode}")
            ts = time.time()
            episode_reward = episode_steps = 0
            done = False
            info = {'undo': False}
            state = self.env.reset()
            state_buffer = None
            # With image observations, frames are stacked through a StateBuffer.
            if self.pics:
                state_buffer = StateBuffer(self.state_buffer_size, state)
                state = state_buffer.get_state()
            critic_1_loss_acc = critic_2_loss_acc = policy_loss_acc = ent_loss_acc = alpha_acc = 0
            while not done:
                if self.pics:
                    writer_train.add_image(
                        'episode_{}'.format(str(i_episode)), state_buffer.get_tensor(), episode_steps)
                if len(memory) < self.warm_up_steps:
                    # Warm-up: completely random action.
                    action = self.env.action_space.sample()
                else:
                    action = self.select_action(
                        state)  # Sample action from policy
                if len(memory) > self.batch_size:
                    # Number of updates per step in environment
                    for i in range(self.updates_per_step):
                        # Update parameters of all the networks
                        critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = self.update_parameters(
                            memory, self.batch_size, updates)
                        critic_1_loss_acc += critic_1_loss
                        critic_2_loss_acc += critic_2_loss
                        policy_loss_acc += policy_loss
                        ent_loss_acc += ent_loss
                        alpha_acc += alpha
                        updates += 1
                next_state, reward, done, info = self.env.step(
                    action)  # Step
                if self.pics:
                    state_buffer.push(next_state)
                    next_state = state_buffer.get_state()
                episode_steps += 1
                total_numsteps += 1
                episode_reward += reward
                mask = 1 if done else float(not done)
                memory.push(state, action, reward, next_state, mask)  # Append transition to memory
                state = next_state
            # Episode finished: update statistics and log to Tensorboard.
            last_episode_steps = episode_steps
            i_episode += 1
            rewards.append(episode_reward)
            # Incremental running mean over all episodes so far.
            running_episode_reward += (episode_reward - running_episode_reward) / i_episode
            if len(rewards) < 100:
                running_episode_reward_100 = running_episode_reward
            else:
                last_100 = rewards[-100:]
                running_episode_reward_100 = np.array(last_100).mean()
            # Per-episode average losses (accumulators divided by step count).
            writer_train.add_scalar('loss/critic_1', critic_1_loss_acc / episode_steps, i_episode)
            writer_train.add_scalar('loss/critic_2', critic_2_loss_acc / episode_steps, i_episode)
            writer_train.add_scalar('loss/policy', policy_loss_acc / episode_steps, i_episode)
            writer_train.add_scalar('loss/entropy_loss', ent_loss_acc / episode_steps, i_episode)
            writer_train.add_scalar('entropy_temperature/alpha', alpha_acc / episode_steps, i_episode)
            writer_train.add_scalar('reward/train', episode_reward, i_episode)
            writer_train.add_scalar('reward/running_mean', running_episode_reward, i_episode)
            writer_train.add_scalar('reward/running_mean_last_100', running_episode_reward_100, i_episode)
            self.logger.info(
                "Ep. {}/{}, t {}, r_t {}, 100_mean {}, time_spent {}s | {}s "
                .format(
                    i_episode, self.num_episode, episode_steps,
                    round(episode_reward, 2),
                    round(running_episode_reward_100, 2),
                    round(time.time() - ts, 2),
                    str(datetime.timedelta(seconds=time.time() - in_ts))))
    self.env.close()
# Setup Replay Memory memory = ReplayMemory(args.replay_size) # TRAINING LOOP total_numsteps = updates = running_episode_reward = running_episode_reward_100 = 0 rewards = [] for i_episode in itertools.count(1): print(updates) ts = time.time() episode_reward = episode_steps = 0 done = False state = env.reset() if cnn: state_buffer = StateBuffer(args.state_buffer_size, state) state = state_buffer.get_state() critic_1_loss_acc = critic_2_loss_acc = policy_loss_acc = ent_loss_acc = alpha_acc = 0 while not done: # if cnn: # writer_train.add_images('episode_{}'.format(str(i_episode)), state_buffer.get_tensor(), episode_steps) if i_episode < args.warm_up_episode: action = env.action_space.sample() # Sample random action else: action = agent.select_action( state) # Sample action from policy next_state, reward, done, _ = env.step(action) # Step env.render()