def __init__(self, state_size, action_size, actor_lr, critic_lr, tau,
             gamma, lambd, batch_size, memory_size,
             epsilon, epsilon_end, decay_step, load_model):
    self.state_size = state_size
    self.vel_size = 3
    self.action_size = action_size
    self.action_high = 1.5
    self.action_low = -self.action_high
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr
    self.tau = tau
    self.gamma = gamma
    self.lambd = lambd
    self.batch_size = batch_size
    self.memory_size = memory_size
    self.epsilon = epsilon
    self.epsilon_end = epsilon_end
    self.decay_step = decay_step
    self.epsilon_decay = (epsilon - epsilon_end) / decay_step

    self.sess = tf.Session()
    K.set_session(self.sess)

    self.actor, self.critic = self.build_model()
    self.target_actor, self.target_critic = self.build_model()
    self.actor_update = self.build_actor_optimizer()
    self.critic_update = self.build_critic_optimizer()
    self.sess.run(tf.global_variables_initializer())
    if load_model:
        self.load_model('./save_model/' + agent_name)
    self.target_actor.set_weights(self.actor.get_weights())
    self.target_critic.set_weights(self.critic.get_weights())

    self.memory = Memory(self.memory_size)
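# A minimal construction sketch for the DDPG-style agent whose __init__ appears
# above. The class name DDPGAgent and every hyperparameter value below are
# assumptions for illustration; only the constructor signature comes from the
# snippet itself.
agent = DDPGAgent(
    state_size=[4, 72, 128, 1],   # assumed image-sequence state shape
    action_size=3,
    actor_lr=1e-4,
    critic_lr=1e-3,
    tau=0.005,
    gamma=0.99,
    lambd=0.9,
    batch_size=32,
    memory_size=100000,
    epsilon=1.0,
    epsilon_end=0.05,
    decay_step=200000,
    load_model=False,
)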
def __init__(self, options, resume_previous_train):
    self.model = FinalDQN(options).to(device)
    self.target = FinalDQN(options).to(device)
    self.double_dqn, self.dueling_dqn, self.per, self.noisy_dqn, self.dist_dqn = options
    options_list = ["double", "dueling", "per", "noisy", "dist"]

    if self.per:
        self.memory = Memory(memory_size)
    else:
        self.memory = ExperienceReplay(memory_size)

    self.loss_func = nn.SmoothL1Loss()
    self.optimizer = optim.Adam(params=self.model.parameters(), lr=learning_rate)
    self.num_updates = 0

    if all(options):
        self.PATH = "Plots/RL/all/network_all.pth"
    elif not any(options):
        self.PATH = "Plots/RL/vanilla_dqn/network_vanilla_dqn.pth"
    else:
        zero_idx = list(options).index(0)
        self.PATH = ("Plots/RL/no_" + options_list[zero_idx] +
                     "_dqn/network_no_" + options_list[zero_idx] + "_dqn.pth")

    if resume_previous_train and os.path.exists(self.PATH):
        print("Loading previously saved model ... ")
        self.model.load_state_dict(load_model(self.PATH))
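# Usage sketch for the options-driven agent above: `options` is a 5-tuple of
# flags in the order (double, dueling, per, noisy, dist). The agent class name
# RainbowAblationAgent and the concrete flag values are assumptions for
# illustration; the path logic is taken from the constructor itself.
ablate_per = (1, 1, 0, 1, 1)   # everything enabled except PER
agent = RainbowAblationAgent(options=ablate_per, resume_previous_train=False)
# With this tuple the checkpoint path resolves to
# "Plots/RL/no_per_dqn/network_no_per_dqn.pth".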
class Agent:

    def __init__(self):
        self.env = gym.make("SpaceInvaders-v0")
        self.observation_size = self.env.observation_space
        self.action_size = self.env.action_space.n
        self.frame_size = [84, 84]
        self.stack_size = 4
        self.input_shape = [*self.frame_size, self.stack_size]

        MEMORY_SIZE = 30000
        self.memory = Memory(MEMORY_SIZE)
        self.INITIAL_MEMORY_SIZE = 10000

        self.EXPLORATION_RATE = 1
        self.EXPLORATION_DECAY = 0.9999
        self.EXPLORATION_MIN = 0.01
        self.UPDATE_MODEL_STEP = 10000
        self.TRAINING_FREQUENCY = 4

        self.model = Model(self.input_shape, self.action_size)
        self.model.update_model()

        self.sess_path = r"C:\Projects\Personal Projects\Saved_Sessions\Space_Invader\last_train_sess.pkl"

    def preprocess_frame(self, frame):
        '''
        FUNCTION: pre-process the frames for training
        - grey scale
        - crop the edges
        - normalise the pixel values
        - resize the frame
        '''
        gray = rgb2gray(frame)
        cropped_frame = gray[8:-12, 4:-12]
        normalized_frame = cropped_frame / 255.0
        preprocessed_frame = transform.resize(normalized_frame, self.frame_size)
        return preprocessed_frame

    def stack_frames(self, stacked_frames, new_frame, is_new_episode):
        '''
        FUNCTION: Given a stack of frames, append a new frame to this stack
        '''
        # Preprocess frame before stacking
        frame = self.preprocess_frame(new_frame)

        # if new episode, make copies of the frame and stack into np arrays
        if is_new_episode:
            stacked_frames = deque(
                [np.zeros(self.frame_size, dtype=np.int) for i in range(0, self.stack_size)],
                maxlen=self.stack_size)
            for _ in range(0, self.stack_size):
                stacked_frames.append(frame)
            stacked_states = np.stack(stacked_frames, axis=2)
        # else append the frame to the queue
        else:
            stacked_frames.append(frame)
            stacked_states = np.stack(stacked_frames, axis=2)

        return stacked_states, stacked_frames

    def test(self, n_episodes, model=None, memory=None, render=False, clip_reward=False):
        '''
        Play a game to test the environment
        '''
        avg_rewards = 0
        steps = []
        for i in range(1, n_episodes + 1):
            state = self.env.reset()
            stacked_frames = deque(
                [np.zeros(self.frame_size, dtype=np.int) for i in range(0, self.stack_size)],
                maxlen=self.stack_size)
            state, stacked_frames = self.stack_frames(stacked_frames, state, True)
            done = False
            total_reward = 0
            step = 0
            while not done:
                if render:
                    self.env.render()
                    time.sleep(0.01)
                if model:
                    action = self.model.act(state, 0)
                else:
                    action = np.random.randint(self.model.action_space)
                state_next, reward, done, info = self.env.step(action)
                if clip_reward:
                    reward = np.sign(reward)
                if done:
                    state_next = np.zeros(self.frame_size, dtype=np.int)
                state_next, stacked_frames = self.stack_frames(stacked_frames, state_next, False)
                if memory:
                    memory.store((state, action, reward, state_next, done))
                state = state_next
                total_reward += reward
                step += 1
            if render:
                self.env.close()
            avg_rewards = avg_rewards + 1 / (i) * (total_reward - avg_rewards)
            steps.append(step)
        print("The average rewards for {} runs is {}".format(n_episodes, avg_rewards))
        return steps, avg_rewards

    def initialise_memory(self):
        print("Start filling memory")
        while self.memory.memory_tree.capacity_filled < self.INITIAL_MEMORY_SIZE:
            steps, total_reward = self.test(1, model=None, memory=self.memory, clip_reward=True)
        print("Memory filled! The memory length is", self.memory.memory_tree.capacity_filled)

    def restore_and_test(self):
        self.model.load_model()
        self.test(1, model=self.model.DQNetwork, render=True)

    def run(self, continue_sess=False):
        # Delete log directory
        # if os.path.isdir(self.model.log_path):
        #     shutil.rmtree(self.model.log_path)

        if continue_sess:
            self.model.load_model()
            with open(self.sess_path, 'rb') as f:
                start_episode, total_steps, rewards, losses, self.EXPLORATION_RATE, self.memory = pickle.load(f)
            print("Continuing training from episode: {}, step: {}, exploration_rate: {:.4f}, Memory Size: {}"
                  .format(start_episode, total_steps, self.EXPLORATION_RATE,
                          self.memory.memory_tree.capacity_filled))
        else:
            self.initialise_memory()
            total_steps = 0
            losses = []
            rewards = []
            start_episode = 0

        steps = []
        loss = 0  # last training loss, initialised so the end-of-episode log is always defined
        start_time = time.time()
        total_episodes = start_episode + 5

        for i in range(start_episode, total_episodes):
            # Make a new episode and observe the first state
            state = self.env.reset()
            stacked_frames = deque(
                [np.zeros(self.frame_size, dtype=np.int) for i in range(0, self.stack_size)],
                maxlen=self.stack_size)
            state, stacked_frames = self.stack_frames(stacked_frames, state, True)

            # Set step to 0
            episode_rewards = 0
            episode_steps = 0
            done = False

            while not done:
                episode_steps += 1
                total_steps += 1

                # Take a step
                action = self.model.act(state, self.EXPLORATION_RATE)
                next_state, reward, done, _ = self.env.step(action)

                # accumulate rewards
                reward = np.sign(reward)
                episode_rewards += reward

                # Tasks when done
                if done:
                    next_state = np.zeros(self.frame_size, dtype=np.int)
                    next_state, stacked_frames = self.stack_frames(stacked_frames, next_state, False)
                    print("Episode {}, exploration rate: {:.4f}, final rewards: {}, final loss is {:.4f}, Time elapsed: {:.4f}"
                          .format(i + 1, self.EXPLORATION_RATE, episode_rewards, loss,
                                  time.time() - start_time))
                    # self.test(1, model=self.model.DQNetwork, render=False)
                    start_time = time.time()
                else:
                    next_state, stacked_frames = self.stack_frames(stacked_frames, next_state, False)

                self.memory.store((state, action, reward, next_state, done))
                state = next_state

                # Update target model and save model every UPDATE_MODEL_STEP
                if (total_steps % self.UPDATE_MODEL_STEP == 0):
                    self.model.update_model()
                    self.model.save_model()

                ### LEARNING PROCEDURE ###
                if total_steps % self.TRAINING_FREQUENCY == 0:
                    tree_idx, IS_weights, batch = self.memory.sample(self.model.BATCH_SIZE)
                    loss, abs_TD_error = self.model.DQN_train(batch, IS_weights, tree_idx, total_steps)
                    losses.append(loss)

                    # Update the sample priority of the batch
                    self.memory.update_batch(tree_idx, abs_TD_error)

                    # Reduce the exploration rate every training step
                    self.EXPLORATION_RATE *= self.EXPLORATION_DECAY
                    self.EXPLORATION_RATE = max(self.EXPLORATION_MIN, self.EXPLORATION_RATE)
                ### LEARNING PROCEDURE ###

            # Append values at the end of an episode
            steps.append(episode_steps)
            rewards.append(episode_rewards)

        # Save model at the end of training
        print("Training Done")
        self.model.update_model()
        self.model.save_model()

        # save variables to continue training
        with open(self.sess_path, 'wb') as f:  # Python 3: open(..., 'wb')
            pickle.dump([i + 1, total_steps, rewards, losses, self.EXPLORATION_RATE, self.memory],
                        f, protocol=-1)

        # Save plot
        plt.plot(rewards)
        plt.ylabel('Rewards')
        plt.xlabel('Episodes')
        plt.savefig('rewards.png')
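# A minimal usage sketch for the Space Invaders agent above, assuming the
# external Model / Memory helpers it depends on are importable. The call
# sequence only uses methods defined in the class.
agent = Agent()
agent.run(continue_sess=False)   # fill the replay memory, then train for a few episodes
# agent.restore_and_test()       # or load a saved model and watch it play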
class Agent:
    """
    The intelligent agent of the simulation.
    Sets the neural network model used and the general parameters.
    It is responsible for selecting actions, optimizing the neural network
    and managing the models.
    """

    def __init__(self, action_set, train=True, load_path=None):
        #1. Initialize agent params
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.action_set = action_set
        self.action_number = len(action_set)
        self.steps_done = 0
        self.epsilon = Config.EPS_START
        self.episode_durations = []

        #2. Build networks
        self.policy_net = DQN().to(self.device)
        self.target_net = DQN().to(self.device)

        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=Config.LEARNING_RATE)

        if not train:
            self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0)
            self.policy_net.load(load_path, optimizer=self.optimizer)
            self.policy_net.eval()

        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        #3. Create Prioritized Experience Replay Memory
        self.memory = Memory(Config.MEMORY_SIZE)

    def append_sample(self, state, action, next_state, reward):
        """
        Save a sample (error, <s, a, s', r>) to the replay memory.
        """
        # Define whether this is the end of the simulation
        done = True if next_state is None else False

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken
        state_action_values = self.policy_net(state)
        state_action_values = state_action_values.gather(1, action.view(-1, 1))

        if not done:
            # Compute argmax Q(s', a; θ)
            next_state_actions = self.policy_net(next_state).max(1)[1].detach().unsqueeze(1)

            # Compute Q(s', argmax Q(s', a; θ), θ-)
            next_state_values = self.target_net(next_state).gather(1, next_state_actions).squeeze(1).detach()

            # Compute the expected Q values
            expected_state_action_values = (next_state_values * Config.GAMMA) + reward
        else:
            expected_state_action_values = reward

        error = abs(state_action_values - expected_state_action_values).data.cpu().numpy()

        self.memory.add(error, state, action, next_state, reward)

    def select_action(self, state, train=True):
        """
        Select the best action according to the Q-values output by the neural network.

        Parameters
        ----------
            state: float ndarray
                The current state of the simulation
            train: bool
                Defines whether we are evaluating or training the model

        Returns
        -------
            a.max(1)[1]: int
                The action with the highest Q-value
            a.max(0): float
                The Q-value of the action taken
        """
        global steps_done
        sample = random.random()

        #1. Perform an epsilon-greedy algorithm
        #a. set the value for epsilon
        self.epsilon = Config.EPS_END + (Config.EPS_START - Config.EPS_END) * \
            math.exp(-1. * self.steps_done / Config.EPS_DECAY)

        self.steps_done += 1

        #b. decide between selecting a random action or an action from the neural network
        if sample > self.epsilon or (not train):
            # select an action from the neural network
            with torch.no_grad():
                # a <- argmax Q(s, theta)
                a = self.policy_net(state)
                return a.max(1)[1].view(1, 1), a.max(0)
        else:
            # select a random action
            print('random action')
            return torch.tensor([[random.randrange(2)]], device=self.device,
                                dtype=torch.long), None

    """
    def select_action(self, state, train=True):
        Select the best action according to the Q-values output by the neural network

        Parameters
        ----------
            state: float ndarray
                The current state of the simulation
            train: bool
                Defines whether we are evaluating or training the model

        Returns
        -------
            a.max(1)[1]: int
                The action with the highest Q-value
            a.max(0): float
                The Q-value of the action taken

        global steps_done
        sample = random.random()

        #1. Perform an epsilon-greedy algorithm
        #a. set the value for epsilon
        self.epsilon = Config.EPS_END + (Config.EPS_START - Config.EPS_END) * \
            math.exp(-1. * self.steps_done / Config.EPS_DECAY)

        self.steps_done += 1

        #b. decide between selecting a random action or an action from the neural network
        if sample > self.epsilon or (not train):
            # select an action from the neural network
            with torch.no_grad():
                # a <- argmax Q(s, theta)
                # setting the network to train mode is important to enable dropout
                self.policy_net.train()
                output_list = []
                # Run the neural network feedforward n times to build a statistical model
                for i in range(Config.STOCHASTIC_PASSES):
                    #print(agent.policy_net(data))
                    output_list.append(torch.unsqueeze(F.softmax(self.policy_net(state)), 0))
                    #print(output_list[i])

                self.policy_net.eval()
                # The result of the network is the mean of the n passes
                output_mean = torch.cat(output_list, 0).mean(0)
                q_value = output_mean.data.cpu().numpy().max()
                action = output_mean.max(1)[1].view(1, 1)
                uncertainty = torch.cat(output_list, 0).var(0).mean().item()
                return action, q_value, uncertainty
        else:
            # select a random action
            print('random action')
            return torch.tensor([[random.randrange(2)]], device=self.device,
                                dtype=torch.long), None, None
    """

    def optimize_model(self):
        """
        Perform one step of optimization on the neural network.
        """
        if self.memory.tree.n_entries < Config.BATCH_SIZE:
            return

        transitions, idxs, is_weights = self.memory.sample(Config.BATCH_SIZE)

        # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for a detailed explanation).
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)),
                                      device=self.device, dtype=torch.uint8)
        non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # Compute argmax Q(s', a; θ)
        next_state_actions = self.policy_net(non_final_next_states).max(1)[1].detach().unsqueeze(1)

        # Compute Q(s', argmax Q(s', a; θ), θ-)
        next_state_values = torch.zeros(Config.BATCH_SIZE, device=self.device)
        next_state_values[non_final_mask] = self.target_net(non_final_next_states).gather(
            1, next_state_actions).squeeze(1).detach()

        # Compute the expected Q values
        expected_state_action_values = (next_state_values * Config.GAMMA) + reward_batch

        # Update priorities
        errors = torch.abs(state_action_values.squeeze()
                           - expected_state_action_values).data.cpu().numpy()

        # update priority
        for i in range(Config.BATCH_SIZE):
            idx = idxs[i]
            self.memory.update(idx, errors[i])

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))
        loss_return = loss.item()

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        return loss_return

    def save(self, step, logs_path, label):
        """
        Save the model to disk.

        Parameters
        ----------
            step: int
                current step of the simulation
            logs_path: string
                path where the model will be stored
            label: string
                label used to name the stored model
        """
        os.makedirs(logs_path + label, exist_ok=True)

        full_label = label + str(step) + '.pth'
        logs_path = os.path.join(logs_path, label, full_label)

        self.policy_net.save(logs_path, step=step, optimizer=self.optimizer)

    def restore(self, logs_path):
        """
        Load the model from disk.

        Parameters
        ----------
            logs_path: string
                path where the model is stored
        """
        self.policy_net.load(logs_path)
        self.target_net.load(logs_path)
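# A minimal training-loop sketch for the PyTorch agent above, assuming a
# hypothetical environment `env` that returns torch tensors shaped as the DQN
# expects, plus an assumed Config.NUM_EPISODES constant. Illustrative only;
# the agent methods used are the ones defined in the class.
agent = Agent(action_set=[0, 1], train=True)
for episode in range(Config.NUM_EPISODES):          # Config.NUM_EPISODES is assumed
    state = env.reset()
    done = False
    while not done:
        action, _ = agent.select_action(state)
        next_state, reward, done = env.step(action)  # hypothetical env API
        agent.append_sample(state, action, None if done else next_state, reward)
        agent.optimize_model()
        state = next_state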
class TD3Agent(object):

    def __init__(self, state_size, action_size, actor_lr, critic_lr, tau,
                 gamma, lambd, batch_size, memory_size, actor_delay,
                 target_noise, epsilon, epsilon_end, decay_step, load_model,
                 play):
        self.state_size = state_size
        self.vel_size = 3
        self.action_size = action_size
        self.action_high = 1.5
        self.action_low = -self.action_high
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.tau = tau
        self.gamma = gamma
        self.lambd = lambd
        self.actor_delay = actor_delay
        self.target_noise = target_noise
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.epsilon = epsilon
        self.epsilon_end = epsilon_end
        self.decay_step = decay_step
        self.epsilon_decay = (epsilon - epsilon_end) / decay_step

        if play:
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
            self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        else:
            self.sess = tf.Session()
        K.set_session(self.sess)

        self.actor, self.critic, self.critic2 = self.build_model()
        self.target_actor, self.target_critic, self.target_critic2 = self.build_model()
        self.actor_update = self.build_actor_optimizer()
        self.critic_update = self.build_critic_optimizer()
        self.critic2_update = self.build_critic2_optimizer()
        self.sess.run(tf.global_variables_initializer())
        if load_model:
            self.load_model('./save_model/' + agent_name)

        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())
        self.target_critic2.set_weights(self.critic2.get_weights())

        self.memory = Memory(self.memory_size)

    def build_model(self):
        # shared network
        # image process
        image = Input(shape=self.state_size)
        image_process = BatchNormalization()(image)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3), activation='elu', padding='same',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3), activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3), activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(MaxPooling2D((3, 3)))(image_process)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3), activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3), activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(MaxPooling2D((2, 2)))(image_process)
        image_process = TimeDistributed(
            Conv2D(32, (3, 3), activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(
            Conv2D(32, (3, 3), activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(MaxPooling2D((2, 2)))(image_process)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3), activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(MaxPooling2D((2, 2)))(image_process)
        image_process = TimeDistributed(
            Conv2D(8, (1, 1), activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(Flatten())(image_process)
        image_process = GRU(48, kernel_initializer='he_normal',
                            use_bias=False)(image_process)
        image_process = BatchNormalization()(image_process)
        image_process = Activation('tanh')(image_process)

        # vel process
        vel = Input(shape=[self.vel_size])
        vel_process = Dense(48, kernel_initializer='he_normal', use_bias=False)(vel)
        vel_process = BatchNormalization()(vel_process)
        vel_process = Activation('tanh')(vel_process)

        # state process
        # state_process = Concatenate()([image_process, vel_process])
        state_process = Add()([image_process, vel_process])

        # Actor
        policy = Dense(32, kernel_initializer='he_normal', use_bias=False)(state_process)
        policy = BatchNormalization()(policy)
        policy = ELU()(policy)
        policy = Dense(32, kernel_initializer='he_normal', use_bias=False)(policy)
        policy = BatchNormalization()(policy)
        policy = ELU()(policy)
        policy = Dense(self.action_size,
                       kernel_initializer=tf.random_uniform_initializer(
                           minval=-3e-3, maxval=3e-3))(policy)
        policy = Lambda(
            lambda x: K.clip(x, self.action_low, self.action_high))(policy)
        actor = Model(inputs=[image, vel], outputs=policy)

        # Critic
        action = Input(shape=[self.action_size])
        action_process = Dense(48, kernel_initializer='he_normal', use_bias=False)(action)
        action_process = BatchNormalization()(action_process)
        action_process = Activation('tanh')(action_process)
        state_action = Add()([state_process, action_process])

        Qvalue = Dense(32, kernel_initializer='he_normal', use_bias=False)(state_action)
        Qvalue = BatchNormalization()(Qvalue)
        Qvalue = ELU()(Qvalue)
        Qvalue = Dense(32, kernel_initializer='he_normal', use_bias=False)(Qvalue)
        Qvalue = BatchNormalization()(Qvalue)
        Qvalue = ELU()(Qvalue)
        Qvalue = Dense(1,
                       kernel_initializer=tf.random_uniform_initializer(
                           minval=-3e-3, maxval=3e-3))(Qvalue)
        critic = Model(inputs=[image, vel, action], outputs=Qvalue)

        # Critic2
        action = Input(shape=[self.action_size])
        action_process2 = Dense(48, kernel_initializer='he_normal', use_bias=False)(action)
        action_process2 = BatchNormalization()(action_process2)
        action_process2 = Activation('tanh')(action_process2)
        state_action2 = Add()([state_process, action_process2])

        Qvalue2 = Dense(32, kernel_initializer='he_normal', use_bias=False)(state_action2)
        Qvalue2 = BatchNormalization()(Qvalue2)
        Qvalue2 = ELU()(Qvalue2)
        Qvalue2 = Dense(32, kernel_initializer='he_normal', use_bias=False)(Qvalue2)
        Qvalue2 = BatchNormalization()(Qvalue2)
        Qvalue2 = ELU()(Qvalue2)
        Qvalue2 = Dense(1,
                        kernel_initializer=tf.random_uniform_initializer(
                            minval=-3e-3, maxval=3e-3))(Qvalue2)
        critic2 = Model(inputs=[image, vel, action], outputs=Qvalue2)

        actor._make_predict_function()
        critic._make_predict_function()
        critic2._make_predict_function()

        return actor, critic, critic2

    def build_actor_optimizer(self):
        pred_Q = self.critic.output
        action_grad = tf.gradients(pred_Q, self.critic.input[2])
        target = -action_grad[0] / self.batch_size
        params_grad = tf.gradients(self.actor.output,
                                   self.actor.trainable_weights, target)
        params_grad, global_norm = tf.clip_by_global_norm(params_grad, 5.0)
        grads = zip(params_grad, self.actor.trainable_weights)
        optimizer = tf.train.AdamOptimizer(self.actor_lr)
        updates = optimizer.apply_gradients(grads)
        train = K.function(
            [self.actor.input[0], self.actor.input[1], self.critic.input[2]],
            [global_norm],
            updates=[updates])
        return train

    def build_critic_optimizer(self):
        y = K.placeholder(shape=(None, 1), dtype='float32')
        pred = self.critic.output
        loss = K.mean(K.square(pred - y))
        # Huber Loss
        # error = K.abs(y - pred)
        # quadratic = K.clip(error, 0.0, 1.0)
        # linear = error - quadratic
        # loss = K.mean(0.5 * K.square(quadratic) + linear)
        optimizer = Adam(lr=self.critic_lr)
        updates = optimizer.get_updates(self.critic.trainable_weights, [], loss)
        train = K.function(
            [self.critic.input[0], self.critic.input[1], self.critic.input[2], y],
            [pred, loss],
            updates=updates)
        return train

    def build_critic2_optimizer(self):
        y = K.placeholder(shape=(None, 1), dtype='float32')
        pred = self.critic2.output
        loss = K.mean(K.square(pred - y))
        # Huber Loss
        # error = K.abs(y - pred)
        # quadratic = K.clip(error, 0.0, 1.0)
        # linear = error - quadratic
        # loss = K.mean(0.5 * K.square(quadratic) + linear)
        optimizer = Adam(lr=self.critic_lr)
        updates = optimizer.get_updates(self.critic2.trainable_weights, [], loss)
        train = K.function(
            [self.critic2.input[0], self.critic2.input[1], self.critic2.input[2], y],
            [loss],
            updates=updates)
        return train

    def get_action(self, state):
        policy = self.actor.predict(state)[0]
        noise = np.random.normal(0, self.epsilon, self.action_size)
        action = np.clip(policy + noise, self.action_low, self.action_high)
        return action, policy

    def train_model(self):
        batch, idxs, _ = self.memory.sample(self.batch_size)

        images = np.zeros([self.batch_size] + self.state_size)
        vels = np.zeros([self.batch_size, self.vel_size])
        actions = np.zeros((self.batch_size, self.action_size))
        rewards = np.zeros((self.batch_size, 1))
        next_images = np.zeros([self.batch_size] + self.state_size)
        next_vels = np.zeros([self.batch_size, self.vel_size])
        dones = np.zeros((self.batch_size, 1))
        targets = np.zeros((self.batch_size, 1))

        for i, sample in enumerate(batch):
            images[i], vels[i] = sample[0]
            actions[i] = sample[1]
            rewards[i] = sample[2]
            next_images[i], next_vels[i] = sample[3]
            dones[i] = sample[4]
        states = [images, vels]
        next_states = [next_images, next_vels]
        policy = self.actor.predict(states)
        target_actions = self.target_actor.predict(next_states)
        target_noises = np.random.normal(0, self.target_noise, target_actions.shape)
        target_actions = np.clip(target_actions + target_noises,
                                 self.action_low, self.action_high)

        target_next_Qs1 = self.target_critic.predict(next_states + [target_actions])
        target_next_Qs2 = self.target_critic2.predict(next_states + [target_actions])
        target_next_Qs = np.minimum(target_next_Qs1, target_next_Qs2)
        targets = rewards + self.gamma * (1 - dones) * target_next_Qs

        critic_loss = 0
        for _ in range(self.actor_delay):
            pred, c_loss = self.critic_update(states + [actions, targets])
            c2_loss = self.critic2_update(states + [actions, targets])
            critic_loss += c_loss + c2_loss[0]
        actor_loss = self.actor_update(states + [policy])

        tds = np.abs(pred - targets)
        for i in range(self.batch_size):
            idx = idxs[i]
            self.memory.update(idx, tds[i])

        return actor_loss[0], critic_loss / (self.actor_delay * 2.0)

    def append_memory(self, state, action, reward, next_state, done):
        Q = self.critic.predict(state + [action.reshape(1, -1)])[0]
        target_action = self.target_actor.predict(next_state)[0]
        target_Q1 = self.target_critic.predict(
            next_state + [target_action.reshape(1, -1)])[0]
        target_Q2 = self.target_critic2.predict(
            next_state + [target_action.reshape(1, -1)])[0]
        target_Q = np.minimum(target_Q1, target_Q2)
        td = reward + (1 - done) * self.gamma * target_Q - Q
        td = float(abs(td[0]))
        self.memory.add(td, (state, action, reward, next_state, done))
        return td

    def load_model(self, name):
        if os.path.exists(name + '_actor.h5'):
            self.actor.load_weights(name + '_actor.h5')
            print('Actor loaded')
        if os.path.exists(name + '_critic.h5'):
            self.critic.load_weights(name + '_critic.h5')
            print('Critic loaded')
        if os.path.exists(name + '_critic2.h5'):
            self.critic2.load_weights(name + '_critic2.h5')
            print('Critic2 loaded')

    def save_model(self, name):
        self.actor.save_weights(name + '_actor.h5')
        self.critic.save_weights(name + '_critic.h5')
        self.critic2.save_weights(name + '_critic2.h5')

    def update_target_model(self):
        self.target_actor.set_weights(
            self.tau * np.array(self.actor.get_weights())
            + (1 - self.tau) * np.array(self.target_actor.get_weights()))
        self.target_critic.set_weights(
            self.tau * np.array(self.critic.get_weights())
            + (1 - self.tau) * np.array(self.target_critic.get_weights()))
        self.target_critic2.set_weights(
            self.tau * np.array(self.critic2.get_weights())
            + (1 - self.tau) * np.array(self.target_critic2.get_weights()))
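# A minimal interaction-loop sketch for TD3Agent. The environment object, its
# [image, vel] observation format, the hyperparameter values and the warm-up
# threshold are illustrative assumptions; only the agent methods used below
# come from the class itself.
agent = TD3Agent(state_size=[4, 72, 128, 1], action_size=3, actor_lr=1e-4,
                 critic_lr=1e-3, tau=0.005, gamma=0.99, lambd=0.9,
                 batch_size=32, memory_size=50000, actor_delay=2,
                 target_noise=0.2, epsilon=1.0, epsilon_end=0.05,
                 decay_step=100000, load_model=False, play=False)
state = env.reset()                                  # assumed to return [image, vel]
for step in range(100000):
    action, policy = agent.get_action(state)
    next_state, reward, done = env.step(action)      # hypothetical env API
    agent.append_memory(state, action, reward, next_state, done)
    if step > 1000:                                  # arbitrary warm-up before training
        actor_loss, critic_loss = agent.train_model()
        agent.update_target_model()
    state = env.reset() if done else next_state
    agent.epsilon = max(agent.epsilon - agent.epsilon_decay, agent.epsilon_end)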
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50,
                    selfishness=0.5):
    """
    Q-Learning algorithm for off-policy TD control using function approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        state_processor: A StateProcessor object
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of random experiences to sample when
            initializing the replay memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the
            target estimator every N steps
        discount_factor: Gamma discount factor
        epsilon_start: Chance to sample a random action when taking an action.
            Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    Transition = namedtuple(
        "Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_memory = []

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes),
                                  episode_kills=np.zeros(num_episodes),
                                  episode_coins=np.zeros(num_episodes),
                                  episode_levels=np.zeros(num_episodes),
                                  episode_total_death=np.zeros(num_episodes),
                                  episode_distance=np.zeros(num_episodes),
                                  episode_death=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # print(latest_checkpoint)

    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    if POLICY == 'BOLTZMAN':
        policy = make_boltzmann_policy(q_estimator, len(VALID_ACTIONS))
    else:
        policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    total_state = env.reset(levelname=LEVEL_NAME)
    state = state_processor.process(sess, total_state, 1)
    state = np.stack([state] * WINDOW_LENGTH, axis=0)
    # state = np.stack([state] * WINDOW_LENGTH, axis=2)
    total_death = 0
    total_state = np.stack([state], axis=0)

    if USE_MEMORY:
        fn = '{}_{}'.format(LEVEL_NAME, 100)
        replay_memory = load_memory(fn, 100, REPLAY_MEMORY_SIZE)
        print(len(replay_memory))
        if PRIORITIZE_MEMORY:
            print('Creating priority memory')
            memory = prioritize_replay(replay_memory)
    else:
        if env.headless:
            for i in range(replay_memory_init_size):
                action_probs = policy(
                    sess, state, epsilons[min(total_t, epsilon_decay_steps - 1)])
                action_probs = action_probs + [0, 1, 0, 0, 1, 0]
                action_probs = softmax(action_probs)
                action = np.random.choice(np.arange(len(action_probs)),
                                          p=action_probs)
                next_total_state, reward, done, info = env.step(VALID_ACTIONS[action])
                next_state = state_processor.process(sess, next_total_state, 1)
                next_state = np.append(state[1:, :, :],
                                       np.expand_dims(next_state, 0),
                                       axis=0)
                next_total_state = np.stack([next_state], axis=0)
                replay_memory.append(
                    Transition(total_state, action, reward, next_total_state, done))
                if done:
                    total_state = env.reset(levelname=LEVEL_NAME)
                    state = state_processor.process(sess, total_state, 1)
                    state = np.stack([state] * WINDOW_LENGTH, axis=0)
                    total_state = np.stack([state], axis=0)
                else:
                    state = next_state
                    total_state = next_total_state
            print('Memory is filled')

        if PRIORITIZE_MEMORY:
            memory = Memory(ER_SIZE)
            for exp in replay_memory:
                memory.store(exp)
            replay_memory = memory

    # Record videos
    # Use the gym env Monitor wrapper
    # env = MarioGym(headless=False, level_name='Level-5-coins.json', no_coins=5)
    env = Monitor(env,
                  directory=monitor_path,
                  resume=True,
                  video_callable=lambda count: count % record_video_every == 0)

    total_deaths = 0

    for i_episode in range(num_episodes):

        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        total_state = env.reset(levelname=LEVEL_NAME)
        state = state_processor.process(sess, total_state, 1)
        state = np.stack([state] * WINDOW_LENGTH, axis=0)
        total_state = np.stack([state], axis=0)
        loss = None
        dist = 0

        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            # Add epsilon to Tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)

            # Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)
                print("\nCopied model parameters to target network.")

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                t, total_t, i_episode + 1, num_episodes, loss), end="")
            sys.stdout.flush()

            # Take a step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_total_state, reward, done, info = env.step(VALID_ACTIONS[action])
            level_up = 0
            next_state = state_processor.process(sess, next_total_state, 1)
            next_state = np.append(state[1:, :, :],
                                   np.expand_dims(next_state, 0),
                                   axis=0)
            next_total_state = np.stack([next_state], axis=0)

            # If our replay memory is full, pop the first element
            # if replay_memory.tree.data_pointer == replay_memory_size:
            #     replay_memory.pop(0)

            # Save transition to replay memory
            memory.store(
                Transition(total_state, action, reward, next_total_state, done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            stats.episode_kills[i_episode] += info['num_killed']
            stats.episode_coins[i_episode] += info['coins_taken']
            stats.episode_levels[i_episode] += level_up
            stats.episode_death[i_episode] += info['death']
            if info['death']:
                total_deaths += 1
            if env.mario.rect.x > dist:
                dist = env.mario.rect.x
            stats.episode_total_death[i_episode] = total_deaths

            tree_idx, batch, ISWeights = memory.sample(batch_size)

            states_batch = np.array([each[0][0] for each in batch], ndmin=3)
            action_batch = np.array([each[0][1] for each in batch])
            reward_batch = np.array([each[0][2] for each in batch])
            next_states_batch = np.array([each[0][3] for each in batch], ndmin=3)
            done_batch = np.array([each[0][4] for each in batch])

            # Calculate q values and targets (Double DQN)
            next_states_batch = next_states_batch.squeeze()
            q_values_next = q_estimator.predict(sess, next_states_batch)
            q_values_next_total = q_values_next
            best_actions = np.argmax(q_values_next_total, axis=1)
            q_values_next_target = target_estimator.predict(sess, next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \
                (discount_factor * q_values_next_target[np.arange(batch_size), best_actions])

            # Perform gradient descent update
            states_batch = states_batch.squeeze()
            targets_batch = targets_batch.squeeze()
            loss1, abs_error = q_estimator.update(sess, states_batch, action_batch,
                                                  targets_batch, ISWeights)
            loss = loss1

            if done:
                break

            state = next_state
            total_state = next_total_state
            total_t += 1

        stats.episode_distance[i_episode] = dist
        stats.episode_levels[i_episode] += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=stats.episode_rewards[i_episode],
                                  node_name="episode_reward",
                                  tag="episode_reward")
        episode_summary.value.add(simple_value=stats.episode_lengths[i_episode],
                                  node_name="episode_length",
                                  tag="episode_length")
        episode_summary.value.add(simple_value=stats.episode_kills[i_episode],
                                  node_name="episode_kills",
                                  tag="episode_kills")
        episode_summary.value.add(simple_value=stats.episode_coins[i_episode],
                                  node_name="episode_coins",
                                  tag="episode_coins")
        episode_summary.value.add(simple_value=stats.episode_levels[i_episode],
                                  node_name="episode_levels",
                                  tag="episode_levels")
        episode_summary.value.add(simple_value=stats.episode_total_death[i_episode],
                                  node_name="episode_total_death",
                                  tag="episode_total_death")
        episode_summary.value.add(simple_value=stats.episode_distance[i_episode],
                                  node_name="episode_distance",
                                  tag="episode_distance")
        episode_summary.value.add(simple_value=stats.episode_death[i_episode],
                                  node_name="episode_death",
                                  tag="episode_death")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode + 1],
            episode_rewards=stats.episode_rewards[:i_episode + 1],
            episode_kills=stats.episode_kills[:i_episode + 1],
            episode_coins=stats.episode_coins[:i_episode + 1],
            episode_levels=stats.episode_levels[:i_episode + 1],
            episode_distance=stats.episode_distance[:i_episode + 1],
            episode_total_death=stats.episode_total_death[:i_episode + 1],
            episode_death=stats.episode_death[:i_episode + 1])

    env.monitor.close()
    return stats
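# deep_q_learning is a generator that yields (total_t, stats) once per episode.
# A minimal consumption sketch, assuming the Mario environment, estimators,
# state processor and the global step variable it reads have been set up
# elsewhere in the module (they are not defined in this snippet).
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for total_t, stats in deep_q_learning(sess, env, q_estimator,
                                          target_estimator, state_processor,
                                          num_episodes=100,
                                          experiment_dir='./experiments/mario'):
        print("\nEpisode reward: {}".format(stats.episode_rewards[-1]))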
def prioritize_replay(replay_memory):
    memory = Memory(ER_SIZE)
    for exp in replay_memory:
        memory.store(exp)
    return memory
class PDQNagent(Model.nn_model):

    def __init__(
        self,
        state_shape,
        action_size,
        main_network,
        target_network,
        batch_size=50,
        gamma=0.95,
        memory_len=100000,
        epsilon_init=1,
        epsilon_min=0.1,
        epsilon_decay=0.99,
        learning_rate=0.01,
        target_update_step=500,
        Double_Q=True,
        PER_alpha=0.6,
        PER_beta=0.4,
        PER_beta_increment=0.001,
    ):
        self.state_shape = state_shape
        self.action_size = action_size
        self.memory = Memory(memory_len,
                             alpha=PER_alpha,
                             beta=PER_beta,
                             beta_increment=PER_beta_increment)
        self.gamma = gamma  # discount rate
        self.epsilon = epsilon_init  # exploration rate
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate
        self.Double_Q = Double_Q
        self.target_update_step = target_update_step

        self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
        self.loss_function = tf.keras.losses.Huber(
            reduction=tf.keras.losses.Reduction.NONE)
        self.loss_metric = tf.keras.metrics.Mean(name='loss')
        self.avg_q_metric = tf.keras.metrics.Mean(name='avg_q')

        self.total_update_step = 0
        self.batch_size = batch_size
        self.main_network = main_network
        self.target_network = target_network
        self.update_target()

    def update_target(self):
        self.target_network.set_weights(self.main_network.get_weights())

    def memorize(self, state, action, reward, next_state, done):
        act = np.zeros(self.action_size)
        act[action] = 1
        self.memory.store((state, act, reward, next_state, done))

    def act(self, state, use_epsilon=True):
        if np.random.rand() <= self.epsilon and use_epsilon:
            return random.randrange(self.action_size)
        act_values = self.main_network.predict(np.expand_dims(state, axis=0))
        return np.argmax(act_values[0])  # returns action

    def get_batch(self):
        batch_size = self.batch_size
        if batch_size > self.memory.data_num:
            batch_size = self.memory.data_num

        tree_indexs, datas, ISweights = self.memory.sample(batch_size)

        result = []
        for i in range(len(datas[0])):
            temp = []
            for experience in datas:
                temp.append(experience[i])
            result.append(np.array(temp))

        return result, tree_indexs, np.array(ISweights)

    def train(self):
        experiences, tree_indexs, ISweights = self.get_batch()
        states, actions, rewards, next_states, dones = experiences

        if self.Double_Q:
            indices = np.argmax(self.main_network.predict(next_states), axis=1)
            indices = np.expand_dims(indices, axis=1)
            qvalues = self.target_network.predict(next_states)
            qvalues = np.take_along_axis(qvalues, indices, axis=1)
            qvalues = np.squeeze(qvalues, axis=1)
        else:
            qvalues = self.target_network.predict(next_states)
            qvalues = np.max(qvalues, axis=1)

        targets = rewards + dones * (self.gamma * qvalues)

        with tf.GradientTape() as tape:
            action_values = self.main_network(states)
            average_action_value = tf.reduce_mean(action_values)
            action_value = tf.reduce_sum(tf.multiply(action_values, actions), axis=1)
            td_errors = self.loss_function(targets, action_value)
            loss = tf.reduce_mean(td_errors * ISweights)

        gradients = tape.gradient(loss, self.main_network.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradients, self.main_network.trainable_variables))

        # update priority
        priority = td_errors.numpy()
        self.memory.batch_update(tree_indexs, priority)

        self.loss_metric(loss)
        self.avg_q_metric(average_action_value)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        self.total_update_step += 1
        if self.total_update_step % self.target_update_step == 0:
            self.update_target()

        history = {
            "loss": self.loss_metric.result().numpy(),
            "avg_q": self.avg_q_metric.result().numpy()
        }
        return history
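# A minimal end-to-end sketch for PDQNagent on a simple gym environment. The
# Keras network below and the CartPole setup are illustrative assumptions;
# only the memorize / act / train interface comes from the class above.
import gym

env = gym.make("CartPole-v1")

def build_net():
    return tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu',
                              input_shape=env.observation_space.shape),
        tf.keras.layers.Dense(env.action_space.n)
    ])

agent = PDQNagent(state_shape=env.observation_space.shape,
                  action_size=env.action_space.n,
                  main_network=build_net(),
                  target_network=build_net())

for episode in range(10):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        # train() multiplies the bootstrap term by the stored flag, so this
        # sketch stores 0.0 at terminal steps and 1.0 otherwise (an assumed
        # convention of the caller, not shown in the snippet).
        agent.memorize(state, action, reward, next_state, 0.0 if done else 1.0)
        state = next_state
    agent.train()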
class ApexDQN():

    def __init__(
        self,
        state_shape,
        action_size,
        network_builder,
        env_class,
        actor_num=8,
        batch_size=50,
        gamma=0.95,
        memory_len=100000,
        epsilon_max=0.4,
        epsilon_alpha=7,
        learning_rate=0.001,
        target_update_step=50,
        Double_Q=True,
        train_start_step=10000,
        local_network_update_step=50,
        n_step=3,
    ):
        self.state_shape = state_shape
        self.action_size = action_size
        self.env_class = env_class
        self.memory = Memory(memory_len)
        self.gamma = gamma  # discount rate
        self.epsilon_max = epsilon_max  # exploration rate
        self.epsilon_alpha = epsilon_alpha
        self.learning_rate = learning_rate
        self.Double_Q = Double_Q
        self.target_update_step = target_update_step
        self.network_builder = network_builder
        self.actor_num = actor_num
        self.train_start_step = train_start_step
        self.local_network_update_step = local_network_update_step
        self.n_step = n_step
        self.history = {"reward": [], "loss": [], "avg_q": [], "step": 0}
        self.end_training = [False]

        self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
        self.loss_function = tf.keras.losses.Huber(
            reduction=tf.keras.losses.Reduction.NONE)
        self.loss_metric = tf.keras.metrics.Mean(name='loss')
        self.avg_q_metric = tf.keras.metrics.Mean(name='avg_q')

        self.total_update_step = 0
        self.batch_size = batch_size
        self.main_network = network_builder()
        self.target_network = network_builder()
        self.update_target()
        self.generate_local_networks()
        self.device = '/gpu:0'

    def generate_local_networks(self):
        local_networks = []
        for _ in range(self.actor_num):
            local_networks.append(
                (self.network_builder(), self.network_builder()))
        self.local_networks = local_networks

    def update_target(self):
        self.target_network.set_weights(self.main_network.get_weights())

    def get_batch(self):
        batch_size = self.batch_size
        if batch_size > self.memory.data_num:
            batch_size = self.memory.data_num

        tree_indexs, datas, ISweights = self.memory.sample(batch_size)

        result = []
        for i in range(len(datas[0])):
            temp = []
            for experience in datas:
                temp.append(experience[i])
            result.append(np.array(temp))

        return result, tree_indexs, np.array(ISweights)

    def train(self):
        with tf.device(self.device):
            experiences, tree_indexs, ISweights = self.get_batch()
            states, actions, rewards, next_states, dones = experiences

            if self.Double_Q:
                indices = np.argmax(self.main_network.predict(next_states), axis=1)
                indices = np.expand_dims(indices, axis=1)
                qvalues = self.target_network.predict(next_states)
                qvalues = np.take_along_axis(qvalues, indices, axis=1)
                qvalues = np.squeeze(qvalues, axis=1)
            else:
                qvalues = self.target_network.predict(next_states)
                qvalues = np.max(qvalues, axis=1)

            targets = rewards + dones * (self.gamma * qvalues)

            with tf.GradientTape() as tape:
                action_values = self.main_network(states)
                average_action_value = tf.reduce_mean(action_values)
                action_value = tf.reduce_sum(tf.multiply(action_values, actions), axis=1)
                td_errors = self.loss_function(targets, action_value)
                loss = tf.reduce_mean(td_errors * ISweights)

            gradients = tape.gradient(loss, self.main_network.trainable_variables)
            self.optimizer.apply_gradients(
                zip(gradients, self.main_network.trainable_variables))

            # update priority
            priority = td_errors.numpy()
            self.memory.batch_update(tree_indexs, priority)

            self.loss_metric(loss)
            self.avg_q_metric(average_action_value)

            self.total_update_step += 1
            if self.total_update_step % self.target_update_step == 0:
                self.update_target()

            history = {
                "loss": self.loss_metric.result().numpy(),
                "avg_q": self.avg_q_metric.result().numpy()
            }
            return history

    def main(self, train_until, plot_every):
        self.end_training[0] = False
        history = self.history
        plot_metrics = ["reward", "loss", "avg_q"]

        print("generating actors")
        actors = []
        for i in range(self.actor_num):
            epsilon = self.epsilon_max**(
                1 + i / (self.actor_num - 1) * self.epsilon_alpha)
            actors.append(
                Actor(self.state_shape, self.action_size,
                      self.local_networks[i][0], self.local_networks[i][1],
                      self.main_network, self.target_network, self.memory,
                      epsilon, self.Double_Q, self.loss_function,
                      self.end_training, self.gamma, self.env_class(),
                      history, self.local_network_update_step, self.n_step))

        print("start acting for {} steps".format(self.train_start_step))
        for actor in actors:
            actor.start()

        while self.history["step"] < self.train_start_step:
            time.sleep(1)
            print("step :", self.history["step"])

        print("start training")
        metrics = ["loss", "avg_q"]
        episode = 0
        train_num = 0
        while train_num <= train_until:
            episode = len(history["reward"])
            hist = self.train()
            train_num += 1
            for met in metrics:
                history[met].append(hist[met])
            if train_num % plot_every == 0:
                for met in plot_metrics:
                    plotter.plotWithSmooth(history[met], met)
                print("episodes :", episode, " train_num :", train_num,
                      " memory len :", self.memory.data_num)

        self.end_training[0] = True
        for actor in actors:
            actor.join()
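# A minimal construction sketch for the Ape-X learner above. The network
# builder, the CartPole env_class and the step counts are illustrative
# assumptions; ApexDQN itself only requires that network_builder() returns a
# fresh Keras model and that env_class() produces a new environment per actor.
def build_q_network():
    return tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(4,)),
        tf.keras.layers.Dense(2)
    ])

learner = ApexDQN(state_shape=(4,),
                  action_size=2,
                  network_builder=build_q_network,
                  env_class=lambda: gym.make("CartPole-v1"),
                  actor_num=4,
                  train_start_step=2000)
learner.main(train_until=10000, plot_every=1000)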