Example #1
	def __init__(self, action_set, train=True, load_path=None):
		#1. Initialize agent params
		self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
		self.action_set = action_set
		self.action_number = len(action_set)
		self.steps_done = 0
		self.epsilon = Config.EPS_START
		self.episode_durations = []

		#2. Build networks
		self.policy_net = DQN().to(self.device)
		self.target_net = DQN().to(self.device)
		
		self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=Config.LEARNING_RATE)

		if not train:		
			self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0)	
			self.policy_net.load(load_path, optimizer=self.optimizer)
			self.policy_net.eval()

		self.target_net.load_state_dict(self.policy_net.state_dict())
		self.target_net.eval()

		#3. Create Prioritized Experience Replay Memory
		self.memory = Memory(Config.MEMORY_SIZE)
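
Every snippet on this page assumes an external Memory class that is not shown. Below is a minimal, hypothetical stand-in that matches the calls made in Example #1 — add(error, ...transition), sample(batch_size) returning (batch, idxs, is_weights), update(idx, error), and the tree.n_entries counter — using a flat NumPy priority array instead of a sum tree, so sampling is O(n) rather than O(log n):

import numpy as np


class _TreeCounter:
    """Tiny helper so that memory.tree.n_entries keeps working as in Example #1."""
    def __init__(self):
        self.n_entries = 0


class Memory:
    """Simplified proportional prioritized replay backed by a flat priority array."""

    def __init__(self, capacity, alpha=0.6, beta=0.4, eps=0.01):
        self.capacity = capacity
        self.alpha, self.beta, self.eps = alpha, beta, eps
        self.data = [None] * capacity
        self.priorities = np.zeros(capacity, dtype=np.float64)
        self.write = 0
        self.tree = _TreeCounter()

    def add(self, error, *transition):
        # New samples are stored with a priority derived from their TD error.
        self.data[self.write] = transition
        self.priorities[self.write] = (float(np.max(np.abs(error))) + self.eps) ** self.alpha
        self.write = (self.write + 1) % self.capacity
        self.tree.n_entries = min(self.tree.n_entries + 1, self.capacity)

    def sample(self, batch_size):
        # Sample proportionally to priority and return importance-sampling weights.
        n = self.tree.n_entries
        probs = self.priorities[:n] / self.priorities[:n].sum()
        idxs = np.random.choice(n, batch_size, p=probs)
        weights = (n * probs[idxs]) ** (-self.beta)
        weights /= weights.max()
        return [self.data[i] for i in idxs], idxs, weights

    def update(self, idx, error):
        # Re-prioritize a transition after its TD error has been recomputed.
        self.priorities[idx] = (float(np.max(np.abs(error))) + self.eps) ** self.alpha
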
Example #2
    def __init__(self, state_size, action_size, actor_lr, critic_lr, tau,
                gamma, lambd, batch_size, memory_size, 
                epsilon, epsilon_end, decay_step, load_model):
        self.state_size = state_size
        self.vel_size = 3
        self.action_size = action_size
        self.action_high = 1.5
        self.action_low = -self.action_high
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.tau = tau
        self.gamma = gamma
        self.lambd = lambd
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.epsilon = epsilon
        self.epsilon_end = epsilon_end
        self.decay_step = decay_step
        self.epsilon_decay = (epsilon - epsilon_end) / decay_step

        self.sess = tf.Session()
        K.set_session(self.sess)

        self.actor, self.critic = self.build_model()
        self.target_actor, self.target_critic = self.build_model()
        self.actor_update = self.build_actor_optimizer()
        self.critic_update = self.build_critic_optimizer()
        self.sess.run(tf.global_variables_initializer())
        if load_model:
            self.load_model('./save_model/' + agent_name)
        
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())

        self.memory = Memory(self.memory_size)
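
The fields above imply a linear exploration schedule: epsilon shrinks by epsilon_decay on every training step until it reaches epsilon_end. The decay loop itself is not part of this excerpt; a minimal sketch of how such a schedule is usually applied:

def decay_epsilon(agent):
    # Linear schedule: epsilon falls from its start value to epsilon_end
    # over decay_step training steps, then stays at the floor.
    if agent.epsilon > agent.epsilon_end:
        agent.epsilon = max(agent.epsilon - agent.epsilon_decay, agent.epsilon_end)
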
Example #3
    def __init__(
        self,
        state_shape,
        action_size,
        network_builder,
        env_class,
        actor_num=8,
        batch_size=50,
        gamma=0.95,
        memory_len=100000,
        epsilon_max=0.4,
        epsilon_alpha=7,
        learning_rate=0.001,
        target_update_step=50,
        Double_Q=True,
        train_start_step=10000,
        local_network_update_step=50,
        n_step=3,
    ):

        self.state_shape = state_shape
        self.action_size = action_size
        self.env_class = env_class
        self.memory = Memory(memory_len)
        self.gamma = gamma  # discount rate
        self.epsilon_max = epsilon_max  # exploration rate
        self.epsilon_alpha = epsilon_alpha
        self.learning_rate = learning_rate
        self.Double_Q = Double_Q
        self.target_update_step = target_update_step
        self.network_builder = network_builder
        self.actor_num = actor_num
        self.train_start_step = train_start_step
        self.local_network_update_step = local_network_update_step
        self.n_step = n_step

        self.history = {"reward": [], "loss": [], "avg_q": [], "step": 0}
        self.end_training = [False]

        self.optimizer = tf.keras.optimizers.RMSprop(
            learning_rate=learning_rate)
        self.loss_function = tf.keras.losses.Huber(
            reduction=tf.keras.losses.Reduction.NONE)

        self.loss_metric = tf.keras.metrics.Mean(name='loss')
        self.avg_q_metric = tf.keras.metrics.Mean(name='avg_q')

        self.total_update_step = 0
        self.batch_size = batch_size

        self.main_network = network_builder()
        self.target_network = network_builder()
        self.update_target()

        self.generate_local_networks()

        self.device = '/gpu:0'
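
The epsilon_max and epsilon_alpha arguments suggest the Ape-X convention, in which each of the actor_num parallel actors keeps its own fixed exploration rate, epsilon_max ** (1 + i * epsilon_alpha / (actor_num - 1)). The assignment itself is not shown above; a sketch under that assumption:

def actor_epsilons(epsilon_max=0.4, epsilon_alpha=7, actor_num=8):
    # Each parallel actor explores with its own fixed epsilon; actor 0 is the
    # most exploratory, the last actor is nearly greedy.
    return [
        epsilon_max ** (1 + i * epsilon_alpha / (actor_num - 1))
        for i in range(actor_num)
    ]

# actor_epsilons() -> [0.4, 0.16, 0.064, ..., ~0.00066]
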
Example #4
    def __init__(
        self,
        state_shape,
        action_size,
        main_network,
        target_network,
        batch_size=50,
        gamma=0.95,
        memory_len=100000,
        epsilon_init=1,
        epsilon_min=0.1,
        epsilon_decay=0.99,
        learning_rate=0.01,
        target_update_step=500,
        Double_Q=True,
        PER_alpha=0.6,
        PER_beta=0.4,
        PER_beta_increment=0.001,
    ):

        self.state_shape = state_shape
        self.action_size = action_size
        self.memory = Memory(memory_len,
                             alpha=PER_alpha,
                             beta=PER_beta,
                             beta_increment=PER_beta_increment)
        self.gamma = gamma  # discount rate
        self.epsilon = epsilon_init  # exploration rate
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate
        self.Double_Q = Double_Q
        self.target_update_step = target_update_step

        self.optimizer = tf.keras.optimizers.RMSprop(
            learning_rate=learning_rate)
        self.loss_function = tf.keras.losses.Huber(
            reduction=tf.keras.losses.Reduction.NONE)

        self.loss_metric = tf.keras.metrics.Mean(name='loss')
        self.avg_q_metric = tf.keras.metrics.Mean(name='avg_q')

        self.total_update_step = 0
        self.batch_size = batch_size

        self.main_network = main_network
        self.target_network = target_network
        self.update_target()
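
PER_alpha, PER_beta and PER_beta_increment usually control how strongly TD-error priorities skew sampling and how the importance-sampling correction is annealed toward 1. A standalone recap of that weighting, assuming the common proportional-prioritization definitions:

import numpy as np

def per_is_weights(priorities, sampled_idx, beta):
    # priorities are assumed to already hold p_i ** alpha; each sampled
    # transition gets weight (N * P(i)) ** (-beta), normalised by the
    # maximum weight for stability.
    probs = priorities / priorities.sum()
    weights = (len(priorities) * probs[sampled_idx]) ** (-beta)
    return weights / weights.max()

def anneal_beta(beta, beta_increment=0.001):
    # beta is pushed toward 1 a little after every sampled batch.
    return min(1.0, beta + beta_increment)
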
Example #5
    def __init__(self, options, resume_previous_train):
        self.model = FinalDQN(options).to(device)
        self.target = FinalDQN(options).to(device)

        self.double_dqn, self.dueling_dqn, self.per, self.noisy_dqn, self.dist_dqn = options
        options_list = ["double", "dueling", "per", "noisy", "dist"]

        if self.per:
            self.memory = Memory(memory_size)
        else:
            self.memory = ExperienceReplay(memory_size)

        self.loss_func = nn.SmoothL1Loss()
        self.optimizer = optim.Adam(params=self.model.parameters(),
                                    lr=learning_rate)
        self.num_updates = 0

        if all(options):
            self.PATH = "Plots/RL/all/network_all.pth"
        elif not any(options):
            self.PATH = "Plots/RL/vanilla_dqn/network_vanilla_dqn.pth"
        else:
            zero_idx = list(options).index(0)
            self.PATH = "Plots/RL/no_" + options_list[
                zero_idx] + "_dqn/network_no_" + options_list[
                    zero_idx] + "_dqn.pth"

        if resume_previous_train and os.path.exists(self.PATH):
            print("Loading previously saved model ... ")
            self.model.load_state_dict(load_model(self.PATH))
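
A quick worked example of the path selection above: with exactly one extension disabled, the checkpoint path is named after the missing component.

options = (1, 1, 0, 1, 1)  # double, dueling, per, noisy, dist -> PER disabled
options_list = ["double", "dueling", "per", "noisy", "dist"]
zero_idx = list(options).index(0)  # 2
path = ("Plots/RL/no_" + options_list[zero_idx] + "_dqn/network_no_"
        + options_list[zero_idx] + "_dqn.pth")
print(path)  # Plots/RL/no_per_dqn/network_no_per_dqn.pth
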
Example #6
    def __init__(self, state_size, action_size, actor_lr, critic_lr, tau,
                 gamma, lambd, batch_size, memory_size, actor_delay,
                 target_noise, epsilon, epsilon_end, decay_step, load_model,
                 play):
        self.state_size = state_size
        self.vel_size = 3
        self.action_size = action_size
        self.action_high = 1.5
        self.action_low = -self.action_high
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.tau = tau
        self.gamma = gamma
        self.lambd = lambd
        self.actor_delay = actor_delay
        self.target_noise = target_noise

        self.batch_size = batch_size
        self.memory_size = memory_size
        self.epsilon = epsilon
        self.epsilon_end = epsilon_end
        self.decay_step = decay_step
        self.epsilon_decay = (epsilon - epsilon_end) / decay_step

        if play:
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
            self.sess = tf.Session(config=tf.ConfigProto(
                gpu_options=gpu_options))
        else:
            self.sess = tf.Session()
        K.set_session(self.sess)

        self.actor, self.critic, self.critic2 = self.build_model()
        self.target_actor, self.target_critic, self.target_critic2 = self.build_model(
        )
        self.actor_update = self.build_actor_optimizer()
        self.critic_update = self.build_critic_optimizer()
        self.critic2_update = self.build_critic2_optimizer()
        self.sess.run(tf.global_variables_initializer())
        if load_model:
            self.load_model('./save_model/' + agent_name)

        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())
        self.target_critic2.set_weights(self.critic2.get_weights())

        self.memory = Memory(self.memory_size)
Example #7
    def __init__(self):
        self.env = gym.make("SpaceInvaders-v0")
        self.observation_size = self.env.observation_space
        self.action_size = self.env.action_space.n
        self.frame_size = [84, 84]
        self.stack_size = 4
        self.input_shape = [*self.frame_size, self.stack_size]

        MEMORY_SIZE = 30000
        self.memory = Memory(MEMORY_SIZE)
        self.INITIAL_MEMORY_SIZE = 10000

        self.EXPLORATION_RATE = 1
        self.EXPLORATION_DECAY = 0.9999
        self.EXPLORATION_MIN = 0.01
        self.UPDATE_MODEL_STEP = 10000
        self.TRAINING_FREQUENCY = 4

        self.model = Model(self.input_shape, self.action_size)
        self.model.update_model()
        self.sess_path = r"C:\Projects\Personal Projects\Saved_Sessions\Space_Invader\last_train_sess.pkl"
Example #8
class Agent:
    def __init__(self):
        self.env = gym.make("SpaceInvaders-v0")
        self.observation_size = self.env.observation_space
        self.action_size = self.env.action_space.n
        self.frame_size = [84, 84]
        self.stack_size = 4
        self.input_shape = [*self.frame_size, self.stack_size]

        MEMORY_SIZE = 30000
        self.memory = Memory(MEMORY_SIZE)
        self.INITIAL_MEMORY_SIZE = 10000

        self.EXPLORATION_RATE = 1
        self.EXPLORATION_DECAY = 0.9999
        self.EXPLORATION_MIN = 0.01
        self.UPDATE_MODEL_STEP = 10000
        self.TRAINING_FREQUENCY = 4

        self.model = Model(self.input_shape, self.action_size)
        self.model.update_model()
        self.sess_path = r"C:\Projects\Personal Projects\Saved_Sessions\Space_Invader\last_train_sess.pkl"

    def preprocess_frame(self, frame):
        '''
        FUNCTION: pre-process the frames for training
        - convert to greyscale
        - crop the edges
        - normalise the pixel values
        - resize the frame
        '''
        gray = rgb2gray(frame)
        cropped_frame = gray[8:-12, 4:-12]
        normalized_frame = cropped_frame / 255.0
        preprocessed_frame = transform.resize(normalized_frame,
                                              self.frame_size)

        return preprocessed_frame

    def stack_frames(self, stacked_frames, new_frame, is_new_episode):
        '''
        FUNCTION: Given a stacked frame, append a new frame to this stack    
        '''
        # Preprocess frame before stacking
        frame = self.preprocess_frame(new_frame)

        # if this is a new episode, fill the stack with copies of the frame
        if is_new_episode:
            stacked_frames = deque([
                np.zeros(self.frame_size, dtype=np.int)
                for i in range(0, self.stack_size)
            ],
                                   maxlen=self.stack_size)
            for _ in range(0, self.stack_size):
                stacked_frames.append(frame)
            stacked_states = np.stack(stacked_frames, axis=2)
        # else append the frame to the queue
        else:
            stacked_frames.append(frame)
            stacked_states = np.stack(stacked_frames, axis=2)

        return stacked_states, stacked_frames

    def test(self,
             n_episodes,
             model=None,
             memory=None,
             render=False,
             clip_reward=False):
        '''
        Play a game to test the environment
        '''
        avg_rewards = 0
        steps = []

        for i in range(1, n_episodes + 1):
            state = self.env.reset()
            stacked_frames = deque([
                np.zeros(self.frame_size, dtype=np.int)
                for i in range(0, self.stack_size)
            ],
                                   maxlen=self.stack_size)
            state, stacked_frames = self.stack_frames(stacked_frames, state,
                                                      True)
            done = False
            total_reward = 0
            step = 0

            while not done:
                if render:
                    self.env.render()
                    time.sleep(0.01)
                if model:
                    action = self.model.act(state, 0)
                else:
                    action = np.random.randint(self.model.action_space)
                state_next, reward, done, info = self.env.step(action)
                if clip_reward:
                    reward = np.sign(reward)
                if done:
                    state_next = np.zeros(self.frame_size, dtype=np.int)
                state_next, stacked_frames = self.stack_frames(
                    stacked_frames, state_next, False)
                if memory:
                    memory.store((state, action, reward, state_next, done))
                state = state_next
                total_reward += reward
                step += 1

            if render:
                self.env.close()

            avg_rewards = avg_rewards + 1 / (i) * (total_reward - avg_rewards)
            steps.append(step)

        print("The average rewards for {} runs is {}".format(
            n_episodes, avg_rewards))

        return steps, avg_rewards

    def initialise_memory(self):
        print("Start filling memory")

        while self.memory.memory_tree.capacity_filled < self.INITIAL_MEMORY_SIZE:
            steps, total_reward = self.test(1,
                                            model=None,
                                            memory=self.memory,
                                            clip_reward=True)

        print("Memory filled! The memory length is",
              self.memory.memory_tree.capacity_filled)

    def restore_and_test(self):
        self.model.load_model()
        self.test(1, model=self.model.DQNetwork, render=True)

    def run(self, continue_sess=False):
        # Delete log directory
        # if os.path.isdir(self.model.log_path):
        #     shutil.rmtree(self.model.log_path)

        if continue_sess:
            self.model.load_model()
            with open(self.sess_path, 'rb') as f:
                start_episode, total_steps, rewards, losses, self.EXPLORATION_RATE, self.memory = pickle.load(
                    f)
            print("Continuing training from episode: {},  step: {}, exploration_rate: {:.4f}, Memory Size: {}"\
                  .format(start_episode, total_steps, self.EXPLORATION_RATE, self.memory.memory_tree.capacity_filled))
        else:
            self.initialise_memory()
            total_steps = 0
            losses = []
            rewards = []
            start_episode = 0

        steps = []
        start_time = time.time()
        total_episodes = start_episode + 5
        for i in range(start_episode, total_episodes):

            # Make a new episode and observe the first state
            state = self.env.reset()
            stacked_frames = deque([
                np.zeros(self.frame_size, dtype=np.int)
                for i in range(0, self.stack_size)
            ],
                                   maxlen=self.stack_size)
            state, stacked_frames = self.stack_frames(stacked_frames, state,
                                                      True)

            # Reset episode counters
            episode_rewards = 0
            episode_steps = 0
            done = False

            while not done:
                episode_steps += 1
                total_steps += 1

                # Take a step
                action = self.model.act(state, self.EXPLORATION_RATE)
                next_state, reward, done, _ = self.env.step(action)

                # accumulate rewards
                reward = np.sign(reward)
                episode_rewards += reward

                # Tasks when done
                if done:
                    next_state = np.zeros(self.frame_size, dtype=np.int)
                    next_state, stacked_frames = self.stack_frames(
                        stacked_frames, next_state, False)
                    print("Episode {}, exploration rate: {:.4f}, final rewards: {}, final loss is {:.4f}, Time elapsed: {:.4f}"\
                          .format(i+1, self.EXPLORATION_RATE, episode_rewards, loss, time.time() - start_time))
                    # self.test(1, model = self.model.DQNetwork, render=False)
                    start_time = time.time()
                else:
                    next_state, stacked_frames = self.stack_frames(
                        stacked_frames, next_state, False)
                self.memory.store((state, action, reward, next_state, done))
                state = next_state

                # Update target model and save model every UPDATE_STEP
                if (total_steps % self.UPDATE_MODEL_STEP == 0):
                    self.model.update_model()
                    self.model.save_model()

                ### LEARNING PROCEDURE ###
                if total_steps % self.TRAINING_FREQUENCY == 0:
                    tree_idx, IS_weights, batch = self.memory.sample(
                        self.model.BATCH_SIZE)
                    loss, abs_TD_error = self.model.DQN_train(
                        batch, IS_weights, tree_idx, total_steps)
                    losses.append(loss)
                    # Update the sample priority of batch
                    self.memory.update_batch(tree_idx, abs_TD_error)
                    # Reduce the exploration every step
                    self.EXPLORATION_RATE *= self.EXPLORATION_DECAY
                    self.EXPLORATION_RATE = max(self.EXPLORATION_MIN,
                                                self.EXPLORATION_RATE)
                ### LEARNING PROCEDURE ###

            # Append values at the end of an episode
            steps.append(episode_steps)
            rewards.append(episode_rewards)

        # Save model at the end of training
        print("Training Done")
        self.model.update_model()
        self.model.save_model()

        # save variables to continue training
        with open(self.sess_path, 'wb') as f:  # Python 3: open(..., 'wb')
            pickle.dump([
                i + 1, total_steps, rewards, losses, self.EXPLORATION_RATE,
                self.memory
            ],
                        f,
                        protocol=-1)

        # Save plot
        plt.plot(rewards)
        plt.ylabel('Rewards')
        plt.xlabel('Episodes')
        plt.savefig('rewards.png')
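
A hypothetical entry point tying the methods of the class above together; it assumes the same module-level imports and the Model and Memory classes used in the snippet.

if __name__ == "__main__":
    agent = Agent()
    agent.run(continue_sess=False)   # fill memory, train for a few episodes, save and plot
    # agent.restore_and_test()       # or reload the last checkpoint and watch it play
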
Example #9
class Agent:
	"""
	The intelligent agent of the simulation. Set the model of the neural network used and general parameters.
	It is responsible to select the actions, optimize the neural network and manage the models.
	"""

	def __init__(self, action_set, train=True, load_path=None):
		#1. Initialize agent params
		self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
		self.action_set = action_set
		self.action_number = len(action_set)
		self.steps_done = 0
		self.epsilon = Config.EPS_START
		self.episode_durations = []

		#2. Build networks
		self.policy_net = DQN().to(self.device)
		self.target_net = DQN().to(self.device)
		
		self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=Config.LEARNING_RATE)

		if not train:		
			self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0)	
			self.policy_net.load(load_path, optimizer=self.optimizer)
			self.policy_net.eval()

		self.target_net.load_state_dict(self.policy_net.state_dict())
		self.target_net.eval()

		#3. Create Prioritized Experience Replay Memory
		self.memory = Memory(Config.MEMORY_SIZE)


	 
	def append_sample(self, state, action, next_state, reward):
		"""
		Save the sample (error, <s, a, s', r>) to the replay memory
		"""

		# Determine whether this is the end of the simulation
		done = True if next_state is None else False

		# Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken
		state_action_values = self.policy_net(state)
		state_action_values = state_action_values.gather(1, action.view(-1,1))

		
		if not done:
			# Compute argmax Q(s', a; θ)		
			next_state_actions = self.policy_net(next_state).max(1)[1].detach().unsqueeze(1)

			# Compute Q(s', argmax Q(s', a; θ), θ-)
			next_state_values = self.target_net(next_state).gather(1, next_state_actions).squeeze(1).detach()

			# Compute the expected Q values
			expected_state_action_values = (next_state_values * Config.GAMMA) + reward
		else:
			expected_state_action_values = reward


		error = abs(state_action_values - expected_state_action_values).data.cpu().numpy()


		self.memory.add(error, state, action, next_state, reward)

	def select_action(self, state, train=True):
		"""
		Select the best action according to the Q-values output by the neural network

		Parameters
		----------
			state: float ndarray
				The current state of the simulation
			train: bool
				Defines whether we are evaluating or training the model
		Returns
		-------
			a.max(1)[1]: int
				The action with the highest Q-value
			a.max(0): float
				The Q-value of the action taken
		"""
		global steps_done
		sample = random.random()
		#1. Perform an epsilon-greedy algorithm
		#a. set the value for epsilon
		self.epsilon = Config.EPS_END + (Config.EPS_START - Config.EPS_END) * \
			math.exp(-1. * self.steps_done / Config.EPS_DECAY)
			
		self.steps_done += 1

		#b. decide whether to select a random action or an action from the neural network
		if sample > self.epsilon or (not train):
			# select an action from the neural network
			with torch.no_grad():
				# a <- argmax Q(s, theta)
				a = self.policy_net(state)
				return a.max(1)[1].view(1, 1), a.max(0)
		else:
			# select a random action
			print('random action')
			return torch.tensor([[random.randrange(2)]], device=self.device, dtype=torch.long), None

	"""
	def select_action(self, state, train=True):
		
		Select the best action according to the Q-values output by the neural network

		Parameters
		----------
			state: float ndarray
				The current state of the simulation
			train: bool
				Defines whether we are evaluating or training the model
		Returns
		-------
			a.max(1)[1]: int
				The action with the highest Q-value
			a.max(0): float
				The Q-value of the action taken
		
		global steps_done
		sample = random.random()
		#1. Perform an epsilon-greedy algorithm
		#a. set the value for epsilon
		self.epsilon = Config.EPS_END + (Config.EPS_START - Config.EPS_END) * \
			math.exp(-1. * self.steps_done / Config.EPS_DECAY)
			
		self.steps_done += 1

		#b. make the decision for selecting a random action or selecting an action from the neural network
		if sample > self.epsilon or (not train):
			# select an action from the neural network
			with torch.no_grad():
				# a <- argmax Q(s, theta)
				# setting the network to train mode is important to enable dropout
				self.policy_net.train()
				output_list = []
				# Retrieve the outputs from the neural network feedforward n times to build a statistical model
				for i in range(Config.STOCHASTIC_PASSES):
					#print(agent.policy_net(data))
					output_list.append(torch.unsqueeze(F.softmax(self.policy_net(state)), 0))
					#print(output_list[i])

				self.policy_net.eval()
				# The result of the network is the mean of n passes
				output_mean = torch.cat(output_list, 0).mean(0)
				q_value = output_mean.data.cpu().numpy().max()
				action = output_mean.max(1)[1].view(1, 1)

				uncertainty = torch.cat(output_list, 0).var(0).mean().item()
				
				return action, q_value, uncertainty
				
		else:
			# select a random action
			print('random action')
			return torch.tensor([[random.randrange(2)]], device=self.device, dtype=torch.long), None, None

	"""
	def optimize_model(self):
		"""
		Perform one step of optimization on the neural network
		"""

		if self.memory.tree.n_entries < Config.BATCH_SIZE:
			return
		transitions, idxs, is_weights = self.memory.sample(Config.BATCH_SIZE)

		# Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for detailed explanation).
		batch = Transition(*zip(*transitions))

		# Compute a mask of non-final states and concatenate the batch elements
		non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
											  batch.next_state)), device=self.device, dtype=torch.uint8)
		non_final_next_states = torch.cat([s for s in batch.next_state
													if s is not None])
		
		state_batch = torch.cat(batch.state)
		action_batch = torch.cat(batch.action)
		reward_batch = torch.cat(batch.reward)
		
		# Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken
		state_action_values = self.policy_net(state_batch).gather(1, action_batch)
		
	
		# Compute argmax Q(s', a; θ)		
		next_state_actions = self.policy_net(non_final_next_states).max(1)[1].detach().unsqueeze(1)

		# Compute Q(s', argmax Q(s', a; θ), θ-)
		next_state_values = torch.zeros(Config.BATCH_SIZE, device=self.device)
		next_state_values[non_final_mask] = self.target_net(non_final_next_states).gather(1, next_state_actions).squeeze(1).detach()

		# Compute the expected Q values
		expected_state_action_values = (next_state_values * Config.GAMMA) + reward_batch

		# Update priorities
		errors = torch.abs(state_action_values.squeeze() - expected_state_action_values).data.cpu().numpy()
		
		# update priority
		for i in range(Config.BATCH_SIZE):
			idx = idxs[i]
			self.memory.update(idx, errors[i])


		# Compute Huber loss
		loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))
		loss_return = loss.item()

		# Optimize the model
		self.optimizer.zero_grad()
		loss.backward()
		for param in self.policy_net.parameters():
			param.grad.data.clamp_(-1, 1)
		self.optimizer.step()

		return loss_return

	def save(self, step, logs_path, label):
		"""
		Save the model on hard disc

		Parameters
		----------
			step: int
				current step on the simulation
			logs_path: string
				path to where we will store the model
			label: string
				label that will be used to store the model
		"""

		os.makedirs(logs_path + label, exist_ok=True)

		full_label = label + str(step) + '.pth'
		logs_path = os.path.join(logs_path, label, full_label)

		self.policy_net.save(logs_path, step=step, optimizer=self.optimizer)
	
	def restore(self, logs_path):
		"""
		Load the model from hard disc

		Parameters
		----------
			logs_path: string
				path from which we will load the model
		"""
		self.policy_net.load(logs_path)
		self.target_net.load(logs_path)
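
A hedged sketch of how the methods above are typically combined in a training loop. The environment wrapper, the state preprocessing that yields torch tensors, and Config.NUM_EPISODES are assumptions, not part of the original snippet.

import torch

agent = Agent(action_set=[0, 1], train=True)

for episode in range(Config.NUM_EPISODES):                    # hypothetical constant
    state = get_initial_state()                               # assumed to return a torch tensor batch
    done = False
    while not done:
        action, _ = agent.select_action(state)
        next_state, reward, done = env_step(action.item())    # assumed environment wrapper
        reward = torch.tensor([reward], device=agent.device, dtype=torch.float)
        agent.append_sample(state, action, None if done else next_state, reward)
        loss = agent.optimize_model()                         # returns None until the memory warms up
        state = next_state
    agent.save(episode, logs_path='./logs/', label='dqn')     # hypothetical paths
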
Example #10
class TD3Agent(object):
    def __init__(self, state_size, action_size, actor_lr, critic_lr, tau,
                 gamma, lambd, batch_size, memory_size, actor_delay,
                 target_noise, epsilon, epsilon_end, decay_step, load_model,
                 play):
        self.state_size = state_size
        self.vel_size = 3
        self.action_size = action_size
        self.action_high = 1.5
        self.action_low = -self.action_high
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.tau = tau
        self.gamma = gamma
        self.lambd = lambd
        self.actor_delay = actor_delay
        self.target_noise = target_noise

        self.batch_size = batch_size
        self.memory_size = memory_size
        self.epsilon = epsilon
        self.epsilon_end = epsilon_end
        self.decay_step = decay_step
        self.epsilon_decay = (epsilon - epsilon_end) / decay_step

        if play:
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
            self.sess = tf.Session(config=tf.ConfigProto(
                gpu_options=gpu_options))
        else:
            self.sess = tf.Session()
        K.set_session(self.sess)

        self.actor, self.critic, self.critic2 = self.build_model()
        self.target_actor, self.target_critic, self.target_critic2 = self.build_model(
        )
        self.actor_update = self.build_actor_optimizer()
        self.critic_update = self.build_critic_optimizer()
        self.critic2_update = self.build_critic2_optimizer()
        self.sess.run(tf.global_variables_initializer())
        if load_model:
            self.load_model('./save_model/' + agent_name)

        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())
        self.target_critic2.set_weights(self.critic2.get_weights())

        self.memory = Memory(self.memory_size)

    def build_model(self):
        # shared network
        # image process
        image = Input(shape=self.state_size)
        image_process = BatchNormalization()(image)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3),
                   activation='elu',
                   padding='same',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3),
                   activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3),
                   activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(MaxPooling2D((3, 3)))(image_process)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3),
                   activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3),
                   activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(MaxPooling2D((2, 2)))(image_process)
        image_process = TimeDistributed(
            Conv2D(32, (3, 3),
                   activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(
            Conv2D(32, (3, 3),
                   activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(MaxPooling2D((2, 2)))(image_process)
        image_process = TimeDistributed(
            Conv2D(16, (3, 3),
                   activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(MaxPooling2D((2, 2)))(image_process)
        image_process = TimeDistributed(
            Conv2D(8, (1, 1), activation='elu',
                   kernel_initializer='he_normal'))(image_process)
        image_process = TimeDistributed(Flatten())(image_process)
        image_process = GRU(48, kernel_initializer='he_normal',
                            use_bias=False)(image_process)
        image_process = BatchNormalization()(image_process)
        image_process = Activation('tanh')(image_process)

        # vel process
        vel = Input(shape=[self.vel_size])
        vel_process = Dense(48, kernel_initializer='he_normal',
                            use_bias=False)(vel)
        vel_process = BatchNormalization()(vel_process)
        vel_process = Activation('tanh')(vel_process)

        # state process
        # state_process = Concatenate()([image_process, vel_process])
        state_process = Add()([image_process, vel_process])

        # Actor
        policy = Dense(32, kernel_initializer='he_normal',
                       use_bias=False)(state_process)
        policy = BatchNormalization()(policy)
        policy = ELU()(policy)
        policy = Dense(32, kernel_initializer='he_normal',
                       use_bias=False)(policy)
        policy = BatchNormalization()(policy)
        policy = ELU()(policy)
        policy = Dense(self.action_size,
                       kernel_initializer=tf.random_uniform_initializer(
                           minval=-3e-3, maxval=3e-3))(policy)
        policy = Lambda(
            lambda x: K.clip(x, self.action_low, self.action_high))(policy)
        actor = Model(inputs=[image, vel], outputs=policy)

        # Critic
        action = Input(shape=[self.action_size])
        action_process = Dense(48,
                               kernel_initializer='he_normal',
                               use_bias=False)(action)
        action_process = BatchNormalization()(action_process)
        action_process = Activation('tanh')(action_process)
        state_action = Add()([state_process, action_process])

        Qvalue = Dense(32, kernel_initializer='he_normal',
                       use_bias=False)(state_action)
        Qvalue = BatchNormalization()(Qvalue)
        Qvalue = ELU()(Qvalue)
        Qvalue = Dense(32, kernel_initializer='he_normal',
                       use_bias=False)(Qvalue)
        Qvalue = BatchNormalization()(Qvalue)
        Qvalue = ELU()(Qvalue)
        Qvalue = Dense(1,
                       kernel_initializer=tf.random_uniform_initializer(
                           minval=-3e-3, maxval=3e-3))(Qvalue)
        critic = Model(inputs=[image, vel, action], outputs=Qvalue)

        # Critic2
        action = Input(shape=[self.action_size])
        action_process2 = Dense(48,
                                kernel_initializer='he_normal',
                                use_bias=False)(action)
        action_process2 = BatchNormalization()(action_process2)
        action_process2 = Activation('tanh')(action_process2)
        state_action2 = Add()([state_process, action_process2])

        Qvalue2 = Dense(32, kernel_initializer='he_normal',
                        use_bias=False)(state_action2)
        Qvalue2 = BatchNormalization()(Qvalue2)
        Qvalue2 = ELU()(Qvalue2)
        Qvalue2 = Dense(32, kernel_initializer='he_normal',
                        use_bias=False)(Qvalue2)
        Qvalue2 = BatchNormalization()(Qvalue2)
        Qvalue2 = ELU()(Qvalue2)
        Qvalue2 = Dense(1,
                        kernel_initializer=tf.random_uniform_initializer(
                            minval=-3e-3, maxval=3e-3))(Qvalue2)
        critic2 = Model(inputs=[image, vel, action], outputs=Qvalue2)

        actor._make_predict_function()
        critic._make_predict_function()
        critic2._make_predict_function()

        return actor, critic, critic2

    def build_actor_optimizer(self):
        pred_Q = self.critic.output
        action_grad = tf.gradients(pred_Q, self.critic.input[2])
        target = -action_grad[0] / self.batch_size
        params_grad = tf.gradients(self.actor.output,
                                   self.actor.trainable_weights, target)
        params_grad, global_norm = tf.clip_by_global_norm(params_grad, 5.0)
        grads = zip(params_grad, self.actor.trainable_weights)
        optimizer = tf.train.AdamOptimizer(self.actor_lr)
        updates = optimizer.apply_gradients(grads)
        train = K.function(
            [self.actor.input[0], self.actor.input[1], self.critic.input[2]],
            [global_norm],
            updates=[updates])
        return train

    def build_critic_optimizer(self):
        y = K.placeholder(shape=(None, 1), dtype='float32')
        pred = self.critic.output

        loss = K.mean(K.square(pred - y))
        # Huber Loss
        # error = K.abs(y - pred)
        # quadratic = K.clip(error, 0.0, 1.0)
        # linear = error - quadratic
        # loss = K.mean(0.5 * K.square(quadratic) + linear)

        optimizer = Adam(lr=self.critic_lr)
        updates = optimizer.get_updates(self.critic.trainable_weights, [],
                                        loss)
        train = K.function([
            self.critic.input[0], self.critic.input[1], self.critic.input[2], y
        ], [pred, loss],
                           updates=updates)
        return train

    def build_critic2_optimizer(self):
        y = K.placeholder(shape=(None, 1), dtype='float32')
        pred = self.critic2.output

        loss = K.mean(K.square(pred - y))
        # # Huber Loss
        # error = K.abs(y - pred)
        # quadratic = K.clip(error, 0.0, 1.0)
        # linear = error - quadratic
        # loss = K.mean(0.5 * K.square(quadratic) + linear)

        optimizer = Adam(lr=self.critic_lr)
        updates = optimizer.get_updates(self.critic2.trainable_weights, [],
                                        loss)
        train = K.function([
            self.critic2.input[0], self.critic2.input[1],
            self.critic2.input[2], y
        ], [loss],
                           updates=updates)
        return train

    def get_action(self, state):
        policy = self.actor.predict(state)[0]
        noise = np.random.normal(0, self.epsilon, self.action_size)
        action = np.clip(policy + noise, self.action_low, self.action_high)
        return action, policy

    def train_model(self):
        batch, idxs, _ = self.memory.sample(self.batch_size)

        images = np.zeros([self.batch_size] + self.state_size)
        vels = np.zeros([self.batch_size, self.vel_size])
        actions = np.zeros((self.batch_size, self.action_size))
        rewards = np.zeros((self.batch_size, 1))
        next_images = np.zeros([self.batch_size] + self.state_size)
        next_vels = np.zeros([self.batch_size, self.vel_size])
        dones = np.zeros((self.batch_size, 1))

        targets = np.zeros((self.batch_size, 1))

        for i, sample in enumerate(batch):
            images[i], vels[i] = sample[0]
            actions[i] = sample[1]
            rewards[i] = sample[2]
            next_images[i], next_vels[i] = sample[3]
            dones[i] = sample[4]
        states = [images, vels]
        next_states = [next_images, next_vels]
        policy = self.actor.predict(states)
        target_actions = self.target_actor.predict(next_states)
        target_noises = np.random.normal(0, self.target_noise,
                                         target_actions.shape)
        target_actions = np.clip(target_actions + target_noises,
                                 self.action_low, self.action_high)

        target_next_Qs1 = self.target_critic.predict(next_states +
                                                     [target_actions])
        target_next_Qs2 = self.target_critic2.predict(next_states +
                                                      [target_actions])
        target_next_Qs = np.minimum(target_next_Qs1, target_next_Qs2)
        targets = rewards + self.gamma * (1 - dones) * target_next_Qs

        critic_loss = 0
        for _ in range(self.actor_delay):
            pred, c_loss = self.critic_update(states + [actions, targets])
            c2_loss = self.critic2_update(states + [actions, targets])
            critic_loss += c_loss + c2_loss[0]
        actor_loss = self.actor_update(states + [policy])
        tds = np.abs(pred - targets)
        for i in range(self.batch_size):
            idx = idxs[i]
            self.memory.update(idx, tds[i])

        return actor_loss[0], critic_loss / (self.actor_delay * 2.0)

    def append_memory(self, state, action, reward, next_state, done):
        Q = self.critic.predict(state + [action.reshape(1, -1)])[0]
        target_action = self.target_actor.predict(next_state)[0]
        target_Q1 = self.target_critic.predict(
            next_state + [target_action.reshape(1, -1)])[0]
        target_Q2 = self.target_critic2.predict(
            next_state + [target_action.reshape(1, -1)])[0]
        target_Q = np.minimum(target_Q1, target_Q2)
        td = reward + (1 - done) * self.gamma * target_Q - Q
        td = float(abs(td[0]))
        self.memory.add(td, (state, action, reward, next_state, done))
        return td

    def load_model(self, name):
        if os.path.exists(name + '_actor.h5'):
            self.actor.load_weights(name + '_actor.h5')
            print('Actor loaded')
        if os.path.exists(name + '_critic.h5'):
            self.critic.load_weights(name + '_critic.h5')
            print('Critic loaded')
        if os.path.exists(name + '_critic2.h5'):
            self.critic2.load_weights(name + '_critic2.h5')
            print('Critic2 loaded')

    def save_model(self, name):
        self.actor.save_weights(name + '_actor.h5')
        self.critic.save_weights(name + '_critic.h5')
        self.critic2.save_weights(name + '_critic2.h5')

    def update_target_model(self):
        self.target_actor.set_weights(
            self.tau * np.array(self.actor.get_weights()) \
            + (1 - self.tau) * np.array(self.target_actor.get_weights())
        )
        self.target_critic.set_weights(
            self.tau * np.array(self.critic.get_weights()) \
            + (1 - self.tau) * np.array(self.target_critic.get_weights())
        )
        self.target_critic2.set_weights(
            self.tau * np.array(self.critic2.get_weights()) \
            + (1 - self.tau) * np.array(self.target_critic2.get_weights())
        )
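
A standalone NumPy recap of the target built in train_model above: clipped Gaussian noise perturbs the target policy's actions, and the bootstrap term uses the element-wise minimum of the two target critics on non-terminal steps.

import numpy as np

def smoothed_target_actions(target_actions, target_noise, low, high):
    # Target policy smoothing: perturb and clip the target actor's actions.
    noise = np.random.normal(0.0, target_noise, target_actions.shape)
    return np.clip(target_actions + noise, low, high)

def clipped_double_q_targets(rewards, dones, target_q1, target_q2, gamma):
    # Clipped double-Q: bootstrap from the smaller of the two target critics.
    return rewards + gamma * (1.0 - dones) * np.minimum(target_q1, target_q2)
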
Example #11
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50,
                    selfishness=0.5):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        state_processor: A StateProcessor object
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of random experiences to sample when initializing
          the replay memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the
          target estimator every N steps
        discount_factor: Gamma discount factor
        epsilon_start: Chance to sample a random action when taking an action.
          Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    Transition = namedtuple(
        "Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_memory = []

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes),
                                  episode_kills=np.zeros(num_episodes),
                                  episode_coins=np.zeros(num_episodes),
                                  episode_levels=np.zeros(num_episodes),
                                  episode_total_death=np.zeros(num_episodes),
                                  episode_distance=np.zeros(num_episodes),
                                  episode_death=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # print(latest_checkpoint)
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)

    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    if POLICY == 'BOLTZMAN':
        policy = make_boltzmann_policy(q_estimator, len(VALID_ACTIONS))
    else:
        policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    total_state = env.reset(levelname=LEVEL_NAME)
    state = state_processor.process(sess, total_state, 1)
    state = np.stack([state] * WINDOW_LENGTH, axis=0)

    # state = np.stack([state] * WINDOW_LENGTH, axis=2)
    total_death = 0

    total_state = np.stack([state], axis=0)

    if USE_MEMORY:
        fn = '{}_{}'.format(LEVEL_NAME, 100)
        replay_memory = load_memory(fn, 100, REPLAY_MEMORY_SIZE)
        print(len(replay_memory))
        if PRIORITIZE_MEMORY:
            print('Creating priority memory')
            memory = prioritzie_replay(replay_memory)

    else:
        if env.headless:

            for i in range(replay_memory_init_size):
                action_probs = policy(
                    sess, state, epsilons[min(total_t,
                                              epsilon_decay_steps - 1)])

                action_probs = action_probs + [0, 1, 0, 0, 1, 0]
                action_probs = softmax(action_probs)
                action = np.random.choice(np.arange(len(action_probs)),
                                          p=action_probs)

                next_total_state, reward, done, info = env.step(
                    VALID_ACTIONS[action])
                next_state = state_processor.process(sess, next_total_state, 1)
                next_state = np.append(state[1:, :, :],
                                       np.expand_dims(next_state, 0),
                                       axis=0)
                next_total_state = np.stack([next_state], axis=0)

                replay_memory.append(
                    Transition(total_state, action, reward, next_total_state,
                               done))

                if done:
                    total_state = env.reset(levelname=LEVEL_NAME)
                    state = state_processor.process(sess, total_state, 1)

                    state = np.stack([state] * WINDOW_LENGTH, axis=0)
                    total_state = np.stack([state], axis=0)
                else:
                    state = next_state
                    total_state = next_total_state

        print('Memory is filled')

        if PRIORITIZE_MEMORY:
            memory = Memory(ER_SIZE)
            for exp in replay_memory:
                memory.store(exp)

        replay_memory = memory

    # Record videos
    # Use the gym env Monitor wrapper
    # env = MarioGym(headless=False, level_name='Level-5-coins.json', no_coins=5)
    env = Monitor(env,
                  directory=monitor_path,
                  resume=True,
                  video_callable=lambda count: count % record_video_every == 0)

    total_deaths = 0

    for i_episode in range(num_episodes):

        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        total_state = env.reset(levelname=LEVEL_NAME)
        state = state_processor.process(sess, total_state, 1)
        state = np.stack([state] * WINDOW_LENGTH, axis=0)
        total_state = np.stack([state], axis=0)

        loss = None
        dist = 0

        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            # Add epsilon to Tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)

            # Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)
                print("\nCopied model parameters to target network.")

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                t, total_t, i_episode + 1, num_episodes, loss),
                  end="")
            sys.stdout.flush()

            # Take a step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)
            next_total_state, reward, done, info = env.step(
                VALID_ACTIONS[action])
            level_up = 0
            next_state = state_processor.process(sess, next_total_state, 1)
            next_state = np.append(state[1:, :, :],
                                   np.expand_dims(next_state, 0),
                                   axis=0)

            next_total_state = np.stack([next_state], axis=0)

            # If our replay memory is full, pop the first element
            # if replay_memory.tree.data_pointer == replay_memory_size:
            #     replay_memory.pop(0)

            # Save transition to replay memory
            memory.store(
                Transition(total_state, action, reward, next_total_state,
                           done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            stats.episode_kills[i_episode] += info['num_killed']
            stats.episode_coins[i_episode] += info['coins_taken']
            stats.episode_levels[i_episode] += level_up
            stats.episode_death[i_episode] += info['death']

            if info['death']:
                total_deaths += 1
            if env.mario.rect.x > dist:
                dist = env.mario.rect.x
            stats.episode_total_death[i_episode] = total_deaths

            tree_idx, batch, ISWeights = memory.sample(batch_size)

            states_batch = np.array([each[0][0] for each in batch], ndmin=3)
            action_batch = np.array([each[0][1] for each in batch])
            reward_batch = np.array([each[0][2] for each in batch])
            next_states_batch = np.array([each[0][3] for each in batch],
                                         ndmin=3)
            done_batch = np.array([each[0][4] for each in batch])

            # Calculate q values and targets (Double DQN)
            next_states_batch = next_states_batch.squeeze()
            q_values_next = q_estimator.predict(sess, next_states_batch)

            q_values_next_total = q_values_next

            best_actions = np.argmax(q_values_next_total, axis=1)
            q_values_next_target = target_estimator.predict(
                sess, next_states_batch)

            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \
                            (discount_factor * q_values_next_target[np.arange(batch_size), best_actions])

            # Perform gradient descent update
            states_batch = states_batch.squeeze()
            targets_batch = targets_batch.squeeze()

            loss1, abs_error = q_estimator.update(sess, states_batch,
                                                  action_batch, targets_batch,
                                                  ISWeights)

            loss = loss1

            if done:
                break

            state = next_state
            total_state = next_total_state
            total_t += 1

        stats.episode_distance[i_episode] = dist
        stats.episode_levels[i_episode] += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(
            simple_value=stats.episode_rewards[i_episode],
            node_name="episode_reward",
            tag="episode_reward")
        episode_summary.value.add(
            simple_value=stats.episode_lengths[i_episode],
            node_name="episode_length",
            tag="episode_length")
        episode_summary.value.add(simple_value=stats.episode_kills[i_episode],
                                  node_name="episode_kills",
                                  tag="episode_kills")
        episode_summary.value.add(simple_value=stats.episode_coins[i_episode],
                                  node_name="episode_coins",
                                  tag="episode_coins")
        episode_summary.value.add(simple_value=stats.episode_levels[i_episode],
                                  node_name="episode_levels",
                                  tag="episode_levels")
        episode_summary.value.add(
            simple_value=stats.episode_total_death[i_episode],
            node_name="episode_total_death",
            tag="episode_total_death")
        episode_summary.value.add(
            simple_value=stats.episode_distance[i_episode],
            node_name="episode_distance",
            tag="episode_distance")
        episode_summary.value.add(simple_value=stats.episode_death[i_episode],
                                  node_name="episode_death",
                                  tag="episode_death")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode + 1],
            episode_rewards=stats.episode_rewards[:i_episode + 1],
            episode_kills=stats.episode_kills[:i_episode + 1],
            episode_coins=stats.episode_coins[:i_episode + 1],
            episode_levels=stats.episode_levels[:i_episode + 1],
            episode_distance=stats.episode_distance[:i_episode + 1],
            episode_total_death=stats.episode_total_death[:i_episode + 1],
            episode_death=stats.episode_death[:i_episode + 1])
    env.monitor.close()
    return stats
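
A standalone recap of the Double-DQN target computed inside the loop above: the online estimator picks the greedy action, the target estimator evaluates it, and terminal transitions drop the bootstrap term.

import numpy as np

def double_dqn_targets(reward_batch, done_batch, q_online_next, q_target_next, gamma):
    # done_batch is assumed to be a boolean array, as in the sampled batches above.
    best_actions = np.argmax(q_online_next, axis=1)
    bootstrap = q_target_next[np.arange(len(reward_batch)), best_actions]
    return reward_batch + np.invert(done_batch).astype(np.float32) * gamma * bootstrap
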
Example #12
def prioritzie_replay(replay_memory):
    memory = Memory(ER_SIZE)
    for exp in replay_memory:
        memory.store(exp)
    return memory
Example #13
class PDQNagent(Model.nn_model):
    def __init__(
        self,
        state_shape,
        action_size,
        main_network,
        target_network,
        batch_size=50,
        gamma=0.95,
        memory_len=100000,
        epsilon_init=1,
        epsilon_min=0.1,
        epsilon_decay=0.99,
        learning_rate=0.01,
        target_update_step=500,
        Double_Q=True,
        PER_alpha=0.6,
        PER_beta=0.4,
        PER_beta_increment=0.001,
    ):

        self.state_shape = state_shape
        self.action_size = action_size
        self.memory = Memory(memory_len,
                             alpha=PER_alpha,
                             beta=PER_beta,
                             beta_increment=PER_beta_increment)
        self.gamma = gamma  # discount rate
        self.epsilon = epsilon_init  # exploration rate
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate
        self.Double_Q = Double_Q
        self.target_update_step = target_update_step

        self.optimizer = tf.keras.optimizers.RMSprop(
            learning_rate=learning_rate)
        self.loss_function = tf.keras.losses.Huber(
            reduction=tf.keras.losses.Reduction.NONE)

        self.loss_metric = tf.keras.metrics.Mean(name='loss')
        self.avg_q_metric = tf.keras.metrics.Mean(name='avg_q')

        self.total_update_step = 0
        self.batch_size = batch_size

        self.main_network = main_network
        self.target_network = target_network
        self.update_target()

    def update_target(self):
        self.target_network.set_weights(self.main_network.get_weights())

    def memorize(self, state, action, reward, next_state, done):
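        # one-hot encode the action so train() can pick out Q(s, a) with an
        # element-wise product and a sum over the action axis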
        act = np.zeros(self.action_size)
        act[action] = 1

        self.memory.store((state, act, reward, next_state, done))

    def act(self, state, use_epsilon=True):
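        # epsilon-greedy: explore with probability epsilon, otherwise act
        # greedily on the online network's Q-values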
        if np.random.rand() <= self.epsilon and use_epsilon:
            return random.randrange(self.action_size)
        act_values = self.main_network.predict(np.expand_dims(state, axis=0))
        return np.argmax(act_values[0])  # returns action

    def get_batch(self):

        batch_size = self.batch_size
        if batch_size > self.memory.data_num:
            batch_size = self.memory.data_num

        tree_indexs, datas, ISweights = self.memory.sample(batch_size)
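        # transpose the sampled transitions into per-field arrays; the tree
        # indices are kept for the later priority update and the
        # importance-sampling weights for the loss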

        result = []
        for i in range(len(datas[0])):
            temp = []
            for experience in datas:
                temp.append(experience[i])
            result.append(np.array(temp))

        return result, tree_indexs, np.array(ISweights)

    def train(self):
        experiences, tree_indexs, ISweights = self.get_batch()

        states, actions, rewards, next_states, dones = experiences

        if self.Double_Q:
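            # Double DQN: select next actions with the online network,
            # evaluate them with the target network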
            indices = np.argmax(self.main_network.predict(next_states), axis=1)
            indices = np.expand_dims(indices, axis=1)
            qvalues = self.target_network.predict(next_states)
            qvalues = np.take_along_axis(qvalues, indices, axis=1)
            qvalues = np.squeeze(qvalues, axis=1)
        else:
            qvalues = self.target_network.predict(next_states)
            qvalues = np.max(qvalues, axis=1)

        # NOTE: `dones` is treated as a continuation mask (1.0 non-terminal,
        # 0.0 terminal); if the raw `done` flag is stored, use (1 - dones) instead
        targets = rewards + dones * (self.gamma * qvalues)

        with tf.GradientTape() as tape:
            action_values = self.main_network(states)
            average_action_value = tf.reduce_mean(action_values)

            action_value = tf.reduce_sum(tf.multiply(action_values, actions),
                                         axis=1)

            td_errors = self.loss_function(targets, action_value)
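            # weight each sample's Huber loss by its importance-sampling
            # weight to correct the bias introduced by prioritized sampling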
            loss = tf.reduce_mean(td_errors * ISweights)

        gradients = tape.gradient(loss, self.main_network.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradients, self.main_network.trainable_variables))

        # update the sampled transitions' priorities with the new per-sample TD errors
        priority = td_errors.numpy()
        self.memory.batch_update(tree_indexs, priority)

        self.loss_metric(loss)
        self.avg_q_metric(average_action_value)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        self.total_update_step += 1
        if self.total_update_step % self.target_update_step == 0:
            self.update_target()

        history = {
            "loss": self.loss_metric.result().numpy(),
            "avg_q": self.avg_q_metric.result().numpy()
        }
        return history
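
# Hypothetical training-loop sketch (not part of the original example) showing
# how a constructed PDQNagent could be driven on a classic-gym environment.
# Names like run_pdqn and the environment are illustrative; note that train()
# bootstraps with `dones` as a continuation mask, so 0.0 is stored for
# terminal transitions here.
import gym

def run_pdqn(agent, env_name="CartPole-v1", episodes=200, train_start=1000):
    env = gym.make(env_name)
    step = 0
    for episode in range(episodes):
        state, done, episode_reward = env.reset(), False, 0.0
        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.memorize(state, action, reward, next_state,
                           0.0 if done else 1.0)   # continuation mask (see train)
            state = next_state
            episode_reward += reward
            step += 1
            if step > train_start:
                agent.train()                      # one PER-weighted update
        print("episode:", episode, " reward:", episode_reward)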
Exemplo n.º 14
0
class ApexDQN():
    def __init__(
        self,
        state_shape,
        action_size,
        network_builder,
        env_class,
        actor_num=8,
        batch_size=50,
        gamma=0.95,
        memory_len=100000,
        epsilon_max=0.4,
        epsilon_alpha=7,
        learning_rate=0.001,
        target_update_step=50,
        Double_Q=True,
        train_start_step=10000,
        local_network_update_step=50,
        n_step=3,
    ):

        self.state_shape = state_shape
        self.action_size = action_size
        self.env_class = env_class
        self.memory = Memory(memory_len)
        self.gamma = gamma  # discount rate
        self.epsilon_max = epsilon_max  # exploration rate
        self.epsilon_alpha = epsilon_alpha
        self.learning_rate = learning_rate
        self.Double_Q = Double_Q
        self.target_update_step = target_update_step
        self.network_builder = network_builder
        self.actor_num = actor_num
        self.train_start_step = train_start_step
        self.local_network_update_step = local_network_update_step
        self.n_step = n_step

        self.history = {"reward": [], "loss": [], "avg_q": [], "step": 0}
        self.end_training = [False]

        self.optimizer = tf.keras.optimizers.RMSprop(
            learning_rate=learning_rate)
        self.loss_function = tf.keras.losses.Huber(
            reduction=tf.keras.losses.Reduction.NONE)

        self.loss_metric = tf.keras.metrics.Mean(name='loss')
        self.avg_q_metric = tf.keras.metrics.Mean(name='avg_q')

        self.total_update_step = 0
        self.batch_size = batch_size

        self.main_network = network_builder()
        self.target_network = network_builder()
        self.update_target()

        self.generate_local_networks()

        self.device = '/gpu:0'  # learner updates run here; use '/cpu:0' if no GPU is available

    def generate_local_networks(self):
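        # build an (online, target) network pair per actor; the actors refresh
        # these from the learner every local_network_update_step steps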
        local_networks = []
        for _ in range(self.actor_num):
            local_networks.append(
                (self.network_builder(), self.network_builder()))

        self.local_networks = local_networks

    def update_target(self):
        self.target_network.set_weights(self.main_network.get_weights())

    def get_batch(self):

        batch_size = self.batch_size
        if batch_size > self.memory.data_num:
            batch_size = self.memory.data_num

        tree_indexs, datas, ISweights = self.memory.sample(batch_size)

        result = []
        for i in range(len(datas[0])):
            temp = []
            for experience in datas:
                temp.append(experience[i])
            result.append(np.array(temp))

        return result, tree_indexs, np.array(ISweights)

    def train(self):
        with tf.device(self.device):
            experiences, tree_indexs, ISweights = self.get_batch()

            states, actions, rewards, next_states, dones = experiences

            if self.Double_Q:
                indices = np.argmax(self.main_network.predict(next_states),
                                    axis=1)
                indices = np.expand_dims(indices, axis=1)
                qvalues = self.target_network.predict(next_states)
                qvalues = np.take_along_axis(qvalues, indices, axis=1)
                qvalues = np.squeeze(qvalues, axis=1)
            else:
                qvalues = self.target_network.predict(next_states)
                qvalues = np.max(qvalues, axis=1)

            # `dones` again acts as a continuation mask (0.0 at terminal states)
            targets = rewards + dones * (self.gamma * qvalues)

            with tf.GradientTape() as tape:
                action_values = self.main_network(states)
                average_action_value = tf.reduce_mean(action_values)

                action_value = tf.reduce_sum(tf.multiply(
                    action_values, actions),
                                             axis=1)

                td_errors = self.loss_function(targets, action_value)
                loss = tf.reduce_mean(td_errors * ISweights)

            gradients = tape.gradient(loss,
                                      self.main_network.trainable_variables)
            self.optimizer.apply_gradients(
                zip(gradients, self.main_network.trainable_variables))

            # update the sampled transitions' priorities with the new per-sample TD errors
            priority = td_errors.numpy()
            self.memory.batch_update(tree_indexs, priority)

            self.loss_metric(loss)
            self.avg_q_metric(average_action_value)

            self.total_update_step += 1
            if self.total_update_step % self.target_update_step == 0:
                self.update_target()

            history = {
                "loss": self.loss_metric.result().numpy(),
                "avg_q": self.avg_q_metric.result().numpy()
            }
        return history

    def main(self, train_until, plot_every):
        self.end_training[0] = False

        history = self.history
        plot_metrics = ["reward", "loss", "avg_q"]

        print("generating actors")

        actors = []
        for i in range(self.actor_num):

            # per-actor exploration rate from Ape-X (Horgan et al., 2018):
            # eps_i = eps_max ** (1 + i * alpha / (N - 1)), so actor 0 explores
            # the most and higher-index actors progressively less
            epsilon = self.epsilon_max**(
                1 + i / (self.actor_num - 1) * self.epsilon_alpha)

            actors.append(
                Actor(self.state_shape, self.action_size,
                      self.local_networks[i][0], self.local_networks[i][1],
                      self.main_network, self.target_network, self.memory,
                      epsilon, self.Double_Q, self.loss_function,
                      self.end_training, self.gamma, self.env_class(), history,
                      self.local_network_update_step, self.n_step))

        print("start acting for {} steps".format(self.train_start_step))
        for actor in actors:
            actor.start()

        # wait until the actors have collected train_start_step environment steps
        while self.history["step"] < self.train_start_step:
            time.sleep(1)
            print("step :", self.history["step"])

        print("start training")

        metrics = ["loss", "avg_q"]

        episode = 0
        train_num = 0

        while train_num <= train_until:
            episode = len(history["reward"])

            hist = self.train()
            train_num += 1

            for met in metrics:
                history[met].append(hist[met])

            if train_num % plot_every == 0:
                for met in plot_metrics:
                    plotter.plotWithSmooth(history[met], met)

            print("episodes :", episode, "  train_num :", train_num,
                  "  memory len :", self.memory.data_num)

        # signal the actor threads to stop, then wait for them to finish
        self.end_training[0] = True
        for actor in actors:
            actor.join()
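
# Hypothetical wiring sketch (not part of the original example): build ApexDQN
# with a small Keras Q-network and a classic-gym environment class. The Actor
# class and plotter module referenced above are assumed to come from the same
# project; the network layout and environment are illustrative only.
import gym
import tensorflow as tf

def build_network(state_shape=(4,), action_size=2):
    # simple fully connected Q-network: one output per discrete action
    return tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation="relu", input_shape=state_shape),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(action_size),
    ])

if __name__ == "__main__":
    learner = ApexDQN(
        state_shape=(4,),
        action_size=2,
        network_builder=build_network,
        env_class=lambda: gym.make("CartPole-v1"),
        actor_num=4,
    )
    learner.main(train_until=10000, plot_every=500)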