def __init__(self, env_name, gamma=0.98):
    self.env_name = env_name
    self.action_space = gym.make(self.env_name).action_space.n
    self.q_network = QNetwork(self.action_space)
    self.target_q_network = QNetwork(self.action_space)
    self.gamma = gamma
    self.optimizer = tf.keras.optimizers.Adam(lr=0.001)
    self.setup()
    def __init__(self, env_id: str, logdir: Path):

        self.env_id = env_id

        self.summary_writer = tf.summary.create_file_writer(
            str(logdir)) if logdir else None

        self.action_space = gym.make(self.env_id).action_space.shape[0]

        self.replay_buffer = ReplayBuffer(maxlen=10000)

        self.policy = GaussianPolicyNetwork(action_space=self.action_space)
        self.target_policy = GaussianPolicyNetwork(
            action_space=self.action_space)

        self.critic = QNetwork()
        self.target_critic = QNetwork()

        self.log_temperature = tf.Variable(1.)

        self.log_alpha_mu = tf.Variable(1.)
        self.log_alpha_sigma = tf.Variable(1.)

        self.eps = 0.1

        self.eps_mu = 0.01
        self.eps_sigma = 0.001

        self.policy_optimizer = tf.keras.optimizers.Adam(lr=0.0005)
        self.critic_optimizer = tf.keras.optimizers.Adam(lr=0.0005)
        self.temperature_optimizer = tf.keras.optimizers.Adam(lr=0.0005)
        self.alpha_optimizer = tf.keras.optimizers.Adam(lr=0.0005)

        self.batch_size = 128

        self.n_samples = 10

        self.update_period = 4

        self.gamma = 0.99

        self.target_policy_update_period = 400

        self.target_critic_update_period = 400

        self.global_steps = 0

        self.episode_count = 0

        self.setup()
Example #3
  def __init__(self, game, agentsTypes, agent_index, parameters=None, render=False, use_replay=False,
               deep=0, monitor=False):

    # Create an instance of the network itself, as well as the memory.
    # Here is also a good place to set environmental parameters,
    # as well as training parameters - number of episodes / iterations, etc.

    self.gamma = 0.99
    self.RLalpha = 0.1
    self.SLalpha = 0.005

    self.RLBufferSize = 1000
    self.SLBufferSize = 50000

    self.epsilon_initial = 0.5
    self.epsilon = self.epsilon_initial

    self.episodes = 1000000
    self.env = game.env
    self.agentsTypes = agentsTypes
    self.agent_index = agent_index
    self.state_size = self.env.state_size
    self.action_size = self.env.action_size
    self.eta = 0.1

    self.deep = deep

    self.policynet = PNetwork(self.env, self, deep=deep)
    self.valuenet = QNetwork(self.env, self, deep=deep)


    self.target_update_period = 100

    self.network_update_period = 128
    self.network_updates = 2

    self.iteration = 0

    self.brp = True
    self.sigma = self.brp_action

    self.replayRL = IS_Replay_Memory(game, agentsTypes, self.agent_index,
        memory_size=self.RLBufferSize)

    #self.replayRL = Prioritized_Replay_Memory(game, memory_size=self.RLBufferSize)

    self.replaySL = Replay_Memory(game, memory_size=self.SLBufferSize,
                                  kind=replay.RESERVOIR)
    def __init__(self, pid, env_name, epsilon, gamma=0.98):

        self.pid = pid
        self.env_name = env_name
        self.env = gym.make(self.env_name)
        self.action_space = self.env.action_space.n

        self.q_network = QNetwork(self.action_space)
        self.epsilon = epsilon
        self.gamma = gamma
        self.buffer = []

        self.state = self.env.reset()
        self.setup()

        self.episode_rewards = 0
Example #5
	def __init__(self, env, render=False,model_type=None,save_folder=None):

		self.net=QNetwork(env,model_type=model_type)
		self.obs_space=env.observation_space.shape[0]
		self.ac_space=env.action_space.n
		self.render=render
		######################Hyperparameters###########################
		self.env=env
		self.epsilon=0.7
		self.epsilon_min=0.05
		self.epsilon_decay=0.999
		self.gamma=0.99
		self.max_itr=1000000
		self.batch_size=32
		self.max_reward=160 #Used for saving a model with a reward above a certain threshold
		self.memory_queue=Replay_Memory(memory_size=50000, burn_in=30000)
		###############################################################
		self.avg_rew_buffer=10
		self.avg_rew_queue=deque(maxlen=self.avg_rew_buffer)
		self.model_save=50
		self.test_model_interval=50
		self.save_folder=save_folder
Example #6
    def __init__(self, observation_space, action_space, args):
        """
        Constructor
        :param observation_space: observation space of the environment
        :param action_space: action space of the environment
        :param args: command line args to set hyperparameters
        """

        # set hyperparameters
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.gamma = args.gamma
        self.state_dim = observation_space.shape[0]
        self.action_dim = action_space.shape[0]
        self.hidden_dim = args.hidden_units
        self.tau = args.tau
        self.lr = args.lr
        self.target_update_interval = args.target_update_interval

        # build and initialize networks
        self.q_net_1 = QNetwork(self.state_dim, self.action_dim,
                                self.hidden_dim).to(self.device)
        self.q_net_2 = QNetwork(self.state_dim, self.action_dim,
                                self.hidden_dim).to(self.device)
        self.target_q_net_1 = QNetwork(self.state_dim, self.action_dim,
                                       self.hidden_dim).to(self.device)
        self.target_q_net_2 = QNetwork(self.state_dim, self.action_dim,
                                       self.hidden_dim).to(self.device)
        hard_update(self.q_net_1, self.target_q_net_1)
        hard_update(self.q_net_2, self.target_q_net_2)
        self.policy_net = PolicyNetwork(self.state_dim, self.action_dim,
                                        self.hidden_dim,
                                        self.device).to(self.device)

        # build criterions and optimizers
        self.q1_criterion = nn.MSELoss()
        self.q2_criterion = nn.MSELoss()
        self.q1_optim = optim.Adam(self.q_net_1.parameters(), lr=self.lr)
        self.q2_optim = optim.Adam(self.q_net_2.parameters(), lr=self.lr)
        self.policy_optim = optim.Adam(self.policy_net.parameters(),
                                       lr=self.lr)

        # for optimizing alpha (see Haarnoja et al., Section 5)
        if args.initial_alpha is not None:
            self.alpha = torch.tensor(args.initial_alpha,
                                      requires_grad=True,
                                      device=self.device,
                                      dtype=torch.float)
        else:
            self.alpha = torch.rand(1,
                                    requires_grad=True,
                                    device=self.device,
                                    dtype=torch.float)

        if args.entropy_target is not None:
            self.entropy_target = torch.tensor(args.entropy_target,
                                               device=self.device,
                                               dtype=torch.float)
        else:
            self.entropy_target = -1. * torch.tensor(
                action_space.shape, device=self.device, dtype=torch.float)

        self.alpha_optim = optim.Adam([self.alpha], lr=self.lr)
Example #7
class SAC:
    """
    A class used to represent a SAC agent

    Attributes
    ----------
    device : cuda or cpu
        the device on which all the computation occurs
    gamma : float[0,1]
        discount factor
    state_dim : int
        dimension of the environment observation space
    action_dim : int
        dimension of the environment action space
    hidden_dim : int
        dimension of the hidden layers of the networks
    tau : float[0,1]
        coefficient of soft update of target networks
    lr : float
        learning rate of the optimizers
    target_update_interval : int
        number of updates in between soft updates of target networks
    q_net_1 : QNetwork
        soft Q value network 1
    q_net_2 : QNetwork
        soft Q value network 2
    target_q_net_1 : QNetwork
        target Q value network 1
    target_q_net_2 : QNetwork
        target Q value network 2
    policy_net : PolicyNetwork
        policy network
    q1_criterion : nn.MSELoss
        torch optimization criterion for q_net_1
    q2_criterion : nn.MSELoss
        torch optimization criterion for q_net_2
    q1_optim : optim.Adam
        torch optimizer for q_net_1
    q2_optim : optim.Adam
        torch optimizer for q_net_2
    policy_optim : optim.Adam
        torch optimizer for policy_net
    alpha : torch float scalar
        entropy temperature (controls policy stochasticity)
    entropy_target : torch float scalar
        entropy target for the environment (see Haarnoja et al. Section 5)

    Methods
    -------
    update(replay_buffer, batch_size, updates) : q1_loss, q2_loss, policy_loss, alpha_loss
        Performs one gradient step of the algorithm, optimizing the Q networks, the policy network, and alpha

    choose_action(state) : action
        Returns the action selected in the given state by the current policy

    save_networks_parameters(params_dir)
        Saves the relevant parameters (q1_net's, q2_net's, policy_net's, alpha) from the networks

    load_networks_parameters(params_dir)
        Loads the relevant parameters (q1_net's, q2_net's, policy_net's, alpha) into the networks

    """
    def __init__(self, observation_space, action_space, args):
        """
        Constructor
        :param observation_space: observation space of the environment
        :param action_space: action space of the environment
        :param args: command line args to set hyperparameters
        """

        # set hyperparameters
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.gamma = args.gamma
        self.state_dim = observation_space.shape[0]
        self.action_dim = action_space.shape[0]
        self.hidden_dim = args.hidden_units
        self.tau = args.tau
        self.lr = args.lr
        self.target_update_interval = args.target_update_interval

        # build and initialize networks
        self.q_net_1 = QNetwork(self.state_dim, self.action_dim,
                                self.hidden_dim).to(self.device)
        self.q_net_2 = QNetwork(self.state_dim, self.action_dim,
                                self.hidden_dim).to(self.device)
        self.target_q_net_1 = QNetwork(self.state_dim, self.action_dim,
                                       self.hidden_dim).to(self.device)
        self.target_q_net_2 = QNetwork(self.state_dim, self.action_dim,
                                       self.hidden_dim).to(self.device)
        hard_update(self.q_net_1, self.target_q_net_1)
        hard_update(self.q_net_2, self.target_q_net_2)
        self.policy_net = PolicyNetwork(self.state_dim, self.action_dim,
                                        self.hidden_dim,
                                        self.device).to(self.device)

        # build criterions and optimizers
        self.q1_criterion = nn.MSELoss()
        self.q2_criterion = nn.MSELoss()
        self.q1_optim = optim.Adam(self.q_net_1.parameters(), lr=self.lr)
        self.q2_optim = optim.Adam(self.q_net_2.parameters(), lr=self.lr)
        self.policy_optim = optim.Adam(self.policy_net.parameters(),
                                       lr=self.lr)

        # for optimizing alpha (see Haarnoja et al., Section 5)
        if args.initial_alpha is not None:
            self.alpha = torch.tensor(args.initial_alpha,
                                      requires_grad=True,
                                      device=self.device,
                                      dtype=torch.float)
        else:
            self.alpha = torch.rand(1,
                                    requires_grad=True,
                                    device=self.device,
                                    dtype=torch.float)

        if args.entropy_target is not None:
            self.entropy_target = torch.tensor(args.entropy_target,
                                               device=self.device,
                                               dtype=torch.float)
        else:
            self.entropy_target = -1. * torch.tensor(
                action_space.shape, device=self.device, dtype=torch.float)

        self.alpha_optim = optim.Adam([self.alpha], lr=self.lr)

    def update(self, replay_buffer, batch_size, updates):
        """
        Performs one gradient step of the algorithm, optimizing the Q networks, the policy network, and alpha
        :param replay_buffer: replay buffer to sample batches of transitions from
        :param batch_size: size of the batches
        :param updates: number of updates so far
        :return: losses of the four optimizers (q1_optim, q2_optim, policy_optim, alpha_optim)
        :rtype: tuple of torch scalar floats
        """

        # sample a transition batch from the replay buffer and cast it to tensors of the correct shape
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = replay_buffer.sample(
            batch_size)
        state_batch = torch.from_numpy(state_batch).to(self.device,
                                                       dtype=torch.float)
        next_state_batch = torch.from_numpy(next_state_batch).to(
            self.device, dtype=torch.float)
        action_batch = torch.from_numpy(action_batch).to(self.device,
                                                         dtype=torch.float)
        reward_batch = torch.from_numpy(reward_batch).unsqueeze(1).to(
            self.device, dtype=torch.float)
        done_batch = torch.from_numpy(np.float32(done_batch)).unsqueeze(1).to(
            self.device, dtype=torch.float)

        # sample actions from the policy to be used in the expectation updates
        sampled_action, log_prob, epsilon, mean, log_std = self.policy_net.sample(
            state_batch)

        ### evaluation step
        target_next_value = torch.min(
            self.target_q_net_1(next_state_batch, sampled_action),
            self.target_q_net_2(next_state_batch,
                                sampled_action)) - self.alpha * log_prob

        current_q_value_1 = self.q_net_1(state_batch, action_batch)
        current_q_value_2 = self.q_net_2(state_batch, action_batch)

        expected_next_value = reward_batch + (
            1 - done_batch) * self.gamma * target_next_value
        q1_loss = self.q1_criterion(current_q_value_1,
                                    expected_next_value.detach())
        q2_loss = self.q2_criterion(current_q_value_2,
                                    expected_next_value.detach())

        # optimize q1 and q2 nets
        self.q1_optim.zero_grad()
        q1_loss.backward()
        self.q1_optim.step()
        self.q2_optim.zero_grad()
        q2_loss.backward()
        self.q2_optim.step()

        ### improvement step
        sampled_q_value = torch.min(self.q_net_1(state_batch, sampled_action),
                                    self.q_net_2(state_batch, sampled_action))
        policy_loss = (self.alpha * log_prob - sampled_q_value).mean()

        # optimize policy net
        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        # optimize alpha
        alpha_loss = (self.alpha *
                      (-log_prob - self.entropy_target).detach()).mean()

        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()

        # update Q target value
        if updates % self.target_update_interval == 0:
            soft_update(self.q_net_1, self.target_q_net_1, self.tau)
            soft_update(self.q_net_2, self.target_q_net_2, self.tau)

        return (q1_loss.item(), q2_loss.item(), policy_loss.item(),
                alpha_loss.item())

    def choose_action(self, state):
        """
        Returns the action selected in the given state by the current policy
        :param state: state
        :return: action
        :rtype numpy float array
        """

        action = self.policy_net.get_action(state)
        # move to cpu, remove from gradient graph, cast to numpy
        return action.cpu().detach().numpy()

    def save_networks_parameters(self, params_dir=None):
        """
        Saves the relevant parameters (q1_net's, q2_net's, policy_net's, alpha) from the networks
        :param params_dir: directory where to save parameters to (optional)
        :return: the base directory that parameters were saved under
        """
        if params_dir is None:
            params_dir = "SavedAgents/"

        # create a subfolder with current timestamp
        prefix = os.path.join(
            params_dir,
            datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
        if not os.path.exists(prefix):
            os.makedirs(prefix)

        policy_path = os.path.join(prefix, "policy_net_params")
        q1_path = os.path.join(prefix, "q1_net_params")
        q2_path = os.path.join(prefix, "q2_net_params")
        alpha_path = os.path.join(prefix, "alpha_param")

        print("Saving parameters to {}, {}, {}".format(q1_path, q2_path,
                                                       policy_path))

        torch.save(self.q_net_1.state_dict(), q1_path)
        torch.save(self.q_net_2.state_dict(), q2_path)
        torch.save(self.policy_net.state_dict(), policy_path)
        torch.save(self.alpha, alpha_path)

        return params_dir

    def load_networks_parameters(self, params_dir):
        """
        Loads the relevant parameters (q1_net's, q2_net's, policy_net's, alpha) into the networks
        :param params_dir: directory where to load parameters from
        :return: None
        """
        if params_dir is not None:
            print("Loading parameters from {}".format(params_dir))

            policy_path = os.path.join(params_dir, "policy_net_params")
            self.policy_net.load_state_dict(torch.load(policy_path))

            q1_path = os.path.join(params_dir, "q1_net_params")
            q2_path = os.path.join(params_dir, "q2_net_params")
            self.q_net_1.load_state_dict(torch.load(q1_path))
            self.q_net_2.load_state_dict(torch.load(q2_path))

            alpha_path = os.path.join(params_dir, "alpha_param")
            self.alpha = torch.load(alpha_path)
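# The hard_update / soft_update helpers called in the constructor and in
# update() are not shown on this page.  The snippet below is a minimal sketch
# of the usual implementations, assuming the call order seen above
# (hard_update(online_net, target_net)) means "copy the first network's
# weights into the second"; if the original helpers use the reversed argument
# order, swap the parameters accordingly.


def hard_update(online, target):
    # Copy the online network's parameters into the target network verbatim.
    for o_param, t_param in zip(online.parameters(), target.parameters()):
        t_param.data.copy_(o_param.data)


def soft_update(online, target, tau):
    # Polyak averaging: target <- tau * online + (1 - tau) * target.
    for o_param, t_param in zip(online.parameters(), target.parameters()):
        t_param.data.copy_(tau * o_param.data + (1.0 - tau) * t_param.data)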
Example #8
class DQN_Agent():

  # In this class, we will implement functions to do the following.
  # (1) Create an instance of the Q Network class.
  # (2) Create a function that constructs a policy from the Q values predicted by the Q Network.
  #     (a) Epsilon Greedy Policy.
  #     (b) Greedy Policy.
  # (3) Create a function to train the Q Network, by interacting with the environment.
  # (4) Create a function to test the Q Network's performance on the environment.
  # (5) Create a function for Experience Replay.

  def __init__(self, game, agentsTypes, agent_index, parameters=None, render=False, use_replay=False,
               deep=0, monitor=False):

    # Create an instance of the network itself, as well as the memory.
    # Here is also a good place to set environmental parameters,
    # as well as training parameters - number of episodes / iterations, etc.

    self.gamma = 0.99
    self.RLalpha = 0.1
    self.SLalpha = 0.005

    self.RLBufferSize = 1000
    self.SLBufferSize = 50000

    self.epsilon_initial = 0.5
    self.epsilon = self.epsilon_initial

    self.episodes = 1000000
    self.env = game.env
    self.agentsTypes = agentsTypes
    self.agent_index = agent_index
    self.state_size = self.env.state_size
    self.action_size = self.env.action_size
    self.eta = 0.1

    self.deep = deep

    self.policynet = PNetwork(self.env, self, deep=deep)
    self.valuenet = QNetwork(self.env, self, deep=deep)


    self.target_update_period = 100

    self.network_update_period = 128
    self.network_updates = 2

    self.iteration = 0

    self.brp = True
    self.sigma = self.brp_action

    self.replayRL = IS_Replay_Memory(game, agentsTypes, self.agent_index,
        memory_size=self.RLBufferSize)

    #self.replayRL = Prioritized_Replay_Memory(game, memory_size=self.RLBufferSize)

    self.replaySL = Replay_Memory(game, memory_size=self.SLBufferSize,
                                  kind=replay.RESERVOIR)

  # q_values: State * Action -> Value
  def brp_action(self, state):
    if random.random() < self.epsilon:
      action = random.randint(0, self.action_size - 1)
      return action
    else:
      best_action, _ = self.valuenet.best_action(state)
      return best_action

  # average-policy action (greedy w.r.t. the supervised policy network)
  def average_policy_action(self, state):
    best_action = self.policynet.best_action(state)
    return best_action

  def resetepisode(self, average_only=False):
    if not average_only and random.random() < self.eta:
      self.brp = True
      self.sigma = self.brp_action
    else:
      self.brp = False
      self.sigma = self.average_policy_action

  def act(self, state):
    action = self.sigma(state)
    return action

  def updatereplay(self, state, action, reward, next_state, done, actionset, stateset):
    # See paper for recommended epsilon decay
    self.epsilon = self.epsilon_initial / math.ceil(math.sqrt((self.iteration + 1)  / 10000))

    self.iteration += 1
    if self.brp:
      action_onehot = [0 for _ in range(self.action_size)]
      action_onehot[action] = 1
      self.replaySL.append([state, action_onehot])
    else:
      # If we are using Importance Sampling Experience Replay,
      # the last two fields are the likelihood at the time the item was added
      # and the current opponent likelihood

      # These fields should be ignored for Normal Exp Replay
      self.replayRL.append([state, action, reward, next_state, done, actionset, stateset])

    if self.iteration % self.network_update_period == 0:
      for i in range(self.network_updates):
        batch = self.network_update_period

        # replayRLbatch = self.replayRL.sample_batch(self,batch_size=batch)
        replayRLbatch, batch_importance_weights = self.replayRL.sample_batch(batch)

        # should use MSE loss; the value network outputs action_size results
        self.valuenet.update_batch(batch, replayRLbatch, batch_importance_weights)

        if self.replaySL.size >= 1000:
          replaySLbatch = self.replaySL.sample_batch(batch)
          # should use crossentropy loss, softmax activation
          self.policynet.update_batch(batch, replaySLbatch)

    if self.iteration % self.target_update_period == 0:
      self.valuenet.update_target()

  def appendreplay(self, state, action, reward, next_state, done, actionset, stateset):
    self.replayRL.append([state, action, reward, next_state, done, actionset, stateset])

  # Not Essential
  def surveySLMemory(self):
    freqs = [0 for i in range(self.action_size)]
    for item in self.replaySL.cache[:self.replaySL.size]:
      for j in range(self.action_size):
        if item[1][j] == 1:
          freqs[j] += 1

    print(freqs)

  # Not Essential
  def surveyRLMemory(self):
    for item in self.replayRL.cache[:self.replayRL.size]:
      print(item)
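# The supervised-learning buffer above is built with kind=replay.RESERVOIR,
# whose implementation is not shown on this page.  Below is a minimal,
# hypothetical sketch of reservoir sampling (the class and method names are
# illustrative, not the original Replay_Memory API): it keeps a uniformly
# random subset of an unbounded stream of (state, action_onehot) pairs in a
# fixed-size cache.

import random


class ReservoirBuffer:

  def __init__(self, memory_size):
    self.memory_size = memory_size
    self.cache = []
    self.seen = 0  # number of items offered so far

  def append(self, item):
    self.seen += 1
    if len(self.cache) < self.memory_size:
      self.cache.append(item)
    else:
      # Keep the new item with probability memory_size / seen, so every item
      # in the stream survives with equal probability.
      idx = random.randint(0, self.seen - 1)
      if idx < self.memory_size:
        self.cache[idx] = item

  def sample_batch(self, batch_size):
    return random.sample(self.cache, min(batch_size, len(self.cache)))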
Example #9
class DQN_Agent():

	def __init__(self, env, render=False,model_type=None,save_folder=None):

		self.net=QNetwork(env,model_type=model_type)
		self.obs_space=env.observation_space.shape[0]
		self.ac_space=env.action_space.n
		self.render=render
		######################Hyperparameters###########################
		self.env=env
		self.epsilon=0.7
		self.epsilon_min=0.05
		self.epsilon_decay=0.999
		self.gamma=0.99
		self.max_itr=1000000
		self.batch_size=32
		self.max_reward=160 #Used for saving a model with a reward above a certain threshold
		self.memory_queue=Replay_Memory(memory_size=50000, burn_in=30000)
		###############################################################
		self.avg_rew_buffer=10
		self.avg_rew_queue=deque(maxlen=self.avg_rew_buffer)
		self.model_save=50
		self.test_model_interval=50
		self.save_folder=save_folder
		
	def epsilon_greedy_policy(self, q_values,epsi):
		# Creating epsilon greedy probabilities to sample from.             
		if random.uniform(0,1)<=epsi:
			return random.randint(0,self.ac_space-1) #Q-Values shape is batch_size x ac
		else:
			return np.argmax(q_values[0])

	def greedy_policy(self, q_values):
		# Creating greedy policy for test time. 
		return np.argmax(q_values[0]) 

	def train(self):
		testX,testY=[],[]
		batch_size,max_,avg_rew_test,itr=self.batch_size,0,0,0

		print("Using Experience Replay")
		#Burn In 
		self.burn_in_memory()

		if(self.save_folder!=None):
			self.env=Monitor(self.env, self.save_folder,video_callable=lambda episode_id:episode_id%500==0,force=True)		

		for epi in range(EPISODES):
			state=np.reshape(self.env.reset(),[1,self.obs_space])#Reset the state
			total_rew=0
			
			while True:
				itr+=1
				if(self.render):
					self.env.render()
				#get action by e-greedy
				ac=self.epsilon_greedy_policy(self.net.model.predict(state),self.epsilon)	
				#Find out next state and rew for current action
				n_s,rew,is_t, _ = self.env.step(ac) 
				
				#Append to queue
				n_s=np.reshape(n_s,[1,self.obs_space])
				self.memory_queue.append([state,ac,rew,is_t,n_s])
				#Get samples of size batch_size
				batch=self.memory_queue.sample_batch(batch_size=batch_size)

				#Create array of states and next states
				batch_states=np.zeros((len(batch),self.obs_space))
				batch_next_states=np.zeros((len(batch),self.obs_space))
				actions,rewards,terminals=[],[],[]

				for i in range(0,len(batch)):
					b_state, b_ac, b_rew, b_is_t, b_ns=batch[i] #Returns already reshaped b_state and b_ns
					batch_states[i]=b_state
					batch_next_states[i]=b_ns
					actions.append(b_ac)
					rewards.append(b_rew)
					terminals.append(b_is_t)

				#Get Predictions
				batch_q_values=self.net.model.predict(batch_states)
				batch_next_q_values=self.net.model.predict(batch_next_states)
				
				for i in range(0,len(batch)):
					if terminals[i]: #Corresponds to is_terminal in sampled batch
						batch_q_values[i][actions[i]]=rewards[i]

					else:
						#If not terminal, bootstrap from the max next-state Q-value
						batch_q_values[i][actions[i]]=rewards[i]+self.gamma*(np.amax(batch_next_q_values[i]))
				#Perform one step of SGD
				self.net.model.fit(batch_states,batch_q_values,batch_size=batch_size,epochs=1,verbose=0)
				self.epsilon*=self.epsilon_decay
				self.epsilon=max(self.epsilon,self.epsilon_min)
				total_rew+=rew
				state=n_s
				
				if is_t:
					break
			
			#test model at intervals
			if((epi+1)%self.test_model_interval==0):
				testX.append(epi)
				avg_rew_test=self.test()
				testY.append(avg_rew_test)

			#Remove and add rewards to calculate avg reward
			if(len(self.avg_rew_queue)>self.avg_rew_buffer):
				self.avg_rew_queue.popleft()
			self.avg_rew_queue.append(total_rew)
			avg_rew=sum(self.avg_rew_queue)/len(self.avg_rew_queue)
			
			######################SAVING SECTION###############################
			#Save at intervals
			#if((epi+1)%self.model_save==0):
			#	self.net.model.save('CartPole_linearwExpReplay_{}.h5'.format(epi))
			if max_<avg_rew_test and avg_rew_test>self.max_reward:
				#self.net.model.save('CartPole_linear_comp_8.h5')
				max_=avg_rew_test
			######################################################################
			print(epi,itr,avg_rew,total_rew)

		plot_eval(testX,testY) #Plotting after episodes are done
			

	def test(self, model_file=None):
		test_episodes=20
		rewards=[]
		if(model_file!=None):
			self.net.load_model(model_file)
		for e in range(test_episodes):
			state = np.reshape(self.env.reset(),[1,self.obs_space])
			time_steps = 0
			total_reward_per_episode = 0
			while True:
				if(self.render):
					self.env.render()
				action = self.epsilon_greedy_policy(self.net.model.predict(state),0.05)
				next_state, reward, is_t, _ = self.env.step(action)
				next_state=np.reshape(next_state,[1,self.obs_space])
				state = next_state
				total_reward_per_episode+=reward
				time_steps+=1
				if is_t:
					break
			rewards.append(total_reward_per_episode)
			print("Total Reward for Episode {} is {}".format(e,total_reward_per_episode))
		
		avg_rewards_=np.mean(np.array(rewards))
		std_dev=np.std(rewards)
		print("AvgRew={},Std={}".format(avg_rewards_,std_dev))
		return avg_rewards_

	def burn_in_memory(self):
		# Initialize replay memory with a burn_in number of episodes / transitions. 
		memory_size=0
		state=np.reshape(self.env.reset(),[1,self.obs_space])
		
		while(memory_size<self.memory_queue.burn_in):
			ac=random.randint(0,self.ac_space-1)
			n_s,rew,is_t,_=self.env.step(ac)
			n_s=np.reshape(n_s,[1,self.obs_space])
			
			transition=[state,ac,rew,is_t,n_s]
			self.memory_queue.append(transition)
			state=n_s
			if is_t:
				state=np.reshape(self.env.reset(),[1,self.obs_space])
			memory_size+=1

		print("Burned Memory Queue")
class MPOAgent:
    def __init__(self, env_id: str, logdir: Path):

        self.env_id = env_id

        self.summary_writer = tf.summary.create_file_writer(
            str(logdir)) if logdir else None

        self.action_space = gym.make(self.env_id).action_space.shape[0]

        self.replay_buffer = ReplayBuffer(maxlen=10000)

        self.policy = GaussianPolicyNetwork(action_space=self.action_space)
        self.target_policy = GaussianPolicyNetwork(
            action_space=self.action_space)

        self.critic = QNetwork()
        self.target_critic = QNetwork()

        self.log_temperature = tf.Variable(1.)

        self.log_alpha_mu = tf.Variable(1.)
        self.log_alpha_sigma = tf.Variable(1.)

        self.eps = 0.1

        self.eps_mu = 0.01
        self.eps_sigma = 0.001

        self.policy_optimizer = tf.keras.optimizers.Adam(lr=0.0005)
        self.critic_optimizer = tf.keras.optimizers.Adam(lr=0.0005)
        self.temperature_optimizer = tf.keras.optimizers.Adam(lr=0.0005)
        self.alpha_optimizer = tf.keras.optimizers.Adam(lr=0.0005)

        self.batch_size = 128

        self.n_samples = 10

        self.update_period = 4

        self.gamma = 0.99

        self.target_policy_update_period = 400

        self.target_critic_update_period = 400

        self.global_steps = 0

        self.episode_count = 0

        self.setup()

    def setup(self):
        """ Initialize network weights """

        env = gym.make(self.env_id)

        dummy_state = env.reset()
        dummy_state = (dummy_state[np.newaxis, ...]).astype(np.float32)

        dummy_action = np.random.normal(0, 0.1, size=self.action_space)
        dummy_action = (dummy_action[np.newaxis, ...]).astype(np.float32)

        self.policy(dummy_state)
        self.target_policy(dummy_state)

        self.critic(dummy_state, dummy_action)
        self.target_critic(dummy_state, dummy_action)

        self.target_policy.set_weights(self.policy.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())

    def save(self, save_dir):
        save_dir = Path(save_dir)

        self.policy.save_weights(str(save_dir / "policy"))
        self.critic.save_weights(str(save_dir / "critic"))

    def load(self, load_dir=None):
        load_dir = Path(load_dir)

        self.policy.load_weights(str(load_dir / "policy"))
        self.target_policy.load_weights(str(load_dir / "policy"))

        self.critic.load_weights(str(load_dir / "critic"))
        self.target_critic.load_weights(str(load_dir / "critic"))

    def rollout(self):

        episode_rewards, episode_steps = 0, 0

        done = False

        env = gym.make(self.env_id)

        state = env.reset()

        while not done:

            action = self.policy.sample_action(np.atleast_2d(state))

            action = action.numpy()[0]

            try:
                next_state, reward, done, _ = env.step(action)
            except Exception as err:
                print(err)
                import pdb
                pdb.set_trace()

            #: BipedalWalker's fall penalty of -100 is too large, so clip the reward
            transition = Transition(state, action, np.clip(reward, -1., 1.),
                                    next_state, done)

            self.replay_buffer.add(transition)

            state = next_state

            episode_rewards += reward

            episode_steps += 1

            self.global_steps += 1

            if (len(self.replay_buffer) >= 5000
                    and self.global_steps % self.update_period == 0):
                self.update_networks()

            if self.global_steps % self.target_critic_update_period == 0:
                self.target_critic.set_weights(self.critic.get_weights())

            if self.global_steps % self.target_policy_update_period == 0:
                self.target_policy.set_weights(self.policy.get_weights())

        self.episode_count += 1
        with self.summary_writer.as_default():
            tf.summary.scalar("episode_reward_stp",
                              episode_rewards,
                              step=self.global_steps)
            tf.summary.scalar("episode_steps_stp",
                              episode_steps,
                              step=self.global_steps)
            tf.summary.scalar("episode_reward",
                              episode_rewards,
                              step=self.episode_count)
            tf.summary.scalar("episode_steps",
                              episode_steps,
                              step=self.episode_count)

        return episode_rewards, episode_steps

    def update_networks(self):

        (states, actions, rewards, next_states,
         dones) = self.replay_buffer.get_minibatch(batch_size=self.batch_size)

        B, M = self.batch_size, self.n_samples

        # [B, obs_dim] -> [B, obs_dim * M] -> [B * M, obs_dim]
        next_states_tiled = tf.reshape(tf.tile(next_states, multiples=(1, M)),
                                       shape=(B * M, -1))

        target_mu, target_sigma = self.target_policy(next_states_tiled)

        # For MultivariateGaussianPolicy
        #target_dist = tfd.MultivariateNormalFullCovariance(loc=target_mu, covariance_matrix=target_sigma)

        # For IndependentGaussianPolicy
        target_dist = tfd.Independent(tfd.Normal(loc=target_mu,
                                                 scale=target_sigma),
                                      reinterpreted_batch_ndims=1)

        sampled_actions = target_dist.sample()  # [B * M,  action_dim]
        #sampled_actions = tf.clip_by_value(sampled_actions, -1.0, 1.0)

        # Update Q-network:
        sampled_qvalues = tf.reshape(self.target_critic(
            next_states_tiled, sampled_actions),
                                     shape=(B, M, -1))
        mean_qvalues = tf.reduce_mean(sampled_qvalues, axis=1)
        TQ = rewards + self.gamma * (1.0 - dones) * mean_qvalues

        with tf.GradientTape() as tape1:
            Q = self.critic(states, actions)
            loss_critic = tf.reduce_mean(tf.square(TQ - Q))

        variables = self.critic.trainable_variables
        grads = tape1.gradient(loss_critic, variables)
        grads, _ = tf.clip_by_global_norm(grads, 40.)
        self.critic_optimizer.apply_gradients(zip(grads, variables))

        # E-step:
        # Obtain η* by minimising g(η)
        with tf.GradientTape() as tape2:
            temperature = tf.math.softplus(self.log_temperature)
            q_logsumexp = tf.math.reduce_logsumexp(sampled_qvalues /
                                                   temperature,
                                                   axis=1)
            loss_temperature = temperature * (
                self.eps + tf.reduce_mean(q_logsumexp, axis=0))

        grad = tape2.gradient(loss_temperature, self.log_temperature)
        if tf.math.is_nan(grad).numpy().sum() != 0:
            print("NAN GRAD in TEMPERATURE !!!!!!!!!")
            import pdb
            pdb.set_trace()
        else:
            self.temperature_optimizer.apply_gradients([
                (grad, self.log_temperature)
            ])

        # Obtain sample-based variational distribution q(a|s)
        temperature = tf.math.softplus(self.log_temperature)

        # M-step: Optimize the lower bound J with respect to θ
        weights = tf.squeeze(tf.math.softmax(sampled_qvalues / temperature,
                                             axis=1),
                             axis=2)  # [B, M, 1] -> [B, M]

        if tf.math.is_nan(weights).numpy().sum() != 0:
            print("NAN in weights !!!!!!!!!")
            import pdb
            pdb.set_trace()

        with tf.GradientTape(persistent=True) as tape3:

            online_mu, online_sigma = self.policy(next_states_tiled)

            # For MultivariateGaussianPolicy
            #online_dist = tfd.MultivariateNormalFullCovariance(loc=online_mu, covariance_matrix=online_sigma)

            # For IndependentGaussianPolicy
            online_dist = tfd.Independent(tfd.Normal(loc=online_mu,
                                                     scale=online_sigma),
                                          reinterpreted_batch_ndims=1)

            log_probs = tf.reshape(online_dist.log_prob(sampled_actions) +
                                   1e-6,
                                   shape=(B, M))  # [B * M, ] -> [B, M]

            cross_entropy_qp = tf.reduce_sum(weights * log_probs,
                                             axis=1)  # [B, M] -> [B,]

            # For MultivariateGaussianPolicy
            # online_dist_fixedmu = tfd.MultivariateNormalFullCovariance(loc=target_mu, covariance_matrix=online_sigma)
            # online_dist_fixedsigma = tfd.MultivariateNormalFullCovariance(loc=online_mu, covariance_matrix=target_sigma)

            # For IndependentGaussianPolicy
            online_dist_fixedmu = tfd.Independent(tfd.Normal(
                loc=target_mu, scale=online_sigma),
                                                  reinterpreted_batch_ndims=1)
            online_dist_fixedsigma = tfd.Independent(
                tfd.Normal(loc=online_mu, scale=target_sigma),
                reinterpreted_batch_ndims=1)

            kl_mu = tf.reshape(
                target_dist.kl_divergence(online_dist_fixedsigma),
                shape=(B, M))  # [B * M, ] -> [B, M]

            kl_sigma = tf.reshape(
                target_dist.kl_divergence(online_dist_fixedmu),
                shape=(B, M))  # [B * M, ] -> [B, M]

            alpha_mu = tf.math.softplus(self.log_alpha_mu)
            alpha_sigma = tf.math.softplus(self.log_alpha_sigma)

            loss_policy = -cross_entropy_qp  # [B,]
            loss_policy += tf.stop_gradient(alpha_mu) * tf.reduce_mean(kl_mu,
                                                                       axis=1)
            loss_policy += tf.stop_gradient(alpha_sigma) * tf.reduce_mean(
                kl_sigma, axis=1)

            loss_policy = tf.reduce_mean(loss_policy)  # [B,] -> [1]

            loss_alpha_mu = tf.reduce_mean(
                alpha_mu *
                tf.stop_gradient(self.eps_mu - tf.reduce_mean(kl_mu, axis=1)))

            loss_alpha_sigma = tf.reduce_mean(
                alpha_sigma *
                tf.stop_gradient(self.eps_sigma -
                                 tf.reduce_mean(kl_sigma, axis=1)))

            loss_alpha = loss_alpha_mu + loss_alpha_sigma

        variables = self.policy.trainable_variables
        grads = tape3.gradient(loss_policy, variables)
        grads, _ = tf.clip_by_global_norm(grads, 40.)
        self.policy_optimizer.apply_gradients(zip(grads, variables))

        variables = [self.log_alpha_mu, self.log_alpha_sigma]
        grads = tape3.gradient(loss_alpha, variables)
        grads, _ = tf.clip_by_global_norm(grads, 40.)
        self.alpha_optimizer.apply_gradients(zip(grads, variables))

        del tape3

        with self.summary_writer.as_default():
            tf.summary.scalar("loss_policy",
                              loss_policy,
                              step=self.global_steps)
            tf.summary.scalar("loss_critic",
                              loss_critic,
                              step=self.global_steps)
            tf.summary.scalar("sigma",
                              tf.reduce_mean(online_sigma),
                              step=self.global_steps)
            tf.summary.scalar("kl_mu",
                              tf.reduce_mean(kl_mu),
                              step=self.global_steps)
            tf.summary.scalar("kl_sigma",
                              tf.reduce_mean(kl_sigma),
                              step=self.global_steps)
            tf.summary.scalar("temperature",
                              temperature,
                              step=self.global_steps)
            tf.summary.scalar("alpha_mu", alpha_mu, step=self.global_steps)
            tf.summary.scalar("alpha_sigma",
                              alpha_sigma,
                              step=self.global_steps)
            tf.summary.scalar("replay_buffer",
                              len(self.replay_buffer),
                              step=self.global_steps)

    def testplay(self, name, monitor_dir):

        total_rewards = []

        env = wrappers.RecordVideo(gym.make(self.env_id),
                                   video_folder=monitor_dir,
                                   step_trigger=lambda i: True,
                                   name_prefix=name)

        state = env.reset()

        done = False

        total_reward = 0

        while not done:

            action = self.policy.sample_action(np.atleast_2d(state))

            action = action.numpy()[0]

            next_state, reward, done, _ = env.step(action)

            total_reward += reward

            state = next_state

        total_rewards.append(total_reward)

        print(f"{name}", total_reward)
class Learner:
    def __init__(self, env_name, gamma=0.98):
        self.env_name = env_name
        self.action_space = gym.make(self.env_name).action_space.n
        self.q_network = QNetwork(self.action_space)
        self.target_q_network = QNetwork(self.action_space)
        self.gamma = gamma
        self.optimizer = tf.keras.optimizers.Adam(lr=0.001)
        self.setup()

    def setup(self):
        env = gym.make(self.env_name)
        state = env.reset()
        self.q_network(np.atleast_2d(state))
        self.target_q_network(np.atleast_2d(state))
        self.target_q_network.set_weights(self.q_network.get_weights())

    def get_weights(self):
        current_weights = self.q_network.get_weights()
        return current_weights

    def update_network(self, minibatchs):

        indices_all = []
        td_errors_all = []
        losses = []

        for (indices, weights, transitions) in minibatchs:

            states, actions, rewards, next_states, dones = zip(*transitions)

            states = np.vstack(states)
            actions = np.array(actions)
            rewards = np.vstack(rewards)
            next_states = np.vstack(next_states)
            dones = np.vstack(dones)

            next_qvalues = self.q_network(next_states)
            next_actions = tf.cast(tf.argmax(next_qvalues, axis=1), tf.int32)
            next_actions_onehot = tf.one_hot(next_actions, self.action_space)
            next_maxQ = tf.reduce_sum(next_qvalues * next_actions_onehot,
                                      axis=1,
                                      keepdims=True)
            TQ = rewards + self.gamma * (1 - dones) * next_maxQ

            with tf.GradientTape() as tape:
                qvalues = self.q_network(states)
                actions_onehot = tf.one_hot(actions, self.action_space)
                Q = tf.reduce_sum(qvalues * actions_onehot,
                                  axis=1,
                                  keepdims=True)
                td_errors = tf.square(TQ - Q)
                loss = tf.reduce_mean(weights * td_errors)

            grads = tape.gradient(loss, self.q_network.trainable_variables)
            grads, _ = tf.clip_by_global_norm(grads, 40.0)
            self.optimizer.apply_gradients(
                zip(grads, self.q_network.trainable_variables))

            indices_all += indices
            td_errors_all += td_errors.numpy().flatten().tolist()
            losses.append(loss)

        loss_info = np.array(losses).mean()
        current_weights = self.q_network.get_weights()

        return current_weights, indices_all, td_errors_all, loss_info
class Actor:
    def __init__(self, pid, env_name, epsilon, gamma=0.98):

        self.pid = pid
        self.env_name = env_name
        self.env = gym.make(self.env_name)
        self.action_space = self.env.action_space.n

        self.q_network = QNetwork(self.action_space)
        self.epsilon = epsilon
        self.gamma = gamma
        self.buffer = []

        self.state = self.env.reset()
        self.setup()

        self.episode_rewards = 0

    def setup(self):
        env = gym.make(self.env_name)
        state = env.reset()
        self.q_network(np.atleast_2d(state))

    def rollout(self, current_weights):
        #: Sync weights with the global Q-network
        self.q_network.set_weights(current_weights)

        #: Roll out 100 steps
        for _ in range(100):
            state = self.state
            action = self.q_network.sample_action(state, self.epsilon)
            next_state, reward, done, _ = self.env.step(action)
            self.episode_rewards += reward
            transition = (state, action, reward, next_state, done)
            self.buffer.append(transition)

            if done:
                #print(self.episode_rewards)
                self.state = self.env.reset()
                self.episode_rewards = 0
            else:
                self.state = next_state

        #: Compute initial priorities
        states = np.vstack([transition[0] for transition in self.buffer])
        actions = np.array([transition[1] for transition in self.buffer])
        rewards = np.vstack([transition[2] for transition in self.buffer])
        next_states = np.vstack([transition[3] for transition in self.buffer])
        dones = np.vstack([transition[4] for transition in self.buffer])

        next_qvalues = self.q_network(next_states)
        next_actions = tf.cast(tf.argmax(next_qvalues, axis=1), tf.int32)
        next_actions_onehot = tf.one_hot(next_actions, self.action_space)
        next_maxQ = tf.reduce_sum(next_qvalues * next_actions_onehot,
                                  axis=1,
                                  keepdims=True)

        TQ = rewards + self.gamma * (1 - dones) * next_maxQ

        qvalues = self.q_network(states)
        actions_onehot = tf.one_hot(actions, self.action_space)
        Q = tf.reduce_sum(qvalues * actions_onehot, axis=1, keepdims=True)

        td_errors = (TQ - Q).numpy().flatten()
        transitions = self.buffer
        self.buffer = []

        return td_errors, transitions, self.pid

    def test_play(self, current_weights):

        self.q_network.set_weights(current_weights)

        env = gym.make(self.env_name)
        state = env.reset()
        episode_rewards = 0
        done = False
        while not done:
            action = self.q_network.sample_action(state, self.epsilon)
            next_state, reward, done, _ = env.step(action)
            episode_rewards += reward
            state = next_state

        return episode_rewards
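# The Learner / Actor pair above follows the Ape-X pattern: actors collect
# transitions and initial TD errors with per-actor exploration rates, a
# prioritized replay buffer stores them, and the learner trains on
# importance-weighted minibatches and broadcasts fresh weights.  The loop
# below is a minimal, single-process sketch of that wiring;
# PrioritizedReplayBuffer and its add / sample_minibatchs / update_priorities
# methods are hypothetical stand-ins, not shown on this page.

def run(env_name="CartPole-v0", num_actors=4, iterations=100):
    learner = Learner(env_name)
    current_weights = learner.get_weights()

    actors = [Actor(pid=i, env_name=env_name, epsilon=0.5 * 0.4 ** i)
              for i in range(num_actors)]

    buffer = PrioritizedReplayBuffer(capacity=2 ** 16)  # hypothetical

    for _ in range(iterations):
        for actor in actors:
            td_errors, transitions, pid = actor.rollout(current_weights)
            buffer.add(td_errors, transitions)                      # hypothetical

        minibatchs = buffer.sample_minibatchs(n=16, batch_size=32)  # hypothetical
        current_weights, indices, td_errors, loss = learner.update_network(minibatchs)
        buffer.update_priorities(indices, td_errors)                # hypothetical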
Example #13
                                        outputLength,
                                        args.hidden_layers_policy,
                                        policyActivations,
                                        nextObsPh,
                                        aPh,
                                        "Target",
                                        actionMeanScale=np.expand_dims(
                                            clip[1, :], 0),
                                        logStdInit=None,
                                        logStdTrainable=False,
                                        actionClip=clip)
 Q1 = QNetwork(sess,
               inputLength,
               outputLength,
               args.hidden_layers_q,
               qActivations,
               obsPh,
               aPh,
               hiddenLayerMergeWithAction,
               suffix="Orig1")  # original Q network 1
 Q2 = QNetwork(sess,
               inputLength,
               outputLength,
               args.hidden_layers_q,
               qActivations,
               obsPh,
               aPh,
               hiddenLayerMergeWithAction,
               suffix="Orig2")  # original Q network 2
 QAux1 = QNetwork(
     sess,