Example #1
    def __init__(self, env, state_dim=None):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # along with their target networks
        if state_dim:
            self.state_dim = state_dim
            print(self.state_dim)
        else:
            self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        # Flag to signal save
        self.not_saved = True

        # For normalisation
        self.state_mean = 0
        self.state_std = 1
        self.target_mean = 0
        self.target_std = 1
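
Every example above instantiates an OUNoise helper that is not shown. Below is a minimal sketch of an Ornstein-Uhlenbeck process exposing the noise()/reset() interface used here; the default parameters (mu, theta, sigma) are illustrative assumptions, not values from the original project.

import numpy as np

class OUNoise:
    """Temporally correlated exploration noise (Ornstein-Uhlenbeck process)."""
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu          # long-run mean the process reverts to (assumed default)
        self.theta = theta    # mean-reversion rate (assumed default)
        self.sigma = sigma    # scale of the random perturbation (assumed default)
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        # Re-centre the process at the start of each episode
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1)
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state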
Example #2
 def __init__(self, target_model=None, train=True, replaybuffer=None):
   assert isinstance(target_model, DDPG) or target_model is None
   config = tf.ConfigProto()
   sess = tf.Session(config=config)
   K.set_session(sess)
   target_actor = target_model.actor if target_model is not None else None
   target_critic = target_model.critic if target_model is not None else None
   self.actor = ActorNetwork(**DDPG.actorParams(sess, target_actor))
   self.critic = CriticNetwork(**DDPG.criticParams(sess, target_critic))
   self.target_model = target_model
   # Default to a fresh buffer per instance instead of a shared mutable default argument
   self.replaybuffer = replaybuffer if replaybuffer is not None else ReplayBuffer(HP.BUFFER_SIZE)
   self.train = train
   self.epsilon = 1
Example #3
def main():
    with tf.Session() as sess:

        env = gym.make(ENV_NAME)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        # Check environment dimensions
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        action_bound = 1
        # print("Sample Action: ")
        # print(env.action_space.sample())
        # print("Sample Shape")
        # print(np.shape(env.action_space.sample()))
        # print("Valid Action")
        # val_act = np.array([[1.05],[0.5],[-1.3],[0.2]])
        # print(env.action_space.contains(val_act))
        # Ensure action bound is symmetric
        # assert (env.action_space.high == -env.action_space.low)

        # Build actor and critic networks
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU)

        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        # Film training videos if applicable
        # env = wrappers.Monitor(env, MONITOR_DIR, force=True, video_callable=lambda episode_id: episode_id%49==0)

        train(sess, env, actor, critic, RESTORE)
Example #4
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # along with their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Example #5
def main():

    numHalls = 4
    hallWidth = 1.5
    hallLength = 20
    turns = ['right', 'right', 'right', 'right']
    car_dist_s = hallWidth / 2.0
    car_dist_f = hallLength / 2.0
    car_heading = 0
    time_step = 0.1

    with tf.Session() as sess:

        env = World(numHalls, hallWidth, hallLength, turns,
                    car_dist_s, car_dist_f, car_heading, MAX_EP_STEPS,
                    time_step, LIDAR_FIELD_OF_VIEW, LIDAR_NUM_RAYS,
                    lidar_noise=LIDAR_NOISE, lidar_missing_rays=LIDAR_MISSING_RAYS)

        #np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)

        # Check environment dimensions
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = env.action_space.high

        # Build actor and critic networks
        actor = ActorNetwork(
            sess,
            state_dim,
            action_dim,
            action_bound,
            MAX_ACTOR_LEARNING_RATE,
            TAU,
            layer1_size=l1size,
            layer2_size=l2size,
        )

        critic = CriticNetwork(sess, state_dim, action_dim,
                               MIN_CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        train(sess, env, actor, critic, RESTORE)
Example #6
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        # state_dim = 2, action_dim = 2
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        # print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

        return self.critic_network.q_value(state_batch, action_batch)

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        q_value = 0
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Accumulate transitions until the replay buffer reaches REPLAY_START_SIZE, then start training
        if self.replay_buffer.size() > REPLAY_START_SIZE:
            q_value = self.train()

        # if self.time_step % 10000 == 0:
        # self.actor_network.save_network(self.time_step)
        # self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

        return q_value
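
The ReplayBuffer these agents sample from is likewise not shown. The following is a minimal sketch matching the add / get_batch / size (and count) calls used in these examples, built on a fixed-size deque with uniform sampling; the internals are an assumption, not the original implementation.

import random
from collections import deque

class ReplayBuffer:
    def __init__(self, buffer_size):
        # Oldest transitions are evicted automatically once the buffer is full
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def get_batch(self, batch_size):
        # Uniformly sample a minibatch (smaller if the buffer is not yet full)
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))

    def size(self):
        return len(self.buffer)

    def count(self):
        # Some of the examples call count() instead of size()
        return len(self.buffer)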
Example #7
class DDPG:
  @staticmethod
  def actorParams(sess, target):
    return {'sess': sess,
            'state_size': HP.STATE_DIM,
            'action_size': HP.ACTION_DIM,
            'tau': HP.ACTOR_TAU,
            'lr': HP.ACTOR_LR,
            'target': target}

  @staticmethod
  def criticParams(sess, target):
    return {'sess': sess,
            'state_size': HP.STATE_DIM,
            'action_size': HP.ACTION_DIM,
            'tau': HP.CRITIC_TAU,
            'lr': HP.CRITIC_LR,
            'target': target}

  def __init__(self, target_model=None, train=True, replaybuffer=None):
    assert isinstance(target_model, DDPG) or target_model is None
    config = tf.ConfigProto()
    sess = tf.Session(config=config)
    K.set_session(sess)
    target_actor = target_model.actor if target_model is not None else None
    target_critic = target_model.critic if target_model is not None else None
    self.actor = ActorNetwork(**DDPG.actorParams(sess, target_actor))
    self.critic = CriticNetwork(**DDPG.criticParams(sess, target_critic))
    self.target_model = target_model
    # Default to a fresh buffer per instance instead of a shared mutable default argument
    self.replaybuffer = replaybuffer if replaybuffer is not None else ReplayBuffer(HP.BUFFER_SIZE)
    self.train = train
    self.epsilon = 1

  def act(self, obs):
    action = self.actor.model.predict(obs.reshape(1,HP.STATE_DIM))
    action = action * (HP.MAX_ACTION-HP.MIN_ACTION) + HP.MIN_ACTION
    if self.train and self.epsilon > 0:
      self.epsilon -= 1e-6
      action = self.addOU(action)
    return action

  def addOU(self, action):
    return action + OU.ou(action, HP.OU_MEAN, HP.OU_THETA, HP.OU_SIGMA)

  def train_models(self):
    batch = self.replaybuffer.getBatch(HP.BATCH_SIZE)
    experiences = [np.asarray([i[j] for i in batch]) for j in range(5)]
    states, actions, rewards, nstates, dones = experiences
    target_q = self.compute_target_q(nstates, rewards, dones)
    loss = self.critic.model.train_on_batch([states, actions], target_q)
    a_for_grad = self.actor.model.predict(states)
    grads = self.critic.gradients(states, a_for_grad)
    self.actor.train(states, grads)
    self.actor.target_train()
    self.critic.target_train()

  def remember(self, obs, action, reward, next_obs, done):
    self.replaybuffer.add(obs, action, reward, next_obs, done)

  def compute_target_q(self, nstates, rewards, dones):
    target_q_values = self.target_model.critic.model.predict(
        [nstates, self.target_model.actor.model.predict(nstates)])
    y_t = np.zeros(len(nstates))
    for idx, reward in enumerate(rewards):
      y_t[idx] = reward
      if not dones[idx]:
        # Add the bootstrapped term to this entry only (not to the whole array)
        y_t[idx] += HP.GAMMA * target_q_values[idx]
    return y_t

  def copy_from_target(self):
    self.actor.copy_from_target()
    self.critic.copy_from_target()

  def save(self, location, epoch):
    self.actor.model.save(location+'/actor_model_{}.h5'.format(epoch))
    self.critic.model.save(location+'/critic_model_{}.h5'.format(epoch))
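
A hedged usage sketch for the Keras-based agent above: the environment name, the episode/step limits, and the warm-up condition are illustrative assumptions, not part of the original source; the env is only assumed to match HP.STATE_DIM and HP.ACTION_DIM.

import gym

env = gym.make('Pendulum-v0')        # assumed gym env matching HP.STATE_DIM / HP.ACTION_DIM
target = DDPG(train=False)           # frozen actor/critic pair used for the TD targets
agent = DDPG(target_model=target)    # learner whose updates track the target networks

total_steps = 0
for episode in range(1000):
    obs = env.reset()
    for step in range(200):
        action = agent.act(obs)                          # scaled action plus OU noise while training
        next_obs, reward, done, _ = env.step(action[0])  # act() returns a (1, ACTION_DIM) array
        agent.remember(obs, action, reward, next_obs, done)
        total_steps += 1
        if total_steps > HP.BATCH_SIZE:                  # wait until a full minibatch is available
            agent.train_models()
        obs = next_obs
        if done:
            break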
Example #8
class DDPG:
    def __init__(self, env, state_dim=None):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # along with their target networks
        if state_dim:
            self.state_dim = state_dim
            print(self.state_dim)
        else:
            self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        # Flag to signal save
        self.not_saved = True

        # For normalisation
        self.state_mean = 0
        self.state_std = 1
        self.target_mean = 0
        self.target_std = 1

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # For normalisation
        states = np.array(state_batch)
        targets = np.array(next_state_batch)
        self.state_mean = states.mean(axis=0)
        self.state_std = states.std(axis=0) + 1e-8
        self.target_mean = targets.mean(axis=0)
        self.target_std = targets.std(axis=0) + 1e-8
        states = (state_batch - self.state_mean) / self.state_std
        targets = (next_state_batch - self.target_mean) / self.target_std
        state_batch = states.tolist()
        next_state_batch = targets.tolist()

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise

        # Normalising first
        state = np.array(state)
        state = (state - self.state_mean) / self.state_std
        state = state.tolist()

        action = self.actor_network.action(state)
        #         print ("State-: ", state)
        #         print ("Action-: ", action)
        return action + self.exploration_noise.noise()

    def action(self, state):
        # Normalising first
        state = np.array(state)
        state = (state - self.state_mean) / self.state_std
        state = state.tolist()
        # Taking action
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done, episode):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Accumulate transitions until the replay buffer reaches REPLAY_START_SIZE, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        self.time_step = self.critic_network.time_step
        if episode % 20 == 0:  #self.time_step % 400 == 0:
            if self.not_saved:
                self.actor_network.save_network(episode)  #(self.time_step)
                self.critic_network.save_network(episode)  #(self.time_step)
                self.not_saved = False
        else:
            self.not_saved = True

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
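
Finally, a hedged usage sketch for the normalising agent above; the environment name and the episode/step limits are illustrative assumptions rather than constants from the source.

import gym

env = gym.make('Pendulum-v0')   # any continuous-action gym environment (assumed)
agent = DDPG(env)

for episode in range(1000):
    state = env.reset()
    for step in range(1000):
        action = agent.noise_action(state)              # policy action plus OU exploration noise
        next_state, reward, done, _ = env.step(action)
        # perceive() stores the transition, trains once the buffer passes
        # REPLAY_START_SIZE, and saves the networks every 20 episodes
        agent.perceive(state, action, reward, next_state, done, episode)
        state = next_state
        if done:
            break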