class NeuralQLearner(object):

  def __init__(self, session,
                     optimizer,
                     q_network,
                     state_dim,
                     num_actions,
                     batch_size=32,
                     init_exp=0.5,       # initial exploration prob
                     final_exp=0.1,      # final exploration prob
                     anneal_steps=10000, # N steps for annealing exploration 
                     replay_buffer_size=10000,
                     store_replay_every=5, # how frequently to store experience
                     discount_factor=0.9, # discount future rewards
                     target_update_rate=0.01,
                     reg_param=0.01, # regularization constant
                     max_gradient=5, # max gradient norm
                     double_q_learning=False,
                     summary_writer=None,
                     summary_every=100):

    # tensorflow machinery
    self.session        = session
    self.optimizer      = optimizer
    self.summary_writer = summary_writer

    # model components
    self.q_network     = q_network
    self.replay_buffer = ReplayBuffer(buffer_size=replay_buffer_size)

    # Q learning parameters
    self.batch_size      = batch_size
    self.state_dim       = state_dim
    self.num_actions     = num_actions
    self.exploration     = init_exp
    self.init_exp        = init_exp
    self.final_exp       = final_exp
    self.anneal_steps    = anneal_steps
    self.discount_factor = discount_factor
    self.target_update_rate = target_update_rate
    self.double_q_learning = double_q_learning

    # training parameters
    self.max_gradient = max_gradient
    self.reg_param    = reg_param

    # counters
    self.store_replay_every   = store_replay_every
    self.store_experience_cnt = 0
    self.train_iteration      = 0

    # create and initialize variables
    self.create_variables()
    var_lists = tf.get_collection(tf.GraphKeys.VARIABLES)
    self.session.run(tf.initialize_variables(var_lists))

    # make sure all variables are initialized
    self.session.run(tf.assert_variables_initialized())

    if self.summary_writer is not None:
      # the graph was not available when the summary writer was created
      self.summary_writer.add_graph(self.session.graph)
      self.summary_every = summary_every

  def create_variables(self):
    # compute action from a state: a* = argmax_a Q(s_t,a)
    with tf.name_scope("predict_actions"):
      # raw state representation
      self.states = tf.placeholder(tf.float32, (None, self.state_dim), name="states")
      # initialize Q network
      with tf.variable_scope("q_network"):
        self.q_outputs = self.q_network(self.states)
      # predict actions from Q network
      self.action_scores = tf.identity(self.q_outputs, name="action_scores")
      tf.histogram_summary("action_scores", self.action_scores)
      self.predicted_actions = tf.argmax(self.action_scores, dimension=1, name="predicted_actions")

    # estimate future rewards from the next state: r(s_t,a_t) + gamma * max_a Q(s_{t+1}, a)
    with tf.name_scope("estimate_future_rewards"):
      self.next_states = tf.placeholder(tf.float32, (None, self.state_dim), name="next_states")
      self.next_state_mask = tf.placeholder(tf.float32, (None,), name="next_state_masks")

      if self.double_q_learning:
        # reuse Q network for action selection
        with tf.variable_scope("q_network", reuse=True):
          self.q_next_outputs = self.q_network(self.next_states)
        self.action_selection = tf.argmax(tf.stop_gradient(self.q_next_outputs), 1, name="action_selection")
        tf.histogram_summary("action_selection", self.action_selection)
        self.action_selection_mask = tf.one_hot(self.action_selection, self.num_actions, 1, 0)
        # use target network for action evaluation
        with tf.variable_scope("target_network"):
          self.target_outputs = self.q_network(self.next_states) * tf.cast(self.action_selection_mask, tf.float32)
        self.action_evaluation = tf.reduce_sum(self.target_outputs, reduction_indices=[1,])
        tf.histogram_summary("action_evaluation", self.action_evaluation)
        self.target_values = self.action_evaluation * self.next_state_mask
      else:
        # initialize target network
        with tf.variable_scope("target_network"):
          self.target_outputs = self.q_network(self.next_states)
        # compute future rewards
        self.next_action_scores = tf.stop_gradient(self.target_outputs)
        self.target_values = tf.reduce_max(self.next_action_scores, reduction_indices=[1,]) * self.next_state_mask
        tf.histogram_summary("next_action_scores", self.next_action_scores)

      self.rewards = tf.placeholder(tf.float32, (None,), name="rewards")
      self.future_rewards = self.rewards + self.discount_factor * self.target_values

    # compute loss and gradients
    with tf.name_scope("compute_temporal_differences"):
      # compute temporal difference loss
      self.action_mask = tf.placeholder(tf.float32, (None, self.num_actions), name="action_mask")
      self.masked_action_scores = tf.reduce_sum(self.action_scores * self.action_mask, reduction_indices=[1,])
      self.temp_diff = self.masked_action_scores - self.future_rewards
      self.td_loss = tf.reduce_mean(tf.square(self.temp_diff))
      # regularization loss
      q_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="q_network")
      self.reg_loss = self.reg_param * tf.reduce_sum([tf.reduce_sum(tf.square(x)) for x in q_network_variables])
      # compute total loss and gradients
      self.loss = self.td_loss + self.reg_loss
      gradients = self.optimizer.compute_gradients(self.loss)
      # clip gradients by norm
      for i, (grad, var) in enumerate(gradients):
        if grad is not None:
          gradients[i] = (tf.clip_by_norm(grad, self.max_gradient), var)
      # add histograms for gradients.
      for grad, var in gradients:
        tf.histogram_summary(var.name, var)
        if grad is not None:
          tf.histogram_summary(var.name + '/gradients', grad)
      self.train_op = self.optimizer.apply_gradients(gradients)

    # update target network with Q network
    with tf.name_scope("update_target_network"):
      self.target_network_update = []
      # slowly update target network parameters with Q network parameters
      q_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="q_network")
      target_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_network")
      for v_source, v_target in zip(q_network_variables, target_network_variables):
        # this is equivalent to target = (1-alpha) * target + alpha * source
        update_op = v_target.assign_sub(self.target_update_rate * (v_target - v_source))
        self.target_network_update.append(update_op)
      self.target_network_update = tf.group(*self.target_network_update)

    # scalar summaries
    tf.scalar_summary("td_loss", self.td_loss)
    tf.scalar_summary("reg_loss", self.reg_loss)
    tf.scalar_summary("total_loss", self.loss)
    tf.scalar_summary("exploration", self.exploration)

    self.summarize = tf.merge_all_summaries()
    self.no_op = tf.no_op()

  def storeExperience(self, state, action, reward, next_state, done):
    # always store end states
    if self.store_experience_cnt % self.store_replay_every == 0 or done:
      self.replay_buffer.add(state, action, reward, next_state, done)
    self.store_experience_cnt += 1

  def eGreedyAction(self, states, explore=True):
    if explore and self.exploration > random.random():
      return random.randint(0, self.num_actions-1)
    else:
      return self.session.run(self.predicted_actions, {self.states: states})[0]

  def annealExploration(self, strategy='linear'):
    ratio = max((self.anneal_steps - self.train_iteration)/float(self.anneal_steps), 0)
    self.exploration = (self.init_exp - self.final_exp) * ratio + self.final_exp

  def updateModel(self):
    # not enough experiences yet
    if self.replay_buffer.count() < self.batch_size:
      return

    batch           = self.replay_buffer.getBatch(self.batch_size)
    states          = np.zeros((self.batch_size, self.state_dim))
    rewards         = np.zeros((self.batch_size,))
    action_mask     = np.zeros((self.batch_size, self.num_actions))
    next_states     = np.zeros((self.batch_size, self.state_dim))
    next_state_mask = np.zeros((self.batch_size,))

    for k, (s0, a, r, s1, done) in enumerate(batch):
      states[k] = s0
      rewards[k] = r
      action_mask[k][a] = 1
      # check terminal state
      if not done:
        next_states[k] = s1
        next_state_mask[k] = 1

    # whether to calculate summaries
    calculate_summaries = self.summary_writer is not None and self.train_iteration % self.summary_every == 0

    # perform one update of training
    cost, _, summary_str = self.session.run([
      self.loss,
      self.train_op,
      self.summarize if calculate_summaries else self.no_op
    ], {
      self.states:          states,
      self.next_states:     next_states,
      self.next_state_mask: next_state_mask,
      self.action_mask:     action_mask,
      self.rewards:         rewards
    })

    # update target network using Q-network
    self.session.run(self.target_network_update)

    # emit summaries
    if calculate_summaries:
      self.summary_writer.add_summary(summary_str, self.train_iteration)

    self.annealExploration()
    self.train_iteration += 1
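
Below is a minimal usage sketch for the NeuralQLearner class above. It is not part of the original source: the two-layer q_network constructor and the Gym-style env are hypothetical stand-ins, and NeuralQLearner (with its ReplayBuffer dependency) is assumed to be importable from the original module.

import numpy as np
import tensorflow as tf

def make_q_network(num_actions, hidden_units=64):
  # hypothetical two-layer MLP; any callable mapping a batch of states to
  # per-action Q-values works. It uses tf.get_variable so the agent can
  # instantiate it under both the "q_network" and "target_network" scopes.
  def q_network(states):
    input_dim = int(states.get_shape()[1])
    W1 = tf.get_variable("W1", [input_dim, hidden_units])
    b1 = tf.get_variable("b1", [hidden_units])
    h1 = tf.nn.relu(tf.matmul(states, W1) + b1)
    W2 = tf.get_variable("W2", [hidden_units, num_actions])
    b2 = tf.get_variable("b2", [num_actions])
    return tf.matmul(h1, W2) + b2
  return q_network

session   = tf.Session()
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
agent = NeuralQLearner(session, optimizer, make_q_network(num_actions=2),
                       state_dim=4, num_actions=2)

# env is assumed to be a Gym-style environment, e.g. gym.make("CartPole-v0")
state = env.reset()
for _ in range(10000):
  action = agent.eGreedyAction(np.array([state]))
  next_state, reward, done, _ = env.step(action)
  agent.storeExperience(state, action, reward, next_state, done)
  agent.updateModel()
  state = env.reset() if done else next_state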
Example #2
class DeepDeterministicPolicyGradient(object):

  def __init__(self, session,
                     optimizer,
                     actor_network,
                     critic_network,
                     state_dim,
                     action_dim,
                     batch_size=32,
                     replay_buffer_size=1000000, # size of replay buffer
                     store_replay_every=1,       # how frequently to store experience
                     discount_factor=0.99,       # discount future rewards
                     target_update_rate=0.01,
                     reg_param=0.01,             # regularization constant
                     max_gradient=5,             # max gradient norm
                     noise_sigma=0.20,
                     noise_theta=0.15,
                     summary_writer=None,
                     summary_every=100):

    # tensorflow machinery
    self.session        = session
    self.optimizer      = optimizer
    self.summary_writer = summary_writer

    # model components
    self.actor_network  = actor_network
    self.critic_network = critic_network
    self.replay_buffer  = ReplayBuffer(buffer_size=replay_buffer_size)

    # training parameters
    self.batch_size         = batch_size
    self.state_dim          = state_dim
    self.action_dim         = action_dim
    self.discount_factor    = discount_factor
    self.target_update_rate = target_update_rate
    self.max_gradient       = max_gradient
    self.reg_param          = reg_param

    # Ornstein-Uhlenbeck noise for exploration
    self.noise_var = tf.Variable(tf.zeros([1, action_dim]))
    noise_random = tf.random_normal([1, action_dim], stddev=noise_sigma)
    self.noise = self.noise_var.assign_sub((noise_theta) * self.noise_var - noise_random)

    # counters
    self.store_replay_every   = store_replay_every
    self.store_experience_cnt = 0
    self.train_iteration      = 0

    # create and initialize variables
    self.create_variables()
    var_lists = tf.get_collection(tf.GraphKeys.VARIABLES)
    self.session.run(tf.initialize_variables(var_lists))

    # make sure all variables are initialized
    self.session.run(tf.assert_variables_initialized())

    if self.summary_writer is not None:
      # the graph was not available when the summary writer was created
      self.summary_writer.add_graph(self.session.graph)
      self.summary_every = summary_every

  def create_variables(self):
    
    with tf.name_scope("model_inputs"):
      # raw state representation
      self.states = tf.placeholder(tf.float32, (None, self.state_dim), name="states")
      # action input used by critic network
      self.action = tf.placeholder(tf.float32, (None, self.action_dim), name="action")

    # define outputs from the actor and the critic
    with tf.name_scope("predict_actions"):
      # initialize actor-critic network
      with tf.variable_scope("actor_network"):
        self.policy_outputs = self.actor_network(self.states)
      with tf.variable_scope("critic_network"):
        self.value_outputs    = self.critic_network(self.states, self.action)
        self.action_gradients = tf.gradients(self.value_outputs, self.action)[0]

      # predict actions from policy network
      self.predicted_actions = tf.identity(self.policy_outputs, name="predicted_actions")
      tf.histogram_summary("predicted_actions", self.predicted_actions)
      tf.histogram_summary("action_scores", self.value_outputs)

    # get variable list
    actor_network_variables  = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="actor_network")
    critic_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="critic_network")

    # estimate future rewards from the next state: r + gamma * Q'(s_{t+1}, u'(s_{t+1}))
    with tf.name_scope("estimate_future_rewards"):
      self.next_states = tf.placeholder(tf.float32, (None, self.state_dim), name="next_states")
      self.next_state_mask = tf.placeholder(tf.float32, (None,), name="next_state_masks")
      self.rewards = tf.placeholder(tf.float32, (None,), name="rewards")

      # initialize target network
      with tf.variable_scope("target_actor_network"):
        self.target_actor_outputs = self.actor_network(self.next_states)
      with tf.variable_scope("target_critic_network"):
        self.target_critic_outputs = self.critic_network(self.next_states, self.target_actor_outputs)

      # compute future rewards
      self.next_action_scores = tf.stop_gradient(self.target_critic_outputs)[:,0] * self.next_state_mask
      tf.histogram_summary("next_action_scores", self.next_action_scores)
      self.future_rewards = self.rewards + self.discount_factor * self.next_action_scores

    # compute loss and gradients
    with tf.name_scope("compute_pg_gradients"):

      # compute gradients for critic network
      self.temp_diff        = self.value_outputs[:,0] - self.future_rewards
      self.mean_square_loss = tf.reduce_mean(tf.square(self.temp_diff))
      self.critic_reg_loss  = tf.reduce_sum([tf.reduce_sum(tf.square(x)) for x in critic_network_variables])
      self.critic_loss      = self.mean_square_loss + self.reg_param * self.critic_reg_loss
      self.critic_gradients = self.optimizer.compute_gradients(self.critic_loss, critic_network_variables)

      # compute actor gradients (we don't do weight decay for actor network)
      self.q_action_grad = tf.placeholder(tf.float32, (None, self.action_dim), name="q_action_grad")
      actor_policy_gradients = tf.gradients(self.policy_outputs, actor_network_variables, -self.q_action_grad)
      self.actor_gradients = zip(actor_policy_gradients, actor_network_variables)

      # collect all gradients
      self.gradients = self.actor_gradients + self.critic_gradients

      # clip gradients
      for i, (grad, var) in enumerate(self.gradients):
        # clip gradients by norm
        if grad is not None:
          self.gradients[i] = (tf.clip_by_norm(grad, self.max_gradient), var)

      # summarize gradients
      for grad, var in self.gradients:
        tf.histogram_summary(var.name, var)
        if grad is not None:
          tf.histogram_summary(var.name + '/gradients', grad)

      # emit summaries
      tf.scalar_summary("critic_loss", self.critic_loss)
      tf.scalar_summary("critic_td_loss", self.mean_square_loss)
      tf.scalar_summary("critic_reg_loss", self.critic_reg_loss)

      # apply gradients to update actor network
      self.train_op = self.optimizer.apply_gradients(self.gradients)

    # update target networks with the actor and critic networks
    with tf.name_scope("update_target_network"):
      self.target_network_update = []

      # slowly update target network parameters with the actor network parameters
      actor_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="actor_network")
      target_actor_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_actor_network")
      for v_source, v_target in zip(actor_network_variables, target_actor_network_variables):
        # this is equivalent to target = (1-alpha) * target + alpha * source
        update_op = v_target.assign_sub(self.target_update_rate * (v_target - v_source))
        self.target_network_update.append(update_op)

      # same for the critic network
      critic_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="critic_network")
      target_critic_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_critic_network")
      for v_source, v_target in zip(critic_network_variables, target_critic_network_variables):
        # this is equivalent to target = (1-alpha) * target + alpha * source
        update_op = v_target.assign_sub(self.target_update_rate * (v_target - v_source))
        self.target_network_update.append(update_op)

      # group all assignment operations together
      self.target_network_update = tf.group(*self.target_network_update)

    self.summarize = tf.merge_all_summaries()
    self.no_op = tf.no_op()

  def sampleAction(self, states, exploration=True):
    policy_outs, ou_noise = self.session.run([
      self.policy_outputs,
      self.noise
    ], {
      self.states: states
    })
    # add OU noise for exploration
    policy_outs = policy_outs + ou_noise if exploration else policy_outs
    return policy_outs

  def updateModel(self):

    # not enough experiences yet
    if self.replay_buffer.count() < self.batch_size:
      return

    batch           = self.replay_buffer.getBatch(self.batch_size)
    states          = np.zeros((self.batch_size, self.state_dim))
    rewards         = np.zeros((self.batch_size,))
    actions         = np.zeros((self.batch_size, self.action_dim))
    next_states     = np.zeros((self.batch_size, self.state_dim))
    next_state_mask = np.zeros((self.batch_size,))

    for k, (s0, a, r, s1, done) in enumerate(batch):
      states[k]  = s0
      rewards[k] = r
      actions[k] = a
      if not done:
        next_states[k] = s1
        next_state_mask[k] = 1

    # whether to calculate summaries
    calculate_summaries = self.summary_writer is not None and self.train_iteration % self.summary_every == 0

    # compute a = u(s)
    policy_outs = self.session.run(self.policy_outputs, {
      self.states: states
    })

    # compute d_a Q(s,a) where s=s_i, a=u(s)
    action_grads = self.session.run(self.action_gradients, {
      self.states: states,
      self.action: policy_outs
    })

    critic_loss, _, summary_str = self.session.run([
      self.critic_loss,
      self.train_op,
      self.summarize if calculate_summaries else self.no_op
    ], {
      self.states:          states,
      self.next_states:     next_states,
      self.next_state_mask: next_state_mask,
      self.action:          actions,
      self.rewards:         rewards,
      self.q_action_grad:   action_grads
    })

    # update the target networks using the actor and critic networks
    self.session.run(self.target_network_update)

    # emit summaries
    if calculate_summaries:
      self.summary_writer.add_summary(summary_str, self.train_iteration)

    self.train_iteration += 1

  def storeExperience(self, state, action, reward, next_state, done):
    # always store end states
    if self.store_experience_cnt % self.store_replay_every == 0 or done:
      self.replay_buffer.add(state, action, reward, next_state, done)
    self.store_experience_cnt += 1
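
Below is a minimal usage sketch for the DeepDeterministicPolicyGradient class above. It is not part of the original source: the one-hidden-layer actor_network/critic_network constructors and the Gym-style env are hypothetical stand-ins chosen to match the call signatures the class expects.

import numpy as np
import tensorflow as tf

def actor_network(states, hidden_units=64, action_dim=1):
  # hypothetical deterministic policy: maps states to actions in [-1, 1]
  input_dim = int(states.get_shape()[1])
  W1 = tf.get_variable("W1", [input_dim, hidden_units])
  b1 = tf.get_variable("b1", [hidden_units])
  h1 = tf.nn.relu(tf.matmul(states, W1) + b1)
  W2 = tf.get_variable("W2", [hidden_units, action_dim])
  b2 = tf.get_variable("b2", [action_dim])
  return tf.tanh(tf.matmul(h1, W2) + b2)

def critic_network(states, action, hidden_units=64):
  # hypothetical critic: Q(s, a) from the concatenated state-action pair
  inputs = tf.concat(1, [states, action])  # pre-1.0 tf.concat signature, matching the summary API used above
  input_dim = int(inputs.get_shape()[1])
  W1 = tf.get_variable("W1", [input_dim, hidden_units])
  b1 = tf.get_variable("b1", [hidden_units])
  h1 = tf.nn.relu(tf.matmul(inputs, W1) + b1)
  W2 = tf.get_variable("W2", [hidden_units, 1])
  b2 = tf.get_variable("b2", [1])
  return tf.matmul(h1, W2) + b2

session   = tf.Session()
optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
agent = DeepDeterministicPolicyGradient(session, optimizer,
                                        actor_network, critic_network,
                                        state_dim=3, action_dim=1)

# env is assumed to be a Gym-style continuous-control task, e.g. Pendulum-v0
state = env.reset()
for _ in range(10000):
  action = agent.sampleAction(np.array([state]))[0]
  next_state, reward, done, _ = env.step(action)
  agent.storeExperience(state, action, reward, next_state, done)
  agent.updateModel()
  state = env.reset() if done else next_state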
Example #3
File: ddpg.py  Project: ZhichenML/IPPS
def run_ddpg(amodel,
             cmodel,
             train_indicator=0,
             seeded=1337,
             track_name='practgt2.xml'):
    OU = FunctionOU()
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001  # Learning rate for Critic
    ALPHA = 0.9

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # number of sensor inputs

    np.random.seed(seeded)

    vision = False

    EXPLORE = 100000.
    if train_indicator:
        episode_count = 600
    else:
        episode_count = 3
    max_steps = 20000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision,
                   throttle=True,
                   gear_change=False,
                   track_name=track_name)

    if not train_indicator:
        # Now load the weight
        #logging.info("Now we load the weight")
        print("Now we load the weight")
        try:
            actor.model.load_weights(amodel)
            critic.model.load_weights(cmodel)
            actor.target_model.load_weights(amodel)
            critic.target_model.load_weights(cmodel)
            #logging.info(" Weight load successfully")
            print("Weight load successfully")
        except:
            #ogging.info("Cannot find the weight")
            print("Cannot find the weight")
            exit()

    #logging.info("TORCS Experiment Start.")
    print("TORCS Experiment Start.")
    best_lap = 500

    for i_episode in range(episode_count):
        print("Episode : " + str(i_episode) + " Replay Buffer " +
              str(buff.count()))
        #logging.info("Episode : " + str(i_episode) + " Replay Buffer " + str(buff.count()))
        if np.mod(i_episode, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  # relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY,
                         ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

        total_reward = 0.

        for j_iter in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack(
                (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                 ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            # one scalar Q-learning target per sampled transition
            y_t = np.zeros((len(batch), 1))

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i_episode, "Step", step, "Action", a_t, "Reward",
                  r_t, "Loss", loss)

            if np.mod(step, 1000) == 0:
                logging.info("Episode {}, Distance {}, Last Lap {}".format(
                    i_episode, ob.distRaced, ob.lastLapTime))
                if ob.lastLapTime > 0:
                    # keep the fastest (lowest) lap time
                    if ob.lastLapTime < best_lap:
                        best_lap = ob.lastLapTime

            step += 1
            if done:
                break

        if train_indicator and i_episode > 20:
            if np.mod(i_episode, 3) == 0:
                logging.info("Now we save model")
                actor.model.save_weights("ddpg_actor_weights_periodic.h5",
                                         overwrite=True)
                critic.model.save_weights("ddpg_critic_weights_periodic.h5",
                                          overwrite=True)

        print("TOTAL REWARD @ " + str(i_episode) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("Best Lap {}".format(best_lap))
        print("")
        logging.info("TOTAL REWARD @ " + str(i_episode) +
                     "-th Episode  : Reward " + str(total_reward))
        logging.info("Best Lap {}".format(best_lap))
    env.end()  # This is for shutting down TORCS
    logging.info("Finish.")
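
A hypothetical way to invoke run_ddpg; the weight filenames reuse the ones saved periodically above and are placeholders for wherever the actor/critic weights actually live.

# train from scratch for 600 episodes (the weight files are only loaded when train_indicator == 0)
run_ddpg("ddpg_actor_weights_periodic.h5", "ddpg_critic_weights_periodic.h5",
         train_indicator=1)

# evaluate a previously trained agent for 3 episodes
run_ddpg("ddpg_actor_weights_periodic.h5", "ddpg_critic_weights_periodic.h5",
         train_indicator=0)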
Example #4
class DeepQLearner(object):
    def __init__(self, session,
                       optimizer,
                       q_network,
                       state_dim,
                       num_actions,
                       batch_size=32,
                       init_exp=0.5,       # initial exploration prob
                       final_exp=0.1,      # final exploration prob
                       anneal_steps=10000, # N steps for annealing exploration 
                       replay_buffer_size=10000,
                       store_replay_every=5, # how frequently to store experience
                       discount_factor=0.9, # discount future rewards
                       target_update_rate=0.01,
                       name="DeepQLearner"
                       ):
        """ Initializes the Deep Q Network.

            Args:
                session: A TensorFlow session.
                optimizer: A TensorFlow optimizer.
                q_network: A TensorFlow network that takes in a state and outputs the Q-values over
                           all actions. 
                state_dim: Dimension of states.
                num_actions: Number of actions.
                batch_size: Batch size for training with experience replay.
                init_exp: Initial exploration probability for eps-greedy policy.
                final_exp: Final exploration probability for eps-greedy policy.
                anneal_steps: Number of steps to anneal from init_exp to final_exp.
                replay_buffer_size: Size of replay buffer.
                store_replay_every: Frequency with which to store replay.
                discount_factor: For discounting future rewards.
                target_update_rate: For the slow update of the target network.
                name: Used to create a variable scope. Useful for creating multiple
                      networks.
        """
        self.session = session
        self.optimizer = optimizer
        self.q_network = q_network # tensorflow constructor for Q network
        self.state_dim = state_dim
        self.num_actions = num_actions
        self.batch_size = batch_size

        # initialize exploration
        self.exploration = init_exp
        self.init_exp = init_exp
        self.final_exp = final_exp
        self.anneal_steps = anneal_steps

        self.discount_factor = discount_factor
        self.target_update_rate = target_update_rate

        # Initialize the replay buffer.
        self.replay_buffer_size = replay_buffer_size
        self.replay_buffer = ReplayBuffer(replay_buffer_size)
        self.store_replay_every = store_replay_every
        self.experience_cnt = 0

        self.name = name

        self.train_iteration = 0
        self.constructModel()
        self.session.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

    def constructModel(self):
        """ Constructs the model to do Q-learning.
        """

        # ensure that we don't have conflicts when initializing multiple models
        with tf.variable_scope(self.name):
            # this part of the model is for predicting actions using the learned Q_network.
            with tf.name_scope("predict_actions"):

                # input: vectors of states (in a batch)
                self.states = tf.placeholder(tf.float32, (None, self.state_dim), name="states")

                # use new scope to differentiate this q_network from one used for target evaluation
                # note that this will differentiate the weights, for example "learn_q_network/W1"
                with tf.variable_scope("learn_q_network"):
                    # the current q_network that we train
                    self.action_scores = self.q_network(self.states, self.state_dim, self.num_actions)
                self.predicted_actions = tf.argmax(self.action_scores, axis=1, name="predicted_actions")

            # this part of the model is for estimating future rewards, to be used for the Q-learning
            # update for estimating the target Q-value.
            with tf.name_scope("estimate_future_rewards"):

                # input: vectors of next states (in a batch)
                self.next_states = tf.placeholder(tf.float32, (None, self.state_dim), name="next_states")

                # input: binary inputs that indicate whether states are unfinished or terminal
                # this is important to compute the target and do the Bellman update correctly, since
                # it tells us whether to include the optimal Q value for the next state or not.
                self.unfinished_states_flags = tf.placeholder(tf.float32, (None,), name="unfinished_states_flags")

                # input: rewards from last state and action
                self.rewards = tf.placeholder(tf.float32, (None,), name="rewards")

                # use new scope to differentiate this q_network from one we are training
                # note that this will differentiate the weights, for example "target_q_network/W1"
                with tf.variable_scope("target_q_network"):
                    # the q_network used for evaluation
                    self.eval_q_vals = self.q_network(self.next_states, self.state_dim, self.num_actions)

                # note that this term is only non-zero for a state if it is non-terminal
                # also note the use of stop_gradient to make sure we don't train this q_network
                self.best_future_q_vals = tf.reduce_max(tf.stop_gradient(self.eval_q_vals), axis=1) * self.unfinished_states_flags

                # future rewards given by Bellman equation
                self.future_rewards = self.rewards + self.discount_factor * self.best_future_q_vals

            # this part of the model is for computing the loss and gradients
            with tf.name_scope("loss"):
                # input: one-hot vectors that give the current actions to evaluate the loss for
                self.action_selects = tf.placeholder(tf.float32, (None, self.num_actions), name="action_select")

                # get Q-values for the actions that we took
                self.selected_action_scores = tf.reduce_sum(self.action_scores * self.action_selects, axis=1)

                # temporal difference loss (summed squared TD error over the batch)
                self.td_loss = tf.reduce_sum(tf.square(self.future_rewards - self.selected_action_scores))

                # cross-entropy loss for adversarial example generation
                self.cross_entropy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.action_scores, labels=self.action_selects))

                # TODO: regularization loss

                # TODO: gradient clipping

                self.train_op = self.optimizer.minimize(self.td_loss)

            # this part of the model is for updating the target Q network
            with tf.name_scope("eval_q_network_update"):
                target_network_update = []
                # slowly update target network parameters with Q network parameters
                # we do this by grabbing all the parameters in both networks and manually defining
                # update operations
                # variables are created under the enclosing scope self.name, so filter them by name
                trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.name)
                self.q_network_variables = [v for v in trainable_vars if "learn_q_network" in v.name]
                self.target_network_variables = [v for v in trainable_vars if "target_q_network" in v.name]
                for v_source, v_target in zip(self.q_network_variables, self.target_network_variables):
                    # this is equivalent to target = (1-alpha) * target + alpha * source
                    update_op = v_target.assign_sub(self.target_update_rate * (v_target - v_source))
                    target_network_update.append(update_op)
                # this groups all operations to run together
                # this operation will update all of the target Q network variables
                self.target_network_update = tf.group(*target_network_update)

    def store_experience(self, state, action, reward, next_state, done):
        """ 
        Adds an experience to the replay buffer.
        """
        if self.experience_cnt % self.store_replay_every == 0 or done:
            self.replay_buffer.add(state, action, reward, next_state, done)
        self.experience_cnt += 1

    def greedy_policy(self, states):
        """ 
        Executes the greedy policy. Useful for executing a learned agent.
        """
        return self.session.run(self.predicted_actions, {self.states: states})[0]


    def e_greedy_policy(self, states):
        """ 
        Executes the epsilon greedy policy. 
        """
        # with probability exploration, choose random action
        if random.random() < self.exploration:
            return random.randint(0, self.num_actions-1)
        # choose greedy action given by current Q network
        else:
            return self.greedy_policy(states)


    def annealExploration(self):
        """ 
        Anneals the exploration probability linearly with training iteration.
        """
        ratio = max((self.anneal_steps - self.train_iteration) / float(self.anneal_steps), 0)
        self.exploration = (self.init_exp - self.final_exp) * ratio + self.final_exp

    def updateModel(self):
        """ 
        Update the model by sampling a batch from the replay buffer and
        performing Q-learning updates on the network parameters.
        """

        # not enough experiences yet
        if self.replay_buffer.count() < self.batch_size:
            return

        # sample a random batch from the replay buffer
        batch = self.replay_buffer.getBatch(self.batch_size)

        # keep track of these inputs to the Q networks for the batch
        states                     = np.zeros((self.batch_size, self.state_dim))
        rewards                    = np.zeros((self.batch_size,))
        action_selects             = np.zeros((self.batch_size, self.num_actions))
        next_states                = np.zeros((self.batch_size, self.state_dim))
        unfinished_states_flags    = np.zeros((self.batch_size,))

        # train on the experiences in this batch
        for k, (s0, a, r, s1, done) in enumerate(batch):
            states[k] = s0
            rewards[k] = r
            action_selects[k][a] = 1
            # check terminal state
            if not done:
                next_states[k] = s1
                unfinished_states_flags[k] = 1

        # perform one update of training
        cost, _ = self.session.run([self.td_loss, self.train_op], {
          self.states : states,
          self.next_states : next_states,
          self.unfinished_states_flags : unfinished_states_flags,
          self.action_selects : action_selects,
          self.rewards : rewards
        })

        # update target network using learned Q-network
        self.session.run(self.target_network_update)

        self.annealExploration()
        self.train_iteration += 1

    # saves the trained model
    def saveModel(self, name):
        self.saver.save(self.session, name)

    def restoreModel(self, name):
        self.saver.restore(self.session, './' + name)

    def reset(self):
        # initialize exploration
        self.exploration = self.init_exp

        # Initialize the replay buffer.
        self.replay_buffer = ReplayBuffer(self.replay_buffer_size)
        self.experience_cnt = 0

        self.train_iteration = 0
        self.session.run(tf.global_variables_initializer())
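
Below is a minimal usage sketch for the DeepQLearner class above. It is not part of the original source: simple_q_network is a hypothetical constructor matching the (states, state_dim, num_actions) signature used in constructModel, and env is assumed to be a Gym-style environment.

import numpy as np
import tensorflow as tf

def simple_q_network(states, state_dim, num_actions, hidden_units=64):
    # hypothetical two-layer MLP built with tf.get_variable so it can be
    # created under both the learn_q_network and target_q_network scopes
    W1 = tf.get_variable("W1", [state_dim, hidden_units])
    b1 = tf.get_variable("b1", [hidden_units])
    h1 = tf.nn.relu(tf.matmul(states, W1) + b1)
    W2 = tf.get_variable("W2", [hidden_units, num_actions])
    b2 = tf.get_variable("b2", [num_actions])
    return tf.matmul(h1, W2) + b2

session = tf.Session()
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
agent = DeepQLearner(session, optimizer, simple_q_network,
                     state_dim=4, num_actions=2, name="agent0")

# env is assumed to be a Gym-style environment, e.g. gym.make("CartPole-v0")
state = env.reset()
for _ in range(10000):
    action = agent.e_greedy_policy(np.array([state]))
    next_state, reward, done, _ = env.step(action)
    agent.store_experience(state, action, reward, next_state, done)
    agent.updateModel()
    state = env.reset() if done else next_state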
Example #5
def main(config_dict):
    train = config_dict['train']
    network = config_dict['network']
    experiment_name = config_dict['experiment_name']
    EXPERIMENTS_PATH = config_dict['EXPERIMENTS_PATH']

    actor_weights_file = "%s%s/%s_actor.h5" % (EXPERIMENTS_PATH, network,
                                               network)
    critic_weights_file = "%s%s/%s_critic.h5" % (EXPERIMENTS_PATH, network,
                                                 network)

    log_directory = "%s%s/%s/" % (EXPERIMENTS_PATH, network, experiment_name)

    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001
    LRA = 0.0001
    LRC = 0.001

    action_dim = 3  # Steering / Acceleration / Brake
    state_dim = 29  # Dimension of sensor inputs

    #np.random.seed(42)

    vision = False
    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    done = False
    step = 0
    epsilon = 1

    exp_logger = TORCS_ExperimentLogger(log_directory, experiment_name)

    #directory = "%s%s/" % (EXPERIMENTS_PATH, experiment)
    #actor_weights_file = "%s%s_%s" % (directory, experiment, "actor.h5")
    #critic_weights_file = "%s%s_%s" % (directory, experiment, "critic.h5")

    # TensorFlow GPU
    config = tf.ConfigProto()
    # Not sure if this is really necessary, since we only have a single GPU
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    from keras import backend as K
    K.set_session(sess)

    actor = ActorFCNet(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticFCNet(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)

    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Weight loading
    if not train:
        try:
            actor.model.load_weights(actor_weights_file)
            critic.model.load_weights(critic_weights_file)
            actor.target_model.load_weights(actor_weights_file)
            critic.target_model.load_weights(critic_weights_file)
            print "Weights loaded successfully"
            time.sleep(2)
        except:
            print "Error in loading weights"
            print '-' * 60
            traceback.print_exc(file=sys.stdout)
            print '-' * 60
            assert (False)

    for i in xrange(episode_count):
        print "Episode: %i; Replay Buffer: %i" % (i, buff.count())

        if np.mod(i, 3) == 0:
            # Relaunch TORCS every 3 episodes; memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        state_t = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ,
             ob.wheelSpinVel / 100.0, ob.rpm))

        total_reward = 0.
        # Compute rewards
        for j in xrange(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE  # exploration factor
            action_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            action_t_raw = actor.model.predict(
                state_t.reshape(
                    1,
                    state_t.shape[0]))  # this call to reshape seems suboptimal

            noise_t[0][0] = train * max(epsilon, 0) * OU.run(
                action_t_raw[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train * max(epsilon, 0) * OU.run(
                action_t_raw[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train * max(epsilon, 0) * OU.run(
                action_t_raw[0][2], -0.1, 1.00, 0.05)

            # stochastic brake
            #if random.random() <= 0.1:
            #    noise_t[0][2] = train * max(epsilon, 0) * OU.run(action_t_raw[0][2], 0.2, 1.00, 0.10)

            # May be able to do this a bit more concisely with NumPy vectorization
            action_t[0][0] = action_t_raw[0][0] + noise_t[0][0]
            action_t[0][1] = action_t_raw[0][1] + noise_t[0][1]
            action_t[0][2] = action_t_raw[0][2] + noise_t[0][2]

            # Raw_reward_t is the raw reward computed by the gym_torcs script.
            # We will compute our own reward metric from the ob object
            ob, raw_reward_t, done, info = env.step(action_t[0])

            state_t1 = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
            #reward_t = lng_trans(ob)
            reward_t = raw_reward_t

            buff.add(state_t, action_t[0], reward_t, state_t1,
                     done)  # Add replay buffer

            # Batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            done_indicators = np.asarray([e[4] for e in batch])
            # one scalar Q-learning target per sampled transition
            y_t = np.zeros((len(batch), 1))

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            # note: len(batch) can be smaller than BATCH_SIZE while the buffer is still filling up
            for k in xrange(len(batch)):
                if done_indicators[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.train_target_net()
                critic.train_target_net()

            exp_logger.log(ob, action_t[0], reward_t, loss)

            total_reward += reward_t
            state_t = state_t1

            print("Episode", i, "Step", step, "Action", action_t, "Reward",
                  reward_t, "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train):
                print("Now we save model")
                actor.model.save_weights(actor_weights_file, overwrite=True)
                #with open("actormodel.json", "w") as outfile: json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights(critic_weights_file, overwrite=True)
                #with open("criticmodel.json", "w") as outfile: json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
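
A hypothetical config_dict for the main() entry point above; the keys are the ones read at the top of the function, while the network name, experiment name, and EXPERIMENTS_PATH are placeholders.

config_dict = {
    'train': True,                                # train (True) or only run a trained agent (False)
    'network': 'fc_net',                          # placeholder network name
    'experiment_name': 'torcs_experiment_01',     # placeholder experiment name
    'EXPERIMENTS_PATH': '/path/to/experiments/',  # placeholder root directory for weights and logs
}
main(config_dict)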
Example #6
class NeuralAgent():
    def __init__(self, track_name='practgt2.xml'):
        BUFFER_SIZE = 100000
        TAU = 0.001  # Target Network HyperParameters
        LRA = 0.0001  # Learning rate for Actor
        LRC = 0.001  # Learning rate for Critic
        state_dim = 29  # number of sensor inputs
        self.batch_size = 32
        self.lambda_mix = 10.0
        self.action_dim = 3  # Steering/Acceleration/Brake

        # Tensorflow GPU optimization
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        from keras import backend as K
        K.set_session(sess)

        self.actor = ActorNetwork(sess, state_dim, self.action_dim,
                                  self.batch_size, TAU, LRA)
        self.critic = CriticNetwork(sess, state_dim, self.action_dim,
                                    self.batch_size, TAU, LRC)
        self.buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer
        self.track_name = track_name

        self.save = dict(total_reward=[],
                         total_step=[],
                         ave_reward=[],
                         distRaced=[],
                         distFromStart=[],
                         lastLapTime=[],
                         curLapTime=[],
                         lapTimes=[],
                         avelapTime=[],
                         ave_sp=[],
                         max_sp=[],
                         min_sp=[],
                         test_total_reward=[],
                         test_total_step=[],
                         test_ave_reward=[],
                         test_distRaced=[],
                         test_distFromStart=[],
                         test_lastLapTime=[],
                         test_curLapTime=[],
                         test_lapTimes=[],
                         test_avelapTime=[],
                         test_ave_sp=[],
                         test_max_sp=[],
                         test_min_sp=[])

    def rollout(self, env):
        max_steps = 10000

        vision = False

        # zhichen: it is not stable to have two torcs env and UDP connections
        # env = TorcsEnv(vision=vision, throttle=True, gear_change=False, track_name=self.track_name)

        ob = env.reset(relaunch=True)
        s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY,
                         ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

        total_reward = 0.

        sp = []

        lastLapTime = []

        for j_iter in range(max_steps):

            a_t = self.actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            a_t = a_t[0]
            # print('test a_t:', a_t)
            a_t[0] = clip(a_t[0], -1, 1)
            a_t[1] = clip(a_t[1], 0, 1)
            a_t[2] = clip(a_t[2], 0, 1)

            ob, r_t, done, info = env.step(a_t)

            sp.append(info['speed'])

            if lastLapTime == []:
                if info['lastLapTime'] > 0:
                    lastLapTime.append(info['lastLapTime'])
            elif info['lastLapTime'] > 0 and lastLapTime[-1] != info[
                    'lastLapTime']:
                lastLapTime.append(info['lastLapTime'])

            if np.mod(j_iter + 1, 20) == 0:
                logging.info('step: ' + str(j_iter + 1))
                print('\n ob: ', ob)

            s_t = np.hstack(
                (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                 ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

            total_reward += r_t

            if done: break

        logging.info("Test Episode Reward: " + str(total_reward) +
                     " Episode Length: " + str(j_iter + 1) + " Ave Reward: " +
                     str(total_reward / (j_iter + 1)) + "\n Distance: " +
                     str(info['distRaced']) + ' ' +
                     str(info['distFromStart']) + "\n Last Lap Times: " +
                     str(info['lastLapTime']) + " Cur Lap Times: " +
                     str(info['curLapTime']) + " lastLaptime: " +
                     str(lastLapTime) + "\n ave sp: " + str(np.mean(sp)) +
                     " max sp: " + str(np.max(sp)))
        #logging.info(" Total Steps: " + str(step) + " " + str(i_episode) + "-th Episode Reward: " + str(total_reward) +
        #            " Episode Length: " + str(j_iter+1) + "  Distance" + str(ob.distRaced) + " Lap Times: " + str(ob.lastLapTime))

        #env.end()  # This is for shutting down TORCS

        ave_sp = np.mean(sp)
        max_sp = np.max(sp)
        min_sp = np.min(sp)

        return total_reward, j_iter + 1, info, ave_sp, max_sp, min_sp, lastLapTime

    def update_neural(self,
                      controllers,
                      episode_count=200,
                      tree=False,
                      seed=1337):
        OU = FunctionOU()
        vision = False
        GAMMA = 0.99
        EXPLORE = 100000.
        max_steps = 10000
        reward = 0
        done = False
        step = 0
        epsilon = 1

        if not tree:
            steer_prog, accel_prog, brake_prog = controllers

        # Generate a Torcs environment
        env = TorcsEnv(vision=vision,
                       throttle=True,
                       gear_change=False,
                       track_name=self.track_name)

        window = 5
        lambda_store = np.zeros((max_steps, 1))
        lambda_max = 40.
        factor = 0.8

        logging.info("TORCS Experiment Start with Lambda = " +
                     str(self.lambda_mix))

        for i_episode in range(episode_count):
            logging.info("Episode : " + str(i_episode) + " Replay Buffer " +
                         str(self.buff.count()))
            if np.mod(i_episode, 3) == 0:
                logging.info('relaunch TORCS')
                ob = env.reset(
                    relaunch=True
                )  # relaunch TORCS every 3 episode because of the memory leak error
            else:
                logging.info('reset TORCS')
                ob = env.reset()

            #[ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, list(ob.wheelSpinVel / 100.0), list(ob.track)]
            s_t = np.hstack(
                (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                 ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

            total_reward = 0.
            tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                       [ob.speedZ], [ob.rpm],
                       list(ob.wheelSpinVel / 100.0),
                       list(ob.track), [0, 0, 0]]
            window_list = [tempObs[:] for _ in range(window)]

            sp = []

            lastLapTime = []

            for j_iter in range(max_steps):
                if tree:
                    tree_obs = [
                        sensor for obs in tempObs[:-1] for sensor in obs
                    ]
                    act_tree = controllers.predict([tree_obs])
                    steer_action = clip_to_range(act_tree[0][0], -1, 1)
                    accel_action = clip_to_range(act_tree[0][1], 0, 1)
                    brake_action = clip_to_range(act_tree[0][2], 0, 1)
                else:
                    steer_action = clip_to_range(
                        steer_prog.pid_execute(window_list), -1, 1)
                    accel_action = clip_to_range(
                        accel_prog.pid_execute(window_list), 0, 1)
                    brake_action = clip_to_range(
                        brake_prog.pid_execute(window_list), 0, 1)
                action_prior = [steer_action, accel_action, brake_action]

                tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                           [ob.speedZ], [ob.rpm],
                           list(ob.wheelSpinVel / 100.0),
                           list(ob.track), action_prior]
                window_list.pop(0)
                window_list.append(tempObs[:])

                loss = 0
                epsilon -= 1.0 / EXPLORE
                a_t = np.zeros([1, self.action_dim])
                noise_t = np.zeros([1, self.action_dim])

                a_t_original = self.actor.model.predict(
                    s_t.reshape(1, s_t.shape[0]))
                noise_t[0][0] = max(epsilon, 0) * OU.function(
                    a_t_original[0][0], 0.0, 0.60, 0.30)
                noise_t[0][1] = max(epsilon, 0) * OU.function(
                    a_t_original[0][1], 0.5, 1.00, 0.10)
                noise_t[0][2] = max(epsilon, 0) * OU.function(
                    a_t_original[0][2], 0, 1.00, 0.05)

                a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
                a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
                a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

                mixed_act = [
                    a_t[0][k_iter] / (1 + self.lambda_mix) +
                    (self.lambda_mix /
                     (1 + self.lambda_mix)) * action_prior[k_iter]
                    for k_iter in range(3)
                ]

                ob, r_t, done, info = env.step(mixed_act)

                sp.append(info['speed'])

                if lastLapTime == []:
                    if info['lastLapTime'] > 0:
                        lastLapTime.append(info['lastLapTime'])
                elif info['lastLapTime'] > 0 and lastLapTime[-1] != info[
                        'lastLapTime']:
                    lastLapTime.append(info['lastLapTime'])

                s_t1 = np.hstack(
                    (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                     ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

                self.buff.add(s_t, a_t[0], r_t, s_t1,
                              done)  # Add replay buffer

                # Do the batch update
                batch = self.buff.getBatch(self.batch_size)
                states = np.asarray([e[0] for e in batch])
                actions = np.asarray([e[1] for e in batch])
                rewards = np.asarray([e[2] for e in batch])
                new_states = np.asarray([e[3] for e in batch])
                dones = np.asarray([e[4] for e in batch])
                y_t = np.zeros((states.shape[0], 1))

                target_q_values = self.critic.target_model.predict(
                    [new_states,
                     self.actor.target_model.predict(new_states)])

                for k in range(len(batch)):
                    if dones[k]:
                        y_t[k] = rewards[k]
                    else:
                        y_t[k] = rewards[k] + GAMMA * target_q_values[k]

                loss += self.critic.model.train_on_batch([states, actions],
                                                         y_t)
                a_for_grad = self.actor.model.predict(states)
                grads = self.critic.gradients(states, a_for_grad)
                self.actor.train(states, grads)
                self.actor.target_train()
                self.critic.target_train()

                total_reward += r_t
                s_t = s_t1

                # Control prior mixing term
                if j_iter > 0 and i_episode > 50:
                    lambda_track = lambda_max * (1 - np.exp(-factor * np.abs(
                        r_t +
                        GAMMA * np.mean(target_q_values[-1] - base_q[-1]))))
                    lambda_track = np.squeeze(lambda_track)
                else:
                    lambda_track = 10.
                lambda_store[j_iter] = lambda_track
                base_q = copy.deepcopy(target_q_values)

                if np.mod(step, 2000) == 0:
                    logging.info("Episode " + str(i_episode) + " Distance " +
                                 str(ob.distRaced) + " Lap Times " +
                                 str(ob.lastLapTime))

                step += 1
                if done:
                    break

            #else:
            #    env.end()

            self.lambda_mix = np.mean(lambda_store)

            logging.info('Episode ends! \n' + "Total Steps: " + str(step) +
                         " " + str(i_episode) + "-th Episode Reward: " +
                         str(total_reward) + " Episode Length: " +
                         str(j_iter + 1) + " Ave Reward: " +
                         str(total_reward / (j_iter + 1)) + "\n Distance: " +
                         str(info['distRaced']) + ' ' +
                         str(info['distFromStart']) + "\n Last Lap Times: " +
                         str(info['lastLapTime']) + " Cur Lap Times: " +
                         str(info['curLapTime']) + " lastLaptime: " +
                         str(lastLapTime) + "\n ave sp: " + str(np.mean(sp)) +
                         " max sp: " + str(np.max(sp)))

            #logging.info(" Lambda Mix: " + str(self.lambda_mix))

            self.save['total_reward'].append(total_reward)
            self.save['total_step'].append(j_iter + 1)
            self.save['ave_reward'].append(total_reward / (j_iter + 1))

            self.save['distRaced'].append(info['distRaced'])
            self.save['distFromStart'].append(info['distFromStart'])

            self.save['lastLapTime'].append(info['lastLapTime'])
            self.save['curLapTime'].append(info['curLapTime'])
            self.save['lapTimes'].append(lastLapTime)
            if not lastLapTime:
                self.save['avelapTime'].append(0)
            else:
                self.save['avelapTime'].append(np.mean(lastLapTime))

            self.save['ave_sp'].append(np.mean(sp))
            self.save['max_sp'].append(np.max(sp))
            self.save['min_sp'].append(np.min(sp))

            # test
            if np.mod(i_episode + 1, 10) == 0:
                logging.info("Start Testing!")
                (test_total_reward, test_step, test_info, test_ave_sp,
                 test_max_sp, test_min_sp,
                 test_lastLapTime) = self.rollout(env)
                self.save['test_total_reward'].append(test_total_reward)
                self.save['test_total_step'].append(test_step)
                self.save['test_ave_reward'].append(test_total_reward /
                                                    test_step)

                self.save['test_distRaced'].append(test_info['distRaced'])
                self.save['test_distFromStart'].append(
                    test_info['distFromStart'])

                self.save['test_lastLapTime'].append(test_info['lastLapTime'])
                self.save['test_curLapTime'].append(test_info['curLapTime'])
                self.save['test_lapTimes'].append(test_lastLapTime)

                if not test_lastLapTime:
                    self.save['test_avelapTime'].append(0)
                else:
                    self.save['test_avelapTime'].append(
                        np.mean(test_lastLapTime))

                self.save['test_ave_sp'].append(test_ave_sp)
                self.save['test_max_sp'].append(test_max_sp)
                self.save['test_min_sp'].append(test_min_sp)

            if np.mod(i_episode + 1, 5) == 0:
                print("Now we save model")
                #os.remove("actormodel.h5")
                self.actor.model.save_weights("actormodel_" + str(seed) +
                                              ".h5",
                                              overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(self.actor.model.to_json(), outfile)

                #os.remove("criticmodel.h5")
                self.critic.model.save_weights("criticmodel_" + str(seed) +
                                               ".h5",
                                               overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(self.critic.model.to_json(), outfile)

                filename = "./model/actormodel_" + str(seed) + '_' + str(
                    i_episode + 1) + ".h5"
                dirname = os.path.dirname(filename)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                self.actor.model.save_weights(filename, overwrite=True)
                filename = "./model/criticmodel_" + str(seed) + '_' + str(
                    i_episode + 1) + ".h5"
                dirname = os.path.dirname(filename)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                self.critic.model.save_weights(filename, overwrite=True)

            if np.mod(i_episode + 1, 10) == 0:
                filename = "./Fig/iprl_save_" + str(seed)
                dirname = os.path.dirname(filename)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                with open(filename, 'wb') as f:
                    pickle.dump(self.save, f)

            if i_episode > 1000 and all(
                    np.array(self.save['total_reward'][-20:]) < 20):
                print('Model degenerated. Stopping at episode ' + str(i_episode))
                break

        env.end()  # This is for shutting down TORCS
        logging.info("Neural Policy Update Finish.")
        return None

    def collect_data(self, controllers, tree=False):

        vision = False
        max_steps = 10000
        step = 0

        if not tree:
            steer_prog, accel_prog, brake_prog = controllers

        # Generate a Torcs environment
        env = TorcsEnv(vision=vision,
                       throttle=True,
                       gear_change=False,
                       track_name=self.track_name)
        ob = env.reset(relaunch=True)
        print("S0=", ob)

        window = 5
        lambda_store = np.zeros((max_steps, 1))
        lambda_max = 40.
        factor = 0.8
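        # Note: lambda_store/lambda_max/factor mirror the training loop but are not
        # updated here; during collection the mixing weight stays at self.lambda_mix.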

        logging.info("TORCS Collection started with Lambda = " +
                     str(self.lambda_mix))

        s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY,
                         ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

        total_reward = 0.
        tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                   [ob.speedZ], [ob.rpm],
                   list(ob.wheelSpinVel / 100.0),
                   list(ob.track), [0, 0, 0]]
        window_list = [tempObs[:] for _ in range(window)]

        observation_list = []
        actions_list = []

        lastLapTime = []
        sp = []

        for j_iter in range(max_steps):
            if tree:
                tree_obs = [sensor for obs in tempObs[:-1] for sensor in obs]
                act_tree = controllers.predict([tree_obs])
                steer_action = clip_to_range(act_tree[0][0], -1, 1)
                accel_action = clip_to_range(act_tree[0][1], 0, 1)
                brake_action = clip_to_range(act_tree[0][2], 0, 1)
            else:
                steer_action = clip_to_range(
                    steer_prog.pid_execute(window_list), -1, 1)
                accel_action = clip_to_range(
                    accel_prog.pid_execute(window_list), 0, 1)
                brake_action = clip_to_range(
                    brake_prog.pid_execute(window_list), 0, 1)

            action_prior = [steer_action, accel_action, brake_action]

            tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                       [ob.speedZ], [ob.rpm],
                       list(ob.wheelSpinVel / 100.0),
                       list(ob.track), action_prior]
            window_list.pop(0)
            window_list.append(tempObs[:])

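            # Query the current actor and mix its action with the programmatic prior
            # using the lambda_mix carried over from training.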
            a_t = self.actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            mixed_act = [
                a_t[0][k_iter] / (1 + self.lambda_mix) +
                (self.lambda_mix /
                 (1 + self.lambda_mix)) * action_prior[k_iter]
                for k_iter in range(3)
            ]
            if tree:
                newobs = [item for sublist in tempObs[:-1] for item in sublist]
                observation_list.append(newobs[:])
            else:
                observation_list.append(window_list[:])
            actions_list.append(mixed_act[:])
            ob, r_t, done, info = env.step(mixed_act)

            sp.append(info['speed'])

            if not lastLapTime:
                if info['lastLapTime'] > 0:
                    lastLapTime.append(info['lastLapTime'])
            elif (info['lastLapTime'] > 0
                  and lastLapTime[-1] != info['lastLapTime']):
                lastLapTime.append(info['lastLapTime'])

            s_t1 = np.hstack(
                (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                 ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

            total_reward += r_t
            s_t = s_t1
            #if np.mod(step, 2000) == 0:
            #    logging.info(" Distance " + str(ob.distRaced) + " Lap Times " + str(ob.lastLapTime))

            step += 1
            if done:
                break

        logging.info("Data Collection Finished!")
        logging.info('Episode ends! \n' + "Episode Reward: " +
                     str(total_reward) + " Episode Length: " +
                     str(j_iter + 1) + " Ave Reward: " + str(total_reward /
                                                             (j_iter + 1)) +
                     "\n Distance: " + str(info['distRaced']) + ' ' +
                     str(info['distFromStart']) + "\n Last Lap Times: " +
                     str(info['lastLapTime']) + " Cur Lap Times: " +
                     str(info['curLapTime']) + " lastLaptime: " +
                     str(lastLapTime) + "\n ave sp: " + str(np.mean(sp)) +
                     " max sp: " + str(np.max(sp)))
        env.end()

        return observation_list, actions_list

    def label_data(self, controllers, observation_list, tree=False):
        if not tree:
            steer_prog, accel_prog, brake_prog = controllers
        actions_list = []
        net_obs_list = []
        logging.info("Data labelling started with Lambda = " +
                     str(self.lambda_mix))
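        # Re-label each stored observation with the action from the blended
        # (actor + prior) policy under the current lambda_mix.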
        for window_list in observation_list:
            if tree:
                act_tree = controllers.predict([window_list])
                steer_action = clip_to_range(act_tree[0][0], -1, 1)
                accel_action = clip_to_range(act_tree[0][1], 0, 1)
                brake_action = clip_to_range(act_tree[0][2], 0, 1)
                # in the tree case the stored observation is already a flat sensor list
                net_obs = window_list
                net_obs_list.append(window_list)
            else:
                steer_action = clip_to_range(
                    steer_prog.pid_execute(window_list), -1, 1)
                accel_action = clip_to_range(
                    accel_prog.pid_execute(window_list), 0, 1)
                brake_action = clip_to_range(
                    brake_prog.pid_execute(window_list), 0, 1)
                net_obs = [sensor for obs in window_list[-1] for sensor in obs]
                net_obs_list.append(net_obs[:29])

            action_prior = [steer_action, accel_action, brake_action]

            s_t = np.asarray(net_obs[:29])
            a_t = self.actor.model.predict(s_t.reshape(1, 29))
            mixed_act = [
                a_t[0][k_iter] / (1 + self.lambda_mix) +
                (self.lambda_mix /
                 (1 + self.lambda_mix)) * action_prior[k_iter]
                for k_iter in range(3)
            ]

            actions_list.append(mixed_act[:])

        return net_obs_list, observation_list, actions_list


class NeuralQLearner(object):
    def __init__(
            self,
            session,
            optimizer,
            q_network,
            restore_net_path,
            state_dim,
            num_actions,
            batch_size,
            init_exp,  # initial exploration prob
            final_exp,  # final exploration prob
            anneal_steps,  # N steps for annealing exploration
            replay_buffer_size,
            store_replay_every,  # how frequent to store experience
            discount_factor,  # discount future rewards
            target_update_rate,
            reg_param,  # regularization constants
            max_gradient,  # max gradient norms
            double_q_learning,
            summary_writer,
            summary_every):

        # tensorflow machinery
        self.session = session
        self.optimizer = optimizer
        self.summary_writer = summary_writer

        # model components
        self.q_network = q_network
        self.restore_net_path = restore_net_path
        self.replay_buffer = ReplayBuffer(buffer_size=replay_buffer_size)

        # Q learning parameters
        self.batch_size = batch_size
        self.state_dim = state_dim
        self.num_actions = num_actions
        self.exploration = init_exp
        self.init_exp = init_exp
        self.final_exp = final_exp
        self.anneal_steps = anneal_steps
        self.discount_factor = discount_factor
        self.target_update_rate = target_update_rate
        self.double_q_learning = double_q_learning

        # training parameters
        self.max_gradient = max_gradient
        self.reg_param = reg_param

        # counters
        self.store_replay_every = store_replay_every
        self.store_experience_cnt = 0
        self.train_iteration = 0

        # create and initialize variables
        self.create_variables()

        if self.restore_net_path is not None:
            saver = tf.train.Saver()
            saver.restore(self.session, self.restore_net_path)
        else:
            var_lists = tf.get_collection(tf.GraphKeys.VARIABLES)
            self.session.run(tf.initialize_variables(var_lists))

        #var_lists = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        #self.session.run(tf.variables_initializer(var_lists))

        # make sure all variables are initialized
        self.session.run(tf.assert_variables_initialized())

        self.summary_every = summary_every
        if self.summary_writer is not None:
            # graph was not available when journalist was created
            self.summary_writer.add_graph(self.session.graph)

    def create_variables(self):
        # compute action from a state: a* = argmax_a Q(s_t,a)
        with tf.name_scope("predict_actions"):
            # raw state representation
            self.states = tf.placeholder(tf.float32, (None, self.state_dim),
                                         name="states")
            # initialize Q network
            with tf.variable_scope("q_network"):
                self.q_outputs = self.q_network(self.states)
            # predict actions from Q network
            self.action_scores = tf.identity(self.q_outputs,
                                             name="action_scores")
            tf.summary.histogram("action_scores", self.action_scores)
            self.predicted_actions = tf.argmax(self.action_scores,
                                               dimension=1,
                                               name="predicted_actions")

        # estimate rewards using the next state: r(s_t,a_t) + argmax_a Q(s_{t+1}, a)
        with tf.name_scope("estimate_future_rewards"):
            self.next_states = tf.placeholder(tf.float32,
                                              (None, self.state_dim),
                                              name="next_states")
            self.next_state_mask = tf.placeholder(tf.float32, (None, ),
                                                  name="next_state_masks")

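            # Double Q-learning: the online network selects argmax_a Q(s', a) while
            # the target network evaluates the chosen action, reducing overestimation.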
            if self.double_q_learning:
                # reuse Q network for action selection
                with tf.variable_scope("q_network", reuse=True):
                    self.q_next_outputs = self.q_network(self.next_states)
                self.action_selection = tf.argmax(tf.stop_gradient(
                    self.q_next_outputs),
                                                  1,
                                                  name="action_selection")
                tf.summary.histogram("action_selection", self.action_selection)
                self.action_selection_mask = tf.one_hot(
                    self.action_selection, self.num_actions, 1, 0)
                # use target network for action evaluation
                with tf.variable_scope("target_network"):
                    self.target_outputs = self.q_network(
                        self.next_states) * tf.cast(self.action_selection_mask,
                                                    tf.float32)
                self.action_evaluation = tf.reduce_sum(self.target_outputs,
                                                       axis=[
                                                           1,
                                                       ])
                tf.summary.histogram("action_evaluation",
                                     self.action_evaluation)
                self.target_values = self.action_evaluation * self.next_state_mask
            else:
                # initialize target network
                with tf.variable_scope("target_network"):
                    self.target_outputs = self.q_network(self.next_states)
                # compute future rewards
                self.next_action_scores = tf.stop_gradient(self.target_outputs)
                #self.target_values = tf.reduce_max(self.next_action_scores, axis=[1, ]) * self.next_state_mask
                self.target_values = tf.reduce_max(self.next_action_scores,
                                                   reduction_indices=[
                                                       1,
                                                   ]) * self.next_state_mask
                tf.summary.histogram("next_action_scores",
                                     self.next_action_scores)

            self.rewards = tf.placeholder(tf.float32, (None, ), name="rewards")
            self.future_rewards = self.rewards + self.discount_factor * self.target_values

        # compute loss and gradients
        with tf.name_scope("compute_temporal_differences"):
            # compute temporal difference loss
            self.action_mask = tf.placeholder(tf.float32,
                                              (None, self.num_actions),
                                              name="action_mask")
            #self.masked_action_scores = tf.reduce_sum(self.action_scores * self.action_mask, axis=[1, ])
            self.masked_action_scores = tf.reduce_sum(self.action_scores *
                                                      self.action_mask,
                                                      reduction_indices=[
                                                          1,
                                                      ])
            self.temp_diff = self.masked_action_scores - self.future_rewards
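            # Squashed TD loss: compare sigmoid-scaled Q estimates and targets instead
            # of the raw temporal difference, bounding the per-sample error before the
            # 20000x rescaling below.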
            self.norm_diff = tf.square(
                tf.sigmoid(self.masked_action_scores / 100.0) -
                tf.sigmoid(self.future_rewards / 100.0))
            #self.norm_diff = tf.nn.sigmoid(tf.square(self.temp_diff)/40000.0)
            self.td_loss = tf.reduce_mean(self.norm_diff) * 20000.0
            # regularization loss
            q_network_variables = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="q_network")
            self.reg_loss = self.reg_param * tf.reduce_sum(
                [tf.reduce_sum(tf.square(x)) for x in q_network_variables])
            # compute total loss and gradients
            self.loss = self.td_loss + self.reg_loss
            gradients = self.optimizer.compute_gradients(self.loss)
            # clip gradients by norm
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad,
                                                    self.max_gradient), var)
            # add histograms for gradients.
            for grad, var in gradients:
                tf.summary.histogram(var.name, var)
                if grad is not None:
                    tf.summary.histogram(var.name + '/gradients', grad)
            self.train_op = self.optimizer.apply_gradients(gradients)

        # update target network with Q network
        with tf.name_scope("update_target_network"):
            self.target_network_update = []
            # slowly update target network parameters with Q network parameters
            q_network_variables = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="q_network")
            target_network_variables = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_network")
            for v_source, v_target in zip(q_network_variables,
                                          target_network_variables):
                # this is equivalent to target = (1-alpha) * target + alpha * source
                update_op = v_target.assign_sub(self.target_update_rate *
                                                (v_target - v_source))
                self.target_network_update.append(update_op)
            self.target_network_update = tf.group(*self.target_network_update)

        # scalar summaries
        tf.summary.scalar("td_loss", self.td_loss)
        #tf.summary.scalar("reg_loss", self.reg_loss)
        tf.summary.scalar("total_loss", self.loss)
        tf.summary.scalar("exploration", self.exploration)

        self.summarize = tf.summary.merge_all()
        self.no_op = tf.no_op()

    def storeExperience(self, state, action, reward, next_state, done):
        # always store end states
        if self.store_experience_cnt % self.store_replay_every == 0 or done:
            self.replay_buffer.add(state, action, reward, next_state, done)
        self.store_experience_cnt += 1

    def eGreedyAction(self, states, explore=True):
        if explore and self.exploration > random.random():
            return random.randint(0, self.num_actions - 1)
        else:
            return self.session.run(self.predicted_actions,
                                    {self.states: states})[0]

    def annealExploration(self, strategy='linear'):
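        # Linearly interpolate the exploration probability from init_exp down to
        # final_exp over the first anneal_steps training iterations.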
        ratio = max((self.anneal_steps - self.train_iteration) /
                    float(self.anneal_steps), 0)
        self.exploration = (self.init_exp -
                            self.final_exp) * ratio + self.final_exp

    def updateModel(self, episode=-1):
        # not enough experiences yet
        print("compare  ", self.replay_buffer.count(), self.batch_size)
        if self.replay_buffer.count() < self.batch_size:
            return

        batch = self.replay_buffer.getBatch(self.batch_size)
        states = np.zeros((self.batch_size, self.state_dim))
        rewards = np.zeros((self.batch_size, ))
        action_mask = np.zeros((self.batch_size, self.num_actions))
        next_states = np.zeros((self.batch_size, self.state_dim))
        next_state_mask = np.zeros((self.batch_size, ))

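        # Unpack the sampled transitions; next_state_mask stays 0 for terminal
        # transitions so their bootstrapped target value is dropped in the graph.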
        for k, (s0, a, r, s1, done) in enumerate(batch):
            states[k] = s0
            rewards[k] = r
            action_mask[k][a] = 1
            # check terminal state
            if not done:
                next_states[k] = s1
                next_state_mask[k] = 1

        # whether to calculate summaries
        calculate_summaries = self.train_iteration % self.summary_every == 0 and self.summary_writer is not None

        # perform one update of training
        #direct_r, nxt_r, label_r, now_net_r, diff, norm_diff, cost, td_cost, reg_cost, _, summary_str = self.session.run([
        cost, td_cost, reg_cost, _, summary_str = self.session.run(
            [
                #self.rewards,
                #self.target_values * self.discount_factor,
                #self.future_rewards,
                #self.masked_action_scores,
                #self.temp_diff,
                #self.norm_diff,
                self.loss,
                self.td_loss,
                self.reg_loss,
                self.train_op,
                self.summarize if calculate_summaries else self.no_op
            ],
            {
                self.states: states,
                self.next_states: next_states,
                self.next_state_mask: next_state_mask,
                self.action_mask: action_mask,
                self.rewards: rewards
            })
        '''
        rewards_out = open(rewards_out_path, 'a+')
        if self.train_iteration % 100 == 0:
            for i in range(len(direct_r)):
                print("episode: ", episode, "iter: ", self.train_iteration, "mini batch ---  ", i, "direct_r ",
                      direct_r[i],
                      "nxt_r: ", nxt_r[i], "label_r: ", label_r[i], "now_net_r: ", now_net_r[i],
                      "tmpdiff: ", diff[i],
                      "norm_diff", norm_diff[i],
                      #"loss", cost[i],
                       #"state: ", states[i],
                        file=rewards_out)
            sys.stdout.flush()
        rewards_out.close()
        '''
        #if self.train_iteration % 500:
        #   print('0000 :  ', diff, file=logf)
        #  print('llll :  ', norm_diff, file=logf)
        loss_out = open(loss_out_path, "a+")
        print("episode: ",
              episode,
              "iter: ",
              self.train_iteration,
              "hjk loss is -----  ",
              cost,
              "hjk td_loss is -----  ",
              td_cost,
              "hjk reg_loss is -----  ",
              reg_cost,
              file=loss_out)
        sys.stdout.flush()
        loss_out.close()
        # update target network using Q-network
        self.session.run(self.target_network_update)
        '''
        # emit summaries
        if calculate_summaries:
            self.summary_writer.add_summary(summary_str, self.train_iteration)
        '''
        self.annealExploration()
        self.train_iteration += 1

        del batch, states, rewards, action_mask, next_states, next_state_mask
        #del direct_r, nxt_r, label_r, now_net_r, diff, norm_diff
        gc.collect()
        #objgraph.show_most_common_types(limit=50)

    def save_net(self, path):
        saver = tf.train.Saver()
        save_path = saver.save(self.session, path)
        print("Save to path: " + save_path)