예제 #1
0
class A3CTrainingThread(object):
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device,
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task"):
        """Create the thread-local network, its accumulating trainer and ops.

        Args:
            thread_index: index of this worker; also used to stagger the
                environment start-up in ``process``.
            global_network: shared network whose variables receive the
                gradients and from which the local network is synced.
            initial_learning_rate: learning rate at global step 0.
            learning_rate_input: object used as the feed key for the learning
                rate when ``apply_gradients`` is run.
            grad_applier: object exposing ``apply_gradients(vars, grads)``.
            max_global_time_step: global step at which the annealed learning
                rate reaches zero.
            device: device passed to the network and the trainer.
            network_scope: first entry of ``self.scopes``.
            scene_scope: second entry of ``self.scopes``; also the scene name
                given to the environment.
            task_scope: third entry of ``self.scopes``; converted with
                ``int()`` to the environment's terminal state id.
        """
        # Plain bookkeeping; none of these assignments touches the graph.
        self.thread_index = thread_index
        self.initial_learning_rate = initial_learning_rate
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]

        # The environment is built lazily on the first process() call.
        self.env = None
        self.local_t = 0

        # Running statistics of the episode currently being played.
        self.episode_reward = 0
        self.episode_length = 0
        self.episode_max_q = -np.inf

        # Graph construction: the call order below is the original one.
        self.local_network = ActorCriticFFNetwork(
            action_size=ACTION_SIZE,
            device=device,
            network_scope=network_scope,
            scene_scopes=[scene_scope])
        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        # Names of the local accumulators with their first scope component
        # stripped, so they can be compared against global variable names.
        accum_grad_names = []
        for accum_grad in self.trainer.get_accum_grad_list():
            accum_grad_names.append(self._local_var_name(accum_grad))

        # Keep only the global variables that have a matching accumulator.
        global_net_vars = []
        for global_var in global_network.get_vars():
            if self._get_accum_grad_name(global_var) in accum_grad_names:
                global_net_vars.append(global_var)

        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

    def _local_var_name(self, var):
        """Return ``var.name`` without its first '/'-separated component.

        A name that contains no '/' yields the empty string.
        """
        _head, _sep, remainder = var.name.partition('/')
        return remainder

    def _get_accum_grad_name(self, var):
        """Return the accumulator name derived from ``var``.

        The scope-stripped variable name has every ':' replaced by '_' and
        the suffix '_accum_grad:0' appended.
        """
        stripped = self._local_var_name(var)
        return '{}_accum_grad:0'.format(stripped.replace(':', '_'))

    def _anneal_learning_rate(self, global_time_step):
        """Return the learning rate, linearly decayed over the global steps.

        The rate equals ``initial_learning_rate`` at step 0 and is 0 once
        ``global_time_step`` reaches or exceeds ``max_global_time_step``.
        """
        remaining = self.max_global_time_step - global_time_step
        if remaining < 0.0:
            # Past the schedule's end: clamp instead of going negative.
            remaining = 0.0
        return self.initial_learning_rate * remaining / self.max_global_time_step

    def choose_action(self, pi_values):
        """Sample an action index with probability proportional to its weight.

        A uniform draw scaled by the total weight is compared against the
        running cumulative sums; the first index whose cumulative sum reaches
        the draw is returned. The weights therefore need not sum to one.

        Args:
            pi_values: iterable of non-negative action weights.

        Returns:
            The sampled index, or ``len(pi_values) - 1`` if no cumulative sum
            reaches the draw (which is -1 for an empty input).
        """
        cumulative = []
        running_total = 0.0
        for weight in pi_values:
            running_total = running_total + weight
            cumulative.append(running_total)

        threshold = random.random() * running_total
        for index, upper_bound in enumerate(cumulative):
            if upper_bound >= threshold:
                return index

        # fail safe
        return len(cumulative) - 1

    def _record_score(self, sess, writer, summary_op, placeholders, values,
                      global_t):
        """Evaluate ``summary_op`` and add the result to ``writer``.

        Args:
            sess: session used to run ``summary_op``.
            writer: object exposing ``add_summary(summary, step)``.
            summary_op: op evaluated to obtain the summary.
            placeholders: mapping from a key to the feed placeholder.
            values: mapping from the same keys to the values to feed.
            global_t: step under which the summary is recorded.
        """
        feed = {placeholders[key]: values[key] for key in placeholders}
        summary = sess.run(summary_op, feed_dict=feed)
        if VERBOSE:
            sys.stdout.write('writing to summary writer at time %d\n' %
                             (global_t))
        writer.add_summary(summary, global_t)

    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):
        """Run one rollout of up to LOCAL_T_MAX steps and apply its gradients.

        The method resets the accumulated gradients, syncs the local network
        from the global one, steps the environment until LOCAL_T_MAX steps
        were taken or the episode ended, builds discounted returns backwards
        over the rollout, accumulates gradients on the local network and
        finally runs ``self.apply_gradients`` with the annealed learning rate.

        Args:
            sess: session used for every ``sess.run`` call.
            global_t: global step counter; used for learning-rate annealing,
                for logging and as the summary step.
            summary_writer: writer forwarded to ``_record_score``.
            summary_op: summary op forwarded to ``_record_score``.
            summary_placeholders: placeholder mapping forwarded to
                ``_record_score``.

        Returns:
            The number of environment steps taken during this call.
        """
        if self.env is None:
            # Lazy construction; the sleep grows with the thread index so the
            # workers do not all build their environment at the same moment.
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })

        # Per-step records of this rollout, appended in chronological order.
        states = []
        actions = []
        rewards = []
        values = []
        targets = []
        state_representation = []
        usf = []

        # Bound up front so the code after the loop never meets an unbound
        # name; they are rebuilt on the last step of every rollout.
        state_representation_next = []
        usf_next = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        # Recurrent state as it is at the start of the rollout; it is fed
        # back as initial_lstm_state when the gradients are accumulated.
        start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        for i in range(LOCAL_T_MAX):

            pi_, value_, usf_s_g = self.local_network.run_policy_and_value(
                sess, self.env.s_t, self.env.target, self.scopes)

            imidia_s = self.local_network.run_state(sess, self.env.s_t,
                                                    self.scopes)

            action = self.choose_action(pi_)

            states.append(self.env.s_t)
            actions.append(action)
            values.append(value_)
            targets.append(self.env.target)
            usf.append(usf_s_g)

            state_representation.append(imidia_s)

            if VERBOSE and (self.thread_index
                            == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

            # process game
            self.env.step(action)

            # receive game result
            reward = self.env.reward
            terminal = self.env.terminal

            # ad-hoc reward for navigation; this replaces the environment's
            # own reward read just above.
            reward = 10.0 if terminal else -0.01
            # Force an episode end once it has run for more than 5000 steps.
            if self.episode_length > 5e3:
                terminal = True

            self.episode_reward += reward
            self.episode_length += 1
            self.episode_max_q = max(self.episode_max_q, np.max(value_))

            # clip reward (the episode statistics above use the unclipped one)
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.env.update()

            if i == (LOCAL_T_MAX - 1) or terminal:
                # Last step of the rollout: build the "next step" sequences
                # by shifting the recorded ones by one and appending the
                # value computed for the state reached after the last action.
                imidia_s_next = self.local_network.run_state(
                    sess, self.env.s_t, self.scopes)
                state_representation_next = state_representation[1:] + [
                    imidia_s_next
                ]

                if terminal:
                    usf_next_imi = 0
                else:
                    usf_next_imi = self.local_network.run_usf(
                        sess, self.env.s_t, self.env.target, self.scopes)

                usf_next = usf[1:] + [usf_next_imi]

            if terminal:
                terminal_end = True
                sys.stdout.write(
                    "time %d | thread #%d | scene %s | target #%s\n%s %s episode reward = %.3f\n%s %s episode length = %d\n%s %s episode max Q  = %.3f\n"
                    % (global_t, self.thread_index, self.scene_scope,
                       self.task_scope, self.scene_scope, self.task_scope,
                       self.episode_reward, self.scene_scope, self.task_scope,
                       self.episode_length, self.scene_scope, self.task_scope,
                       self.episode_max_q))

                # Append one CSV row per finished episode.
                oneResult = [
                    global_t, self.thread_index, self.scene_scope,
                    self.task_scope, self.episode_reward, self.episode_length,
                    self.episode_max_q
                ]
                with open('trainingOutput.csv', 'a+') as fp:
                    wr = csv.writer(fp)
                    wr.writerow(oneResult)

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "episode_max_q_input": self.episode_max_q,
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)
                self.episode_reward = 0
                self.episode_length = 0
                self.episode_max_q = -np.inf
                self.local_network.reset_state()
                self.env.reset()

                break

        # Bootstrap values: zero after a finished episode, otherwise the
        # network's estimates for the state the rollout stopped in.
        R = 0.0
        usf_R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.env.s_t,
                                             self.env.target, self.scopes)
            usf_R = self.local_network.run_usf(sess, self.env.s_t,
                                               self.env.target, self.scopes)

        # Walk the rollout backwards. Every per-step list that is zipped
        # below has to be reversed, targets included, so that the entries of
        # one step stay together.
        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()
        targets.reverse()

        state_representation.reverse()
        state_representation_next.reverse()

        usf_next.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []
        batch_usf_R = []
        batch_t = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi, ti, state,
             _usf_n) in zip(actions, rewards, states, values, targets,
                            state_representation_next, usf_next):

            R = ri + GAMMA * R
            # Discounted sum of the next-state representations; the recorded
            # usf_next entries are currently not part of this target.
            usf_R = state + GAMMA * usf_R

            td = R - Vi
            # One-hot encoding of the chosen action.
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)
            batch_usf_R.append(usf_R)
            batch_t.append(ti)

        # Back to chronological order: the training pass unrolls over the
        # whole rollout (step_size below), unlike the step-by-step inference.
        batch_si.reverse()
        batch_a.reverse()
        batch_td.reverse()
        batch_R.reverse()
        batch_usf_R.reverse()
        batch_t.reverse()

        sess.run(self.accum_gradients,
                 feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.t: batch_t,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R,
                     self.local_network.return_usf: batch_usf_R,
                     self.local_network.initial_lstm_state: start_lstm_state,
                     self.local_network.step_size: [len(batch_a)],
                 })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("Local timestep %d\n" % self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
예제 #2
0
class A3C_Thread(object):
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device,
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task"):
        """Set up one worker: local network, sync op, trainer and counters.

        Args:
            thread_index: index of this worker thread.
            global_network: shared network to sync from and to apply
                gradients to.
            initial_learning_rate: learning rate at global step 0.
            learning_rate_input: object used as the feed key for the learning
                rate when ``apply_gradients`` is run.
            grad_applier: object exposing ``apply_gradients(vars, grads)``.
            max_global_time_step: global step at which the annealed learning
                rate reaches zero.
            device: device passed to the network and the trainer.
            network_scope: first entry of ``self.scopes``.
            scene_scope: second entry of ``self.scopes``.
            task_scope: third entry of ``self.scopes``.
        """
        # Scalars that do not touch the graph.
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        # Graph pieces, built in the original order: network, sync, trainer.
        self._set_local_network(device, network_scope, scene_scope,
                                task_scope)
        self.sync = self.local_network.sync_from(global_network)
        self._set_trainer_optimizer(device, global_network, grad_applier)

        # Episode counters and the (still empty) environment slot.
        self._set_environment(initial_learning_rate)

    # Create local network
    def _set_local_network(self, device, network_scope, scene_scope,
                           task_scope):
        """Store the scope names and build the local network with its loss.

        Sets ``network_scope``, ``scene_scope``, ``task_scope``, ``scopes``
        and ``local_network`` on the instance.
        """
        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]

        network = DRLNetwork(action_size=ACTION_SIZE,
                             device=device,
                             network_scope=network_scope,
                             scene_scopes=[scene_scope])
        self.local_network = network
        network.prepare_loss(ENTROPY_BETA, self.scopes)

    # Set trainer and optimizer
    # Set Actor-Critic gradient and optimizer
    # Use the accumulated trainer from Zhu
    def _set_trainer_optimizer(self, device, global_network, grad_applier):
        """Create the accumulating trainer and the op that applies gradients.

        Sets ``trainer``, ``accum_gradients``, ``reset_gradients`` and
        ``apply_gradients`` on the instance. Requires ``self.local_network``
        to exist already.
        """
        trainer = AccumTrainer(device)
        self.trainer = trainer

        local_net = self.local_network
        trainer.prepare_minimize(local_net.total_loss, local_net.get_vars())

        self.accum_gradients = trainer.accumulate_gradients()
        self.reset_gradients = trainer.reset_gradients()

        # Accumulator names with their first scope component stripped.
        accum_grad_names = []
        for accum_grad in trainer.get_accum_grad_list():
            accum_grad_names.append(self._local_var_name(accum_grad))

        # Global variables for which a matching local accumulator exists.
        global_net_vars = []
        for global_var in global_network.get_vars():
            if self._get_accum_grad_name(global_var) in accum_grad_names:
                global_net_vars.append(global_var)

        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, trainer.get_accum_grad_list())

    def _local_var_name(self, var):
        """Drop the leading '/'-separated component from ``var.name``.

        Returns the empty string when the name has no '/'.
        """
        pieces = var.name.split('/', 1)
        if len(pieces) < 2:
            return ''
        return pieces[1]

    def _get_accum_grad_name(self, var):
        """Build the accumulator name that corresponds to ``var``.

        Takes the scope-stripped name, swaps each ':' for '_' and appends
        '_accum_grad:0'.
        """
        base = self._local_var_name(var)
        sanitized = base.replace(':', '_')
        return sanitized + '_accum_grad:0'

    # Set environments
    def _set_environment(self, initial_learning_rate):
        """Initialise the counters, episode statistics and environment slot.

        The environment itself is left as ``None`` here.
        """
        self.initial_learning_rate = initial_learning_rate

        self.env = None
        self.local_t = 0

        # Statistics of the episode currently in progress.
        self.episode_reward = 0
        self.episode_length = 0
        self.episode_max_q = -np.inf

    # Choose one action according to the pi values
    def choose_action(self, pi_values):
        """Draw one action index using ``pi_values`` as the probabilities.

        ``pi_values`` is passed as ``p`` to ``np.random.choice`` and so has
        to be a valid probability vector.
        """
        candidate_indices = np.arange(len(pi_values))
        return np.random.choice(candidate_indices, p=pi_values)

    # Take LOCAL_T_MAX step in one process
    # And update the accumulated gradients
    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):
        """Run one rollout of up to LOCAL_T_MAX steps and apply its gradients.

        The method resets the accumulated gradients, syncs the local network
        from the global one, steps the environment until LOCAL_T_MAX steps
        were taken or the episode ended, builds discounted returns backwards
        over the rollout, accumulates gradients on the local network and
        finally runs ``self.apply_gradients`` with the annealed learning rate.

        Args:
            sess: session used for every ``sess.run`` call.
            global_t: global step counter; used for learning-rate annealing,
                for logging and as the summary step.
            summary_writer: writer forwarded to ``_record_score``.
            summary_op: summary op forwarded to ``_record_score``.
            summary_placeholders: placeholder mapping forwarded to
                ``_record_score``.

        Returns:
            The number of environment steps taken during this call.
        """
        if self.env is None:
            # Lazy construction; the sleep grows with the thread index so the
            # workers do not all build their environment at the same moment.
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })
        start_local_t = self.local_t

        # Per-step records of this rollout, appended in chronological order.
        states = []
        actions = []
        rewards = []
        values = []
        targets = []
        terminal_end = False

        # Reset accumulated gradient variables
        sess.run(self.reset_gradients)
        # Obtain shared parameters from global
        sess.run(self.sync)

        # t_max times loop
        # NOTE(review): nothing in this loop calls self.env.update();
        # confirm that Environment.step() advances s_t on its own.
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.env.s_t, self.env.target, self.scopes)

            # Renormalise so the vector sums to one before it is used as the
            # probability argument of the sampler.
            pi_ = np.array(pi_) / np.sum(pi_)
            action = self.choose_action(pi_)

            states.append(self.env.s_t)
            actions.append(action)
            values.append(value_)
            targets.append(self.env.target)

            if VERBOSE and (self.thread_index
                            == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("%s:" % self.scene_scope)
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

            # process game
            self.env.step(action)

            # receive game result
            reward = self.env.reward
            terminal = self.env.terminal

            # Force an episode end once it has run for more than 5000 steps.
            if self.episode_length > 5e3:
                terminal = True

            self.episode_reward += reward
            self.episode_length += 1
            self.episode_max_q = max(self.episode_max_q, np.max(value_))

            # clip reward (the episode statistics above use the unclipped one)
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            if terminal:
                terminal_end = True
                sys.stdout.write(
                    "#Thread: %d \n time %d | thread #%d | scene %s | target #%s\n%s %s episode reward = %.3f\n%s %s episode length = %d\n%s %s episode max Q  = %.3f\n"
                    % (self.thread_index, global_t, self.thread_index,
                       self.scene_scope, self.task_scope, self.scene_scope,
                       self.task_scope, self.episode_reward, self.scene_scope,
                       self.task_scope, self.episode_length, self.scene_scope,
                       self.task_scope, self.episode_max_q))

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "episode_max_q_input": self.episode_max_q,
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)
                self.episode_reward = 0
                self.episode_length = 0
                self.episode_max_q = -np.inf
                self.env.reset()

                break

        # Bootstrap value: zero after a finished episode, otherwise the
        # network's estimate for the state the rollout stopped in.
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.env.s_t,
                                             self.env.target, self.scopes)

        # Walk the rollout backwards. Every per-step list that is zipped
        # below has to be reversed, targets included, so that the entries of
        # one step stay together.
        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()
        targets.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []
        batch_t = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi, ti) in zip(actions, rewards, states, values,
                                        targets):
            R = ri + GAMMA * R
            td = R - Vi
            # One-hot encoding of the chosen action.
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)
            batch_t.append(ti)

        # The batches are fed in reverse-chronological order.
        sess.run(self.accum_gradients,
                 feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.t: batch_t,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R
                 })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write(
                "#Thread-%d-%s-Local timestep-%d\n" %
                (self.thread_index, self.scene_scope, self.local_t))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t

    def _record_score(self, sess, writer, summary_op, placeholders, values,
                      global_t):
        """Run ``summary_op`` with the given values and hand it to ``writer``.

        Args:
            sess: session used to run ``summary_op``.
            writer: object exposing ``add_summary(summary, step)``.
            summary_op: op evaluated to obtain the summary.
            placeholders: mapping from a key to the feed placeholder.
            values: mapping from the same keys to the values to feed.
            global_t: step under which the summary is recorded.
        """
        feed_dict = dict(
            (placeholders[name], values[name]) for name in placeholders)
        serialized = sess.run(summary_op, feed_dict=feed_dict)
        if VERBOSE:
            sys.stdout.write('writing to summary writer at time %d\n' %
                             (global_t))
        writer.add_summary(serialized, global_t)

    def _anneal_learning_rate(self, global_time_step):
        """Linearly anneal the learning rate towards zero.

        Returns ``initial_learning_rate`` scaled by the fraction of global
        steps still left; the fraction is clamped at zero once
        ``global_time_step`` passes ``max_global_time_step``.
        """
        horizon = self.max_global_time_step
        steps_left = max(horizon - global_time_step, 0.0)
        return self.initial_learning_rate * steps_left / horizon
예제 #3
0
class A3CTrainingThread(object):
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device,
                 network_scope="network",
                 scene_scope="scene",
                 training_scene="scene",
                 task_scope="task",
                 checkpoint_scope="checkpoint"):
        """Create the thread-local network, its accumulating trainer and ops.

        Args:
            thread_index: index of this worker; also used to stagger the
                environment start-up in ``process``.
            global_network: shared network whose variables receive the
                gradients and from which the local network is synced.
            initial_learning_rate: learning rate at global step 0.
            learning_rate_input: object used as the feed key for the learning
                rate when ``apply_gradients`` is run.
            grad_applier: object exposing ``apply_gradients(vars, grads)``.
            max_global_time_step: global step at which the annealed learning
                rate reaches zero.
            device: device passed to the network and the trainer.
            network_scope: first entry of ``self.scopes``.
            scene_scope: second entry of ``self.scopes``; also used in the
                episode log line.
            training_scene: scene name given to the environment.
            task_scope: third entry of ``self.scopes``; also the
                environment's terminal state id.
            checkpoint_scope: the environment's checkpoint state id.
        """
        # Plain bookkeeping; none of these assignments touches the graph.
        self.thread_index = thread_index
        self.initial_learning_rate = initial_learning_rate
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.training_scene = training_scene
        self.task_scope = task_scope
        self.checkpoint_scope = checkpoint_scope
        self.scopes = [network_scope, scene_scope, task_scope]

        # The environment is built lazily on the first process() call.
        self.env = None
        self.local_t = 0

        # Running statistics of the episode currently being played.
        self.episode_reward = 0
        self.episode_length = 0
        self.episode_max_q = -np.inf

        # Graph construction: the call order below is the original one.
        self.local_network = ActorCriticFFNetwork(
            action_size=ACTION_SIZE,
            device=device,
            network_scope=network_scope,
            scene_scopes=[scene_scope])
        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        # Names of the local accumulators with their first scope component
        # stripped, so they can be compared against global variable names.
        accum_grad_names = []
        for accum_grad in self.trainer.get_accum_grad_list():
            accum_grad_names.append(self._local_var_name(accum_grad))

        # Keep only the global variables that have a matching accumulator.
        global_net_vars = []
        for global_var in global_network.get_vars():
            if self._get_accum_grad_name(global_var) in accum_grad_names:
                global_net_vars.append(global_var)

        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

    def _local_var_name(self, var):
        """Return the part of ``var.name`` that follows its first '/'.

        The result is the empty string if the name contains no '/'.
        """
        full_name = var.name
        slash_at = full_name.find('/')
        if slash_at < 0:
            return ''
        return full_name[slash_at + 1:]

    def _get_accum_grad_name(self, var):
        """Map ``var`` to the name of its gradient accumulator.

        The scope-stripped name gets each ':' turned into '_' and is then
        suffixed with '_accum_grad:0'.
        """
        local_name = self._local_var_name(var)
        # Splitting on ':' and re-joining with '_' replaces every colon.
        return '_'.join(local_name.split(':')) + '_accum_grad:0'

    def _anneal_learning_rate(self, global_time_step):
        """Compute the linearly decayed learning rate for a global step.

        The result falls from ``initial_learning_rate`` at step 0 to 0 at
        ``max_global_time_step`` and stays at 0 afterwards.
        """
        steps_left = self.max_global_time_step - global_time_step
        # Never let the remaining budget drop below zero.
        steps_left = 0.0 if steps_left < 0.0 else steps_left
        scaled = self.initial_learning_rate * steps_left
        return scaled / self.max_global_time_step

    def choose_action(self, pi_values):
        """Sample an action index with probability proportional to its weight.

        A uniform draw scaled by the total weight is compared against the
        running cumulative sums; the first index whose cumulative sum reaches
        the draw is returned. The weights therefore need not sum to one.

        Args:
            pi_values: iterable of non-negative action weights.

        Returns:
            The sampled index, or ``len(pi_values) - 1`` if no cumulative sum
            reaches the draw (which is -1 for an empty input).
        """
        cumulative_weights = []
        total = 0.0
        for weight in pi_values:
            total = total + weight
            cumulative_weights.append(total)

        draw = random.random() * total
        matches = (index
                   for index, bound in enumerate(cumulative_weights)
                   if bound >= draw)
        # fail safe: fall back to the last index when nothing matches
        return next(matches, len(cumulative_weights) - 1)

    def _record_score(self, sess, writer, summary_op, placeholders, values,
                      global_t):
        """Write one summary entry for step ``global_t``.

        Args:
            sess: session used to run ``summary_op``.
            writer: object exposing ``add_summary(summary, step)``.
            summary_op: op evaluated to obtain the summary.
            placeholders: mapping from a key to the feed placeholder.
            values: mapping from the same keys to the values to feed.
            global_t: step under which the summary is recorded.
        """
        # Pair every placeholder with the value stored under the same key.
        feeds = {placeholders[tag]: values[tag] for tag in placeholders}
        result = sess.run(summary_op, feed_dict=feeds)
        if VERBOSE:
            sys.stdout.write('writing to summary writer at time %d\n' %
                             (global_t))
        writer.add_summary(result, global_t)

    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):
        """Run one rollout of up to LOCAL_T_MAX steps and apply its gradients.

        The method resets the accumulated gradients, syncs the local network
        from the global one, steps the environment until LOCAL_T_MAX steps
        were taken or the episode ended, builds discounted returns backwards
        over the rollout, accumulates gradients on the local network and
        finally runs ``self.apply_gradients`` with the annealed learning rate.

        Args:
            sess: session used for every ``sess.run`` call.
            global_t: global step counter; used for learning-rate annealing,
                for logging and as the summary step.
            summary_writer: writer forwarded to ``_record_score``.
            summary_op: summary op forwarded to ``_record_score``.
            summary_placeholders: placeholder mapping forwarded to
                ``_record_score``.

        Returns:
            A tuple ``(steps, terminal)``: the number of environment steps
            taken during this call and the terminal flag of the last step.
        """
        if self.env is None:
            # Lazy construction; the sleep grows with the thread index so the
            # workers do not all build their environment at the same moment.
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.training_scene,
                'terminal_state_id': self.task_scope,
                'checkpoint_state_id': self.checkpoint_scope
            })
            self.env.reset()

        # Per-step records of this rollout, appended in chronological order.
        states = []
        actions = []
        rewards = []
        values = []
        targets = []
        checkpoints = []
        positions = []
        auxilaries = []
        auxilaries_cl = []
        aclists = []
        colists = []
        isCheckpointed = []
        collision = []
        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.env.s_t, self.env.s_position, self.env.checkpoint,
                self.env.s_a_t, self.env.s_c_t, self.env.isCheckpoint,
                self.env.s_aux_cl, self.scopes)
            action = self.choose_action(pi_)

            # Everything recorded here is the state *before* the action.
            states.append(self.env.s_t)
            actions.append(action)
            values.append(value_)
            targets.append(self.env.target)
            checkpoints.append(self.env.checkpoint)
            positions.append(self.env.s_position)
            aclists.append(self.env.s_a_t)
            colists.append(self.env.s_c_t)
            collision.append(self.env.s_aux_cl)
            isCheckpointed.append(int(self.env.isCheckpoint))
            if VERBOSE and (self.thread_index
                            == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

            # process game
            self.env.step(action)

            # receive game result
            reward = self.env.reward
            terminal = self.env.terminal

            # Force an episode end once it has run for more than 5000 steps.
            if self.episode_length > 5e3:
                terminal = True

            self.episode_reward += reward
            self.episode_length += 1
            self.episode_max_q = max(self.episode_max_q, np.max(value_))

            # clip reward (the episode statistics above use the unclipped one)
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.env.update()
            # These two are read *after* the update, unlike `collision`,
            # which holds s_aux_cl as it was before the action.
            auxilaries.append(self.env.s_aux)
            auxilaries_cl.append(self.env.s_aux_cl)

            if terminal:
                terminal_end = True
                sys.stdout.write(
                    "time %d | thread #%d | scene %s | target #%s\n%s %s episode reward = %.3f\n%s %s episode length = %d\n%s %s episode max Q  = %.3f\n"
                    % (global_t, self.thread_index, self.scene_scope,
                       self.task_scope, self.scene_scope, self.task_scope,
                       self.episode_reward, self.scene_scope, self.task_scope,
                       self.episode_length, self.scene_scope, self.task_scope,
                       self.episode_max_q))

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "episode_max_q_input": self.episode_max_q,
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)
                self.episode_reward = 0
                self.episode_length = 0
                self.episode_max_q = -np.inf
                self.env.reset()

                break

        # Bootstrap value: zero after a finished episode, otherwise the
        # network's estimate for the state the rollout stopped in.
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.env.s_t,
                                             self.env.s_position,
                                             self.env.checkpoint,
                                             self.env.s_a_t, self.env.s_c_t,
                                             self.env.isCheckpoint,
                                             self.env.s_aux_cl, self.scopes)

        # Walk the rollout backwards. Every per-step list that is zipped
        # below has to be reversed, targets and checkpoints included, so that
        # the entries of one step stay together.
        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()
        targets.reverse()
        checkpoints.reverse()
        positions.reverse()
        auxilaries.reverse()
        auxilaries_cl.reverse()
        aclists.reverse()
        colists.reverse()
        isCheckpointed.reverse()
        collision.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []
        batch_t = []
        batch_c = []
        batch_p = []
        batch_aux = []
        batch_aux_cl = []
        batch_al = []
        batch_cl = []
        batch_ic = []
        batch_collision = []
        # compute and accumulate gradients
        for (ai, ri, si, Vi, ti, ci, pi, auxi, aux_cl_i, ali, cli, ici,
             coli) in zip(actions, rewards, states, values, targets,
                          checkpoints, positions, auxilaries, auxilaries_cl,
                          aclists, colists, isCheckpointed, collision):
            R = ri + GAMMA * R
            td = R - Vi
            # One-hot encoding of the chosen action.
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1
            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)
            batch_t.append(ti)
            batch_c.append(ci)
            batch_p.append(pi)
            batch_aux.append(auxi)
            batch_aux_cl.append(aux_cl_i)
            batch_al.append(ali)
            batch_cl.append(cli)
            batch_ic.append(ici)
            batch_collision.append(coli)

        # The batches are fed in reverse-chronological order. The network's
        # `t` input receives the positions (as in run_policy_and_value);
        # batch_t is collected but not fed.
        sess.run(self.accum_gradients,
                 feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.t: batch_p,
                     self.local_network.c: batch_c,
                     self.local_network.td: batch_td,
                     self.local_network.aux: batch_aux,
                     self.local_network.aux_cl: batch_aux_cl,
                     self.local_network.al: batch_al,
                     self.local_network.cl: batch_cl,
                     self.local_network.ic: batch_ic,
                     self.local_network.col: batch_collision,
                     self.local_network.r: batch_R
                 })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("Local timestep %d\n" % self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t, terminal
예제 #4
0
class A3CTrainingThread(object):
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device,
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task"):
        """Build the thread-local network, its trainer and the sync ops.

        Args:
            thread_index: Index of this worker thread.
            global_network: Shared network; source of ``sync`` and target of
                ``apply_gradients``.
            initial_learning_rate: Learning rate used at global step 0.
            learning_rate_input: Object fed with the annealed learning rate
                when gradients are applied.
            grad_applier: Object whose ``apply_gradients`` builds the op that
                applies the accumulated gradients to the shared variables.
            max_global_time_step: Global step at which the learning rate
                reaches zero.
            device: Device passed to the local network and the trainer.
            network_scope: Scope name for the network.
            scene_scope: Scope name for the scene.
            task_scope: Scope name for the task.
        """
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]

        # Both network flavours take the same constructor arguments, so only
        # the class differs between the recurrent and feed-forward set-ups.
        network_cls = (ActorCriticLSTMNetwork
                       if USE_LSTM else ActorCriticFFNetwork)
        self.local_network = network_cls(action_size=ACTION_SIZE,
                                         device=device,
                                         network_scope=network_scope,
                                         scene_scopes=[scene_scope])
        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

        trainer = AccumTrainer(device)
        self.trainer = trainer
        trainer.prepare_minimize(self.local_network.total_loss,
                                 self.local_network.get_vars())

        self.accum_gradients = trainer.accumulate_gradients()
        self.reset_gradients = trainer.reset_gradients()

        # Keep only the shared variables for which this trainer owns a
        # gradient accumulator (matched by scope-stripped name).
        accum_grad_names = list(
            map(self._local_var_name, trainer.get_accum_grad_list()))
        shared_vars = []
        for shared_var in global_network.get_vars():
            if self._get_accum_grad_name(shared_var) in accum_grad_names:
                shared_vars.append(shared_var)

        self.apply_gradients = grad_applier.apply_gradients(
            shared_vars, trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        # Environment and first observation are created lazily in process().
        self.env = None
        self.obs = None

        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate

        # Per-episode statistics.
        self.episode_reward = 0
        self.episode_length = 0
        self.episode_max_q = -np.inf
        self.entropy = np.zeros(20)

    def _local_var_name(self, var):
        return '/'.join(var.name.split('/')[1:])

    def _get_accum_grad_name(self, var):
        return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'

    def _anneal_learning_rate(self, global_time_step):
        time_step_to_go = max(self.max_global_time_step - global_time_step,
                              0.0)
        learning_rate = self.initial_learning_rate * time_step_to_go / self.max_global_time_step
        return learning_rate

    def choose_action(self, pi_values):
        values = []
        sum = 0.0
        for rate in pi_values:
            sum = sum + rate
            value = sum
            values.append(value)

        r = random.random() * sum
        for i in range(len(values)):
            if values[i] >= r:
                return i

        # fail safe
        return len(values) - 1

    def _record_score(self, sess, writer, summary_op, placeholders, values,
                      global_t):
        """Evaluate ``summary_op`` and hand the result to ``writer``.

        Args:
            sess: Session-like object providing ``run``.
            writer: Object providing ``add_summary(summary, step)``.
            summary_op: Operation evaluated to produce the summary.
            placeholders: Mapping from key to the object to feed.
            values: Mapping from the same keys to the values to feed.
            global_t: Step recorded alongside the summary.
        """
        feed = {placeholders[key]: values[key] for key in placeholders}
        serialized = sess.run(summary_op, feed_dict=feed)
        if VERBOSE:
            sys.stdout.write('writing to summary writer at time %d\n' %
                             (global_t))
        writer.add_summary(serialized, global_t)
        # writer.flush()

    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):
        """Run one rollout of at most LOCAL_T_MAX steps and apply gradients.

        The method resets the gradient accumulators, syncs the local network
        from the shared one, steps the environment while recording states,
        actions, clipped rewards and value estimates, then computes discounted
        returns backwards through the rollout and runs the gradient ops.

        Args:
            sess: Session used to run every op.
            global_t: Current global step; used for learning-rate annealing
                and as the summary step.
            summary_writer: Writer passed to ``_record_score`` at episode end.
            summary_op: Summary op passed to ``_record_score``.
            summary_placeholders: Mapping of summary keys to feedable objects.

        Returns:
            Number of local steps taken during this call.
        """

        if self.env is None:
            # lazy evaluation; the sleep staggers start-up by thread index
            time.sleep(self.thread_index * 1.0)
            self.env = gym.make('Pong-v0')
            self.obs = self.env.reset()

        # Rollout buffers, filled in chronological order.
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        if USE_LSTM:
            # LSTM state at the start of the rollout; fed back as the initial
            # state when the whole rollout is replayed for the update below.
            start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.obs, self.scopes)
            action = self.choose_action(pi_)

            # Record the observation the action was chosen from.
            states.append(self.obs)
            actions.append(action)
            values.append(value_)

            if VERBOSE and (self.thread_index
                            == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

            # process game
            self.obs, reward, terminal, info = self.env.step(action)

            # ad-hoc reward for navigation
            # reward = 10.0 if terminal else -0.01
            # if self.episode_length > 5e3: terminal = True

            # Episode statistics use the unclipped reward.
            self.episode_reward += reward
            self.episode_length += 1
            self.episode_max_q = max(self.episode_max_q, np.max(value_))

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            # self.env.update()

            if terminal:
                terminal_end = True
                sys.stdout.write(
                    "time %d | thread #%d | scene %s \n"
                    "%s %s episode reward = %.3f\n"
                    "%s %s episode length = %d\n"
                    "%s %s episode max Q  = %.3f\n" %
                    (global_t, self.thread_index, self.scene_scope,
                     self.scene_scope, self.task_scope, self.episode_reward,
                     self.scene_scope, self.task_scope, self.episode_length,
                     self.scene_scope, self.task_scope, self.episode_max_q))

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "episode_max_q_input": self.episode_max_q,
                    "learning_rate_input":
                    self._anneal_learning_rate(global_t),
                    "episode_entropy": self.entropy[
                        0]  # Only the first element is logged; the original
                    # author observed that all elements hold the same value.
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)
                # Start a fresh episode.
                self.episode_reward = 0
                self.episode_length = 0
                self.episode_max_q = -np.inf
                self.obs = self.env.reset()
                if USE_LSTM:
                    self.local_network.reset_state()
                break

        # Bootstrap value: zero after a terminal step, otherwise the value
        # estimate of the observation that follows the rollout.
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.obs, self.scopes)

        # Walk the rollout from the last step to the first.
        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R  # discounted return
            td = R - Vi  # return minus the recorded value estimate
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1  # one-hot action

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)

        cur_learning_rate = self._anneal_learning_rate(global_t)

        if USE_LSTM:
            # Restore chronological order so the sequence is replayed from
            # start_lstm_state.
            batch_si.reverse()
            batch_a.reverse()
            batch_td.reverse()
            batch_R.reverse()

            # NOTE(review): this branch runs apply_gradients without running
            # accum_gradients in this call, although reset_gradients was run
            # at the top of the method; the commented-out call below is the
            # accumulate variant. Confirm which one is intended.
            _, self.entropy = sess.run(
                [self.apply_gradients, self.local_network.entropy],
                feed_dict={
                    self.local_network.s: batch_si,
                    self.local_network.a: batch_a,
                    self.local_network.td: batch_td,
                    self.local_network.r: batch_R,
                    self.local_network.initial_lstm_state: start_lstm_state,
                    self.local_network.step_size: [len(batch_a)],
                    self.learning_rate_input: cur_learning_rate
                })
            # _, self.entropy = sess.run([self.accum_gradients, self.local_network.entropy],
            #                            feed_dict={
            #                              self.local_network.s: batch_si,
            #                              self.local_network.a: batch_a,
            #                              self.local_network.td: batch_td,
            #                              self.local_network.r: batch_R,
            #                              self.local_network.step_size: [len(batch_a)]
            #                            })

        else:
            # Accumulate gradients for the batch, then apply them with the
            # annealed learning rate.
            _, self.entropy = sess.run(
                [self.accum_gradients, self.local_network.entropy],
                feed_dict={
                    self.local_network.s: batch_si,
                    self.local_network.a: batch_a,
                    self.local_network.td: batch_td,
                    self.local_network.r: batch_R
                })

            sess.run(self.apply_gradients,
                     feed_dict={self.learning_rate_input: cur_learning_rate})

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("Local timestep %d\n" % self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
예제 #5
0
class A3CTrainingThread(object):
    def __init__(self,
                 thread_index,
                 global_network,
                 global_discriminator,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 grad_applier_discriminator,
                 max_global_time_step,
                 device,
                 device2,
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task"):
        """Build the thread-local policy network, discriminator and their ops.

        Args:
            thread_index: Index of this worker thread.
            global_network: Shared policy network.
            global_discriminator: Shared discriminator.
            initial_learning_rate: Learning rate used at global step 0.
            learning_rate_input: Object fed with the learning rate when
                gradients are applied.
            grad_applier: Builds the ops that apply policy gradients.
            grad_applier_discriminator: Builds the op that applies
                discriminator gradients.
            max_global_time_step: Global step at which the annealed learning
                rate reaches zero.
            device: Device passed to the networks and trainers.
            device2: Not referenced in this constructor.
            network_scope: Scope name for the policy network.
            scene_scope: Scope name for the scene.
            task_scope: Scope name for the task.
        """

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.network_scope = network_scope
        self.network_scope_D = network_scope + "_d"
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]

        # Scopes used when calling into the discriminator.
        self.scopes_d = [self.network_scope_D, task_scope]

        self.local_network = ActorCriticFFNetwork(action_size=ACTION_SIZE,
                                                  device=device,
                                                  network_scope=network_scope,
                                                  scene_scopes=[scene_scope])

        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

        self.trainer = AccumTrainer(device)

        self.trainer.prepare_minimize(
            self.local_network.
            total_loss,  # gradients of the total loss w.r.t. the local variables
            self.local_network.get_vars())

        # For the PPO loss both the current and the "old" parameter sets exist.
        # NOTE(review): these two lists are not used again in this method.
        new_variable_list = self.local_network.get_vars()
        old_varaible_list = self.local_network.get_vars_old()

        # Op run before the PPO update steps to sync the old parameters with
        # the current ones.
        self.old_new_sync = self.local_network.sync_curre_old()

        self.accum_gradients = self.trainer.accumulate_gradients(
        )  # op that accumulates gradients into the accumulators
        self.reset_gradients = self.trainer.reset_gradients(
        )  # op that resets the accumulators

        accum_grad_names = [
            self._local_var_name(x)
            for x in self.trainer.get_accum_grad_list()
        ]  # scope-stripped names of all gradient accumulators

        global_net_vars = [
            x for x in global_network.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names
        ]  # shared variables that have a matching accumulator
        local_net_vars = [
            x for x in self.local_network.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names
        ]

        # The same accumulators can be applied to the shared network or to
        # the local network.
        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())
        self.apply_gradients_local = grad_applier.apply_gradients_local_net(
            local_net_vars, self.trainer.get_accum_grad_list())

        # Original author's note: if training is unstable, it may be
        # preferable to apply the gradients on the local network first and
        # clip before applying them to the shared one.
        self.sync = self.local_network.sync_from(
            global_network
        )  # op that copies the shared parameters into the local network

        # ---- Discriminator ------------------------------------------------
        # NOTE(review): the discriminator is built with network_scope and
        # device; self.network_scope_D and device2 are not used here. Confirm
        # that this is intended.
        self.local_discriminator = Discriminator_WGAN(
            action_size=ACTION_SIZE,
            device=device,
            network_scope=network_scope,
            scene_scopes=[scene_scope])

        self.local_discriminator.prepare_loss_D(ENTROPY_BETA, self.scopes_d)

        self.trainer_D = AccumTrainer_d(device=device,
                                        name="AccumTrainer_d")

        self.trainer_D.prepare_minimize(
            self.local_discriminator.total_loss_d,
            self.local_discriminator.get_vars())

        self.accum_gradients_d = self.trainer_D.accumulate_gradients()
        self.reset_gradients_d = self.trainer_D.reset_gradients()

        accum_grad_names_discrimi = [
            self._local_var_name(x)
            for x in self.trainer_D.get_accum_grad_list()
        ]

        # NOTE(review): global_discri_vars is computed but not used below.
        global_discri_vars = [
            x for x in global_discriminator.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names_discrimi
        ]
        local_discri_vars = [
            x for x in self.local_discriminator.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names_discrimi
        ]

        self.apply_gradients_discriminator = grad_applier_discriminator.apply_gradients(
            local_discri_vars, self.trainer_D.get_accum_grad_list()
        )  # gradients are applied to the LOCAL discriminator variables

        # Weight clipping is requested from the local discriminator (an
        # earlier comment here referred to the global network).
        self.clip_local_d_weights = self.local_discriminator.clip_weights(
        )

        # Sync ops in both directions between local and shared discriminator.
        self.sync_discriminator_l_G = self.local_discriminator.sync_to(
            global_discriminator)
        self.sync_discriminator_G_l = self.local_discriminator.sync_from(
            global_discriminator)

        self.D_var_G = global_discriminator.get_vars()
        self.D_var_l = self.local_discriminator.get_vars()
        # ---- end of discriminator set-up -----------------------------------

        # The environment is created lazily in process().
        self.env = None
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate

        # Per-episode statistics.
        self.episode_reward = 0
        self.episode_length = 0
        self.episode_max_q = -np.inf

    def _local_var_name(self, var):
        return '/'.join(var.name.split('/')[1:])

    def _get_accum_grad_name(self, var):
        return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'

    def _anneal_learning_rate(self, global_time_step):
        time_step_to_go = max(self.max_global_time_step - global_time_step,
                              0.0)
        learning_rate = self.initial_learning_rate * time_step_to_go / self.max_global_time_step
        return learning_rate

    def choose_action(self, pi_values):
        values = []
        sum = 0.0
        for rate in pi_values:
            sum = sum + rate
            value = sum
            values.append(value)

        r = random.random() * sum
        for i in range(len(values)):
            if values[i] >= r:
                return i

        # fail safe
        return len(values) - 1

    def _record_score(self, sess, writer, summary_op, placeholders, values,
                      global_t):
        """Run ``summary_op`` with the given values and write the summary.

        ``placeholders`` and ``values`` are mappings sharing the same keys;
        each placeholder is fed the value stored under its key. The result is
        passed to ``writer.add_summary`` together with ``global_t``.
        """
        feed_dict = {}
        for key, placeholder in placeholders.items():
            feed_dict[placeholder] = values[key]

        summary_str = sess.run(summary_op, feed_dict=feed_dict)

        if VERBOSE:
            print('writing to summary writer at time %d\n' % (global_t))

        writer.add_summary(summary_str, global_t)
        # writer.flush()

    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):
        """Run one rollout, then update the discriminator and the policy.

        Steps performed:
          1. Lazily create the agent environment; create a fresh expert
             environment starting at the agent's current state.
          2. Reset accumulators and sync local networks from the shared ones.
          3. Sample an expert trajectory (at most 100 steps) from the
             shortest-path oracle.
          4. Roll the agent out for at most LOCAL_T_MAX steps.
          5. Run 10 discriminator updates (accumulate, apply, clip).
          6. Add the scaled critic score to the rewards, compute discounted
             returns, and run 4 local policy updates followed by one update
             of the shared network.
          7. Sync the local discriminator to the shared one.

        Args:
            sess: Session used to run every op.
            global_t: Current global step; used for learning-rate annealing
                and as the summary step.
            summary_writer: Writer passed to ``_record_score`` at episode end.
            summary_op: Summary op passed to ``_record_score``.
            summary_placeholders: Mapping of summary keys to feedable objects.

        Returns:
            Number of local steps taken during this call.
        """
        if self.env is None:
            # lazy evaluation; the sleep staggers start-up by thread index
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })
            self.env.reset()

        # On every call the expert starts from the agent's current state.
        self.env_Oracle = Environment({
            'scene_name': self.scene_scope,
            'terminal_state_id': int(self.task_scope),
            'initial_state': self.env.current_state_id
        })
        self.env_Oracle.reset()

        # Agent rollout buffers (chronological order while being filled).
        states = []
        actions = []
        rewards = []
        values = []
        targets = []

        # Expert rollout buffers.
        states_oracle = []
        actions_oracle = []
        targets_oracle = []

        terminal_end = False

        # Policy network: reset accumulators, copy shared weights to local.
        sess.run(self.reset_gradients)
        sess.run(self.sync)

        # Discriminator: copy shared weights to local, reset accumulators.
        sess.run(self.sync_discriminator_G_l)
        sess.run(self.reset_gradients_d)

        start_local_t = self.local_t
        self.oracle = ShortestPathOracle(self.env_Oracle, ACTION_SIZE)

        # ---- Sample the expert trajectory ---------------------------------
        for _ in range(100):
            oracle_pi = self.oracle.run_policy(
                self.env_Oracle.current_state_id)
            oracle_action = self.choose_action(oracle_pi)

            states_oracle.append(self.env_Oracle.s_t)
            actions_oracle.append(oracle_action)
            targets_oracle.append(self.env_Oracle.target)

            self.env_Oracle.step(oracle_action)

            # Read the terminal flag before update() advances the state.
            terminal_o = self.env_Oracle.terminal

            self.env_Oracle.update()

            if terminal_o:
                break

        # ---- Agent rollout: t_max times loop --------------------------------
        for _ in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.env.s_t, self.env.target, self.scopes)

            action = self.choose_action(pi_)

            states.append(self.env.s_t)
            actions.append(action)
            values.append(value_)
            targets.append(self.env.target)

            if VERBOSE and (self.thread_index
                            == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

            # process game
            self.env.step(action)

            # receive game result
            reward = self.env.reward
            terminal = self.env.terminal

            # ad-hoc reward for navigation: overrides the environment reward
            # with 10 on reaching the terminal state and -0.01 otherwise.
            reward = 10.0 if terminal else -0.01
            # Cap the episode length. The reward was already set above, so a
            # forced termination here does not earn the terminal reward.
            if self.episode_length > 5e3:
                terminal = True

            self.episode_reward += reward
            self.episode_length += 1
            self.episode_max_q = max(self.episode_max_q, np.max(value_))

            # clip reward to [-1, 1] (the terminal reward of 10 becomes 1)
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.env.update()

            if terminal:
                score = self.local_discriminator.run_critic(
                    sess, states, targets, actions, self.scopes_d)
                sys.stdout.write("Critic_Score = {0}".format(score))
                terminal_end = True
                sys.stdout.write(
                    "time %d | thread #%d | scene %s | target #%s\n%s %s episode reward = %.3f\n%s %s episode length = %d\n%s %s episode max Q  = %.3f\n"
                    % (global_t, self.thread_index, self.scene_scope,
                       self.task_scope, self.scene_scope, self.task_scope,
                       self.episode_reward, self.scene_scope, self.task_scope,
                       self.episode_length, self.scene_scope, self.task_scope,
                       self.episode_max_q))

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "episode_max_q_input": self.episode_max_q,
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)
                # Start a fresh episode.
                self.episode_reward = 0
                self.episode_length = 0
                self.episode_max_q = -np.inf
                self.env.reset()

                break

        # Bootstrap value: zero after a terminal step, otherwise the value
        # estimate of the state that follows the rollout.
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.env.s_t,
                                             self.env.target, self.scopes)

        # Agent's samples: reverse every per-step list together so that the
        # zips below pair entries that belong to the same time step.
        # (targets was previously left in forward order.)
        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()
        targets.reverse()

        # Expert's samples: previously actions_oracle was reversed twice and
        # targets_oracle not at all, which paired reversed states with
        # forward-order actions.
        states_oracle.reverse()
        actions_oracle.reverse()
        targets_oracle.reverse()

        # Agent's batch for the policy update.
        batch_si = []
        batch_a = []
        batch_actions = []
        batch_td = []
        batch_R = []
        batch_t = []

        # Expert's batch for the discriminator.
        batch_si_ex = list(states_oracle)
        batch_a_ex = list(actions_oracle)
        batch_t_ex = list(targets_oracle)

        # Agent's batch for the discriminator.
        batch_actions_d = list(actions)
        batch_si_d = list(states)
        batch_t_d = list(targets)

        cur_learning_rate = self._anneal_learning_rate(global_t)

        # ---- Discriminator updates ------------------------------------------
        discriminator_feed = {
            self.local_discriminator.s_e: batch_si_ex,
            self.local_discriminator.Actions_e: batch_a_ex,
            self.local_discriminator.s_a: batch_si_d,
            self.local_discriminator.Actions_a: batch_actions_d,
            self.local_discriminator.t_e: batch_t_ex,
            self.local_discriminator.t_a: batch_t_d
        }
        for _ in range(10):
            # The accumulators are not reset between iterations.
            sess.run(self.accum_gradients_d, feed_dict=discriminator_feed)

            # Applied to the local discriminator variables with a fixed rate.
            sess.run(self.apply_gradients_discriminator,
                     feed_dict={self.learning_rate_input: 0.00005})

            # NOTE(review): the loss value is evaluated but never used; the
            # run is kept so the sequence of session calls is unchanged.
            sess.run(self.local_discriminator.total_loss_d,
                     feed_dict=discriminator_feed)

            # Clip the local discriminator weights after every update.
            sess.run(self.clip_local_d_weights)

        critic_r = self.local_discriminator.run_critic(sess, batch_si_d,
                                                       batch_t_d,
                                                       batch_actions_d,
                                                       self.scopes_d)
        critic_r = critic_r * 0.1

        # NOTE(review): presumably run_critic returns a NumPy array, making
        # this an element-wise sum of environment reward and critic score
        # rather than a list concatenation. Confirm the returned shape.
        rewards = rewards + critic_r

        # Discounted returns and advantages, walking from the last step back.
        for (ai, ri, si, Vi, ti) in zip(actions, rewards, states, values,
                                        targets):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1  # one-hot action
            batch_actions.append(ai)
            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)
            batch_t.append(ti)

        # PPO: sync the current parameters into the "old" network first.
        sess.run(self.old_new_sync)
        policy_feed = {
            self.local_network.s: batch_si,
            self.local_network.a: batch_a,
            self.local_network.t: batch_t,
            self.local_network.td: batch_td,
            self.local_network.r: batch_R,
        }
        for _ in range(4):
            # The accumulators are not reset between iterations.
            sess.run(self.accum_gradients, feed_dict=policy_feed)

            # Apply the accumulated gradients to the local network.
            sess.run(self.apply_gradients_local,
                     feed_dict={self.learning_rate_input: cur_learning_rate})

        # Apply the accumulated gradients to the shared network.
        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        # Push the local discriminator parameters to the shared discriminator.
        sess.run(self.sync_discriminator_l_G)

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("Local timestep %d\n" % self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
class ADQN_Thread(object):
    """Worker thread that trains a DQN-style network asynchronously.

    Each worker owns a thread-local ``DRLNetwork``, an ``AccumTrainer`` and a
    replay buffer.  ``process`` steps the environment for at most
    ``LOCAL_T_MAX`` steps, stores every transition, then samples a minibatch
    from the buffer and pushes the resulting gradients to the global network.
    """

    # Flattened length of one stored observation; ``process`` reshapes replay
    # rows back to (2048, 4).  A replay row is laid out as [s, a, r, s_].
    _STATE_SIZE = 2048 * 4

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device, network_scope="network", scene_scope="scene",
                 task_scope="task"):
        """Build the local network, the gradient ops and the replay buffer.

        Args:
            thread_index: index of this worker; also used to stagger start-up.
            global_network: shared network that receives the gradients and
                from which the local weights are synced.
            initial_learning_rate: learning rate at global step 0.
            learning_rate_input: placeholder fed with the annealed rate.
            grad_applier: object exposing ``apply_gradients(vars, grads)``.
            max_global_time_step: step at which the learning rate reaches 0.
            device: device string forwarded to the network and the trainer.
            network_scope, scene_scope, task_scope: scope names identifying
                this worker's network, scene and navigation target.
        """
        self.thread_index = thread_index

        self._set_local_network(device, network_scope, scene_scope, task_scope)

        # Op copying the global weights into the local network.
        self.sync = self.local_network.sync_from(global_network)

        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self._set_trainer_optimizer(device, global_network, grad_applier)
        self._set_environment(initial_learning_rate)

        # Replay buffer: one row per transition, [s, a, r, s_].
        self.memory_size = MEMORY_SIZE
        self.memory = np.zeros((self.memory_size, self._STATE_SIZE * 2 + 2))
        # Total number of transitions ever stored.  Initialised here so that
        # process() can read it without relying on store_transition() having
        # created it first.
        self.memory_counter = 0

        self.replace_target_iter = DQN_REPLACE_TARGET_ITER
        self.batch_size = DQN_BATCH_SIZE
        self.gamma = REWARD_DECAY

    def _set_local_network(self, device, network_scope, scene_scope, task_scope):
        """Create the thread-local network and prepare its loss."""
        self.local_network = DRLNetwork(action_size=ACTION_SIZE,
                                        device=device,
                                        network_scope=network_scope,
                                        scene_scopes=[scene_scope])

        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]
        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

    def _set_trainer_optimizer(self, device, global_network, grad_applier):
        """Create the accumulate / reset / apply gradient ops.

        Gradients are accumulated against the local network's loss and applied
        to the global variables whose accumulator name matches.
        """
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        accum_grad_names = [
            self._local_var_name(x)
            for x in self.trainer.get_accum_grad_list()
        ]
        global_net_vars = [
            x for x in global_network.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names
        ]

        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())

    def _local_var_name(self, var):
        """Return ``var.name`` with its first '/'-separated component removed."""
        return '/'.join(var.name.split('/')[1:])

    def _get_accum_grad_name(self, var):
        """Return the accumulator name that corresponds to ``var``."""
        return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'

    def _set_environment(self, initial_learning_rate):
        """Reset the environment handle and the per-episode counters."""
        self.env = None  # created lazily on the first process() call
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        self.episode_length = 0

    def choose_action(self, actions_value):
        """Pick an action from the Q-values.

        With probability ``EPSILON`` the arg-max action is returned; otherwise
        an action is drawn uniformly from ``range(ACTION_SIZE)``.
        """
        if np.random.uniform() < EPSILON:
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, ACTION_SIZE)
        return action

    @staticmethod
    def _crossed_period(start_t, end_t, period):
        """Return True if a multiple of ``period`` lies in ``(start_t, end_t]``."""
        return end_t // period > start_t // period

    def process(self, sess, global_t, summary_writer, summary_op, summary_placeholders):
        """Run up to ``LOCAL_T_MAX`` environment steps, then one replay update.

        Args:
            sess: session used to run the network and gradient ops.
            global_t: current global step (used for annealing and logging).
            summary_writer, summary_op, summary_placeholders: forwarded to
                ``_record_score`` when an episode ends.

        Returns:
            Number of local steps taken during this call.
        """
        if self.env is None:
            # Lazy creation; the sleep staggers worker start-up by thread index.
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })
        start_local_t = self.local_t

        # Reset accumulated gradients, then pull the shared parameters.
        sess.run(self.reset_gradients)
        sess.run(self.sync)

        terminal = False
        for _ in range(LOCAL_T_MAX):
            old_s_t = self.env.s_t
            actions_value = self.local_network.run_DQN(
                sess, self.env.s_t, self.env.target, self.scopes)
            action = self.choose_action(actions_value)

            if VERBOSE and (self.thread_index == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("%s:" % self.scene_scope)
                sys.stdout.write("Pi = {0} V = {1}\n".format(actions_value, action))

            self.env.step(action)

            reward = self.env.reward
            terminal = self.env.terminal

            # Force episodes to end once they exceed 5000 steps.
            if self.episode_length > 5e3:
                terminal = True

            self.episode_reward += reward
            self.episode_length += 1
            self.local_t += 1

            # NOTE(review): the A3C worker in this file calls env.update()
            # after env.step(); confirm env.s_t already holds the successor
            # state here without it.
            self.store_transition(old_s_t, action, reward, self.env.s_t)

            if terminal:
                sys.stdout.write("#Thread: %d \n time %d | thread #%d | scene %s | target #%s\n%s %s episode reward = %.3f\n%s %s episode length = %d\n%s %s \n" % (self.thread_index, global_t, self.thread_index, self.scene_scope, self.task_scope, self.scene_scope, self.task_scope, self.episode_reward, self.scene_scope, self.task_scope, self.episode_length, self.scene_scope, self.task_scope))

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)
                self.episode_reward = 0
                self.episode_length = 0
                self.env.reset()
                break

        # Refresh the target network whenever this call stepped past a
        # multiple of replace_target_iter.  local_t can advance by several
        # steps per call, so testing `local_t % iter == 0` only at this point
        # would silently skip most refreshes.
        if self._crossed_period(start_local_t, self.local_t,
                                self.replace_target_iter):
            sess.run(self.local_network.replace_target_op)

        # Sample a minibatch (with replacement) from the filled part of the
        # replay buffer.
        filled = min(self.memory_counter, self.memory_size)
        sample_index = np.random.choice(filled, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        state_size = self._STATE_SIZE
        batch_memory_s_ = np.reshape(batch_memory[:, -state_size:], (-1, 2048, 4))
        batch_memory_s = np.reshape(batch_memory[:, :state_size], (-1, 2048, 4))
        batch_memory_t = np.reshape(
            np.tile(self.env.target, [self.batch_size, 1]), (-1, 2048, 4))

        q_next, q_eval = sess.run(
            [self.local_network.q_next, self.local_network.q_eval],
            feed_dict={
                self.local_network.s_: batch_memory_s_,  # fed to the target net
                self.local_network.s: batch_memory_s,  # fed to the eval net
                self.local_network.t: batch_memory_t
            })

        # Only the entry of the action actually taken is changed in q_target.
        q_target = q_eval.copy()

        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, state_size].astype(int)
        reward = batch_memory[:, state_size + 1]

        key_eval = self.network_scope + '/' + self.scene_scope + '/eval'
        # NOTE(review): `terminal` is the flag of the last environment step of
        # this call, yet it is applied to every sampled transition; the buffer
        # does not store a per-transition terminal flag.
        if terminal:
            q_target[key_eval][batch_index, eval_act_index] = reward
        else:
            key_target = self.network_scope + '/' + self.scene_scope + '/target'
            q_target[key_eval][batch_index, eval_act_index] = (
                reward + self.gamma * np.max(q_next[key_target], axis=1))

        cur_learning_rate = self._anneal_learning_rate(global_t)

        # NOTE(review): gradients are applied after every sample without
        # resetting the accumulators, so earlier samples are applied
        # repeatedly.  Kept as in the original; confirm this is intended.
        for idx in batch_index:
            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: [batch_memory_s[idx]],
                         self.local_network.t: [batch_memory_t[idx]],
                         self.local_network.q_target: [q_target[key_eval][idx]]
                     })

            sess.run(self.apply_gradients,
                     feed_dict={self.learning_rate_input: cur_learning_rate})

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("#Thread-%d-%s-Local timestep-%d\n" % (self.thread_index, self.scene_scope, self.local_t))

        # Number of local steps advanced by this call.
        return self.local_t - start_local_t

    def _record_score(self, sess, writer, summary_op, placeholders, values, global_t):
        """Evaluate ``summary_op`` with ``values`` and write it at ``global_t``."""
        feed_dict = {}
        for k in placeholders:
            feed_dict[placeholders[k]] = values[k]
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        if VERBOSE:
            sys.stdout.write('writing to summary writer at time %d\n' % (global_t))
        writer.add_summary(summary_str, global_t)

    def _anneal_learning_rate(self, global_time_step):
        """Linearly decay the learning rate to 0 at ``max_global_time_step``."""
        time_step_to_go = max(self.max_global_time_step - global_time_step, 0.0)
        learning_rate = self.initial_learning_rate * time_step_to_go / self.max_global_time_step
        return learning_rate

    def store_transition(self, s, a, r, s_):
        """Write the transition ``(s, a, r, s_)`` into the replay ring buffer.

        The oldest row is overwritten once ``memory_size`` rows are stored.
        """
        transition = np.hstack((np.reshape(s, -1), [a, r], np.reshape(s_, -1)))

        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition

        self.memory_counter += 1
class A3CTrainingThread(object):
    """Actor-critic worker thread with a PPO-style local update loop.

    Each worker owns a local ``ActorCriticFFNetwork`` and a local
    ``Discriminator_WGAN``.  ``process`` collects a short rollout, computes
    n-step returns and GAE advantages, runs a few local gradient steps and
    finally applies the accumulated gradients to the global network.
    """

    def __init__(self,
                 thread_index,
                 global_network,
                 global_discriminator,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 grad_applier_discriminator,
                 max_global_time_step,
                 device,
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task"):
        """Build the local networks and all gradient / sync ops.

        Args:
            thread_index: index of this worker; also used to stagger start-up.
            global_network: shared policy/value network.
            global_discriminator: shared discriminator network.
            initial_learning_rate: learning rate at global step 0.
            learning_rate_input: placeholder fed with the annealed rate.
            grad_applier: applier for the policy/value network gradients.
            grad_applier_discriminator: applier for discriminator gradients.
            max_global_time_step: step at which the learning rate reaches 0.
            device: device string forwarded to networks and trainers.
            network_scope, scene_scope, task_scope: scope names identifying
                this worker's network, scene and navigation target.
        """
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.network_scope = network_scope
        self.network_scope_D = network_scope + "_d"
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]
        self.scopes_d = [self.network_scope_D, scene_scope, task_scope]

        self.local_network = ActorCriticFFNetwork(action_size=ACTION_SIZE,
                                                  device=device,
                                                  network_scope=network_scope,
                                                  scene_scopes=[scene_scope])

        # NOTE(review): the discriminator is built with `network_scope` while
        # its loss is prepared with `scopes_d` (network_scope + "_d");
        # confirm this mismatch is intended.
        self.local_discriminator = Discriminator_WGAN(
            action_size=ACTION_SIZE,
            device=device,
            network_scope=network_scope,
            scene_scopes=[scene_scope])

        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)
        self.local_discriminator.prepare_loss_D(ENTROPY_BETA, self.scopes_d)

        self.trainer = AccumTrainer(device)
        # Separate accumulator instance for the discriminator.
        self.trainer_D = AccumTrainer(device, name="AccumTrainer_d")

        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())
        self.trainer_D.prepare_minimize(self.local_discriminator.total_loss_d,
                                        self.local_discriminator.get_vars())

        # Op copying the current local parameters into the "old" copy.
        self.old_new_sync = self.local_network.sync_curre_old()

        # Policy/value network: accumulate and reset ops.
        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        # Discriminator: accumulate and reset ops.
        self.accum_gradients_d = self.trainer_D.accumulate_gradients()
        self.reset_gradients_d = self.trainer_D.reset_gradients()

        accum_grad_names = [
            self._local_var_name(x)
            for x in self.trainer.get_accum_grad_list()
        ]
        accum_grad_names_discrimi = [
            self._local_var_name(x)
            for x in self.trainer_D.get_accum_grad_list()
        ]

        # Keep only the variables that have a matching gradient accumulator.
        global_net_vars = [
            x for x in global_network.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names
        ]
        local_net_vars = [
            x for x in self.local_network.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names
        ]
        global_discri_vars = [
            x for x in global_discriminator.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names_discrimi
        ]

        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())
        self.apply_gradients_local = grad_applier.apply_gradients_local_net(
            local_net_vars, self.trainer.get_accum_grad_list())

        # Discriminator: gradients go to the global discriminator, whose
        # weights are clipped directly.
        self.apply_gradients_discriminator = grad_applier_discriminator.apply_gradients(
            global_discri_vars, self.trainer_D.get_accum_grad_list())
        self.clip_global_d_weights = global_discriminator.clip_weights()

        # Ops copying the global parameters into the local networks.
        self.sync = self.local_network.sync_from(global_network)
        self.sync_discriminator = self.local_discriminator.sync_from(
            global_discriminator)

        self.env = None  # created lazily on the first process() call
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0
        self.episode_length = 0
        self.episode_max_q = -np.inf

    def _local_var_name(self, var):
        """Return ``var.name`` with its first '/'-separated component removed."""
        return '/'.join(var.name.split('/')[1:])

    def _get_accum_grad_name(self, var):
        """Return the accumulator name that corresponds to ``var``."""
        return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'

    def _anneal_learning_rate(self, global_time_step):
        """Linearly decay the learning rate to 0 at ``max_global_time_step``."""
        time_step_to_go = max(self.max_global_time_step - global_time_step,
                              0.0)
        learning_rate = self.initial_learning_rate * time_step_to_go / self.max_global_time_step
        return learning_rate

    def choose_action(self, pi_values):
        """Sample an action index proportionally to ``pi_values``."""
        cumulative = []
        total = 0.0
        for rate in pi_values:
            total = total + rate
            cumulative.append(total)

        r = random.random() * total
        for i, bound in enumerate(cumulative):
            if bound >= r:
                return i

        # Fail-safe: no cumulative value reached the draw.
        return len(cumulative) - 1

    def _record_score(self, sess, writer, summary_op, placeholders, values,
                      global_t):
        """Evaluate ``summary_op`` with ``values`` and write it at ``global_t``."""
        feed_dict = {}
        for k in placeholders:
            feed_dict[placeholders[k]] = values[k]
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        if VERBOSE:
            print('writing to summary writer at time %d\n' % (global_t))
        writer.add_summary(summary_str, global_t)

    @staticmethod
    def _compute_gae(rewards, values, bootstrap_value, terminal_end,
                     gamma=0.9, lam=0.9):
        """Compute GAE advantages and the matching returns for one rollout.

        Args:
            rewards: per-step rewards, in collection order.
            values: per-step value estimates, in collection order.
            bootstrap_value: value estimate of the state after the last step
                (ignored when ``terminal_end`` is true).
            terminal_end: True if the rollout ended in a terminal state.
            gamma: discount factor.
            lam: GAE lambda.

        Returns:
            ``(returns, advantages)`` as float64 arrays of ``len(rewards)``.
        """
        nsteps = len(rewards)
        returns = np.zeros(nsteps, dtype=np.float64)
        advants = np.zeros(nsteps, dtype=np.float64)
        lastgaelam = 0.0
        for t in reversed(range(nsteps)):
            if t == nsteps - 1:
                # Only the final transition can lead into a terminal state.
                nextnonterminal = 0.0 if terminal_end else 1.0
                nextvalues = bootstrap_value
            else:
                # The rollout breaks at the first terminal step, so every
                # earlier transition is non-terminal.
                nextnonterminal = 1.0
                nextvalues = values[t + 1]
            delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
            advants[t] = lastgaelam = (
                delta + gamma * lam * lastgaelam * nextnonterminal)
            returns[t] = advants[t] + values[t]
        return returns, advants

    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):
        """Collect a rollout of up to ``LOCAL_T_MAX`` steps and train on it.

        Args:
            sess: session used to run the network and gradient ops.
            global_t: current global step (used for annealing and logging).
            summary_writer, summary_op, summary_placeholders: forwarded to
                ``_record_score`` when an episode ends.

        Returns:
            Number of local steps taken during this call.
        """
        if self.env is None:
            # Lazy creation; the sleep staggers worker start-up by thread index.
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })
            self.env.reset()

        # Rollout buffers, in collection order.
        states = []
        actions = []
        rewards = []
        values = []
        targets = []

        terminal_end = False

        # Reset accumulated gradients, then copy the shared weights locally.
        sess.run(self.reset_gradients)
        sess.run(self.sync)

        start_local_t = self.local_t

        for _ in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.env.s_t, self.env.target, self.scopes)

            action = self.choose_action(pi_)

            states.append(self.env.s_t)
            actions.append(action)
            values.append(value_)
            targets.append(self.env.target)

            if VERBOSE and (self.thread_index
                            == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

            self.env.step(action)

            terminal = self.env.terminal

            # Ad-hoc navigation reward; the environment's own reward is not
            # used.  It is computed before the length cap below, so a capped
            # episode still receives the step penalty.
            reward = 10.0 if terminal else -0.01
            if self.episode_length > 5e3:
                terminal = True

            self.episode_reward += reward
            self.episode_length += 1
            self.episode_max_q = max(self.episode_max_q, np.max(value_))

            # Rewards are clipped to [-1, 1] before being stored.
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.env.update()

            if terminal:
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))
                terminal_end = True
                sys.stdout.write(
                    "time %d | thread #%d | scene %s | target #%s\n%s %s episode reward = %.3f\n%s %s episode length = %d\n%s %s episode max Q  = %.3f\n"
                    % (global_t, self.thread_index, self.scene_scope,
                       self.task_scope, self.scene_scope, self.task_scope,
                       self.episode_reward, self.scene_scope, self.task_scope,
                       self.episode_length, self.scene_scope, self.task_scope,
                       self.episode_max_q))

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "episode_max_q_input": self.episode_max_q,
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)
                self.episode_reward = 0
                self.episode_length = 0
                self.episode_max_q = -np.inf
                self.env.reset()

                break

        # TODO: query the discriminator here for a learned reward signal,
        # e.g. R_D = sess.run(D.get_reward(state, action)).

        # Bootstrap value: 0 after a terminal state, otherwise the critic's
        # estimate for the state the rollout stopped in.
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.env.s_t,
                                             self.env.target, self.scopes)

        self.nsteps = len(rewards)

        Returns, Advants = self._compute_gae(rewards, values, R, terminal_end,
                                             gamma=0.9, lam=0.9)

        # Normalise the advantages over the rollout.
        Advants = (Advants - Advants.mean()) / (Advants.std() + 1e-5)

        Returns = Returns.tolist()
        Advants = Advants.tolist()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []
        batch_t = []
        batch_advant = []
        batch_Return = []

        # Walk the rollout backwards to build discounted n-step returns.
        # Every per-step list (targets included) is reversed so that each
        # sample keeps its own state/target pairing.
        for (ai, ri, si, Vi, ti, Re, Ad) in zip(reversed(actions),
                                                reversed(rewards),
                                                reversed(states),
                                                reversed(values),
                                                reversed(targets),
                                                reversed(Returns),
                                                reversed(Advants)):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1  # one-hot action

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)
            batch_t.append(ti)
            batch_advant.append(Ad)
            batch_Return.append(Re)

        # Snapshot the current parameters into the "old" copy before updating.
        sess.run(self.old_new_sync)
        cur_learning_rate = self._anneal_learning_rate(global_t)

        # NOTE(review): the accumulators are not reset between these local
        # steps, so the gradients sent to the global network below are the
        # sum over all three passes.  Kept as in the original.
        for _ in range(3):
            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.t: batch_t,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.local_network.Returns: batch_Return,
                         self.local_network.Advantages: batch_advant
                     })

            sess.run(self.apply_gradients_local,
                     feed_dict={self.learning_rate_input: cur_learning_rate})

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("Local timestep %d\n" % self.local_t)

        # Number of local steps advanced by this call.
        return self.local_t - start_local_t
예제 #8
0
class SmashNetTrainingThread(object):  #Threading th training
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device,
                 initial_diffidence_rate_seed,
                 mode="train",
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task",
                 encourage_symmetry=False):
        """Build the thread-local SmashNet and, in training mode, its gradient ops.

        Args:
            thread_index: index of this worker.
            global_network: shared network the local weights are synced from
                and, in training mode, that receives the gradients.
            initial_learning_rate: learning rate at global step 0.
            learning_rate_input: placeholder fed with the annealed rate.
            grad_applier: object exposing ``apply_gradients(vars, grads)``.
            max_global_time_step: step at which annealed rates reach 0.
            device: device string forwarded to the network and the trainer.
            initial_diffidence_rate_seed: seed of the diffidence-rate decay;
                0 disables it (see ``_anneal_diffidence_rate``).
            mode: the trainer and gradient ops are only built for ``"train"``.
            network_scope, scene_scope, task_scope: scope names identifying
                this worker's network, scene and navigation target.
            encourage_symmetry: stored on the instance for use by ``process``.
        """
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        # ["thread-n", "scene", "target"]
        self.scopes = [network_scope, scene_scope, task_scope]

        # Thread-local policy network.
        self.local_network = SmashNet(
            action_size=ACTION_SIZE,
            device=device,
            network_scope=network_scope,
            scene_scopes=[scene_scope])

        self.local_network.prepare_loss(self.scopes)

        # Compare by value: `is` on a string literal only works by accident
        # of interning and raises a SyntaxWarning on Python 3.8+.
        if mode == "train":
            self.trainer = AccumTrainer(device)
            self.trainer.prepare_minimize(self.local_network.loss,
                                          self.local_network.get_vars())

            self.accum_gradients = self.trainer.accumulate_gradients()
            self.reset_gradients = self.trainer.reset_gradients()

            accum_grad_names = [
                self._local_var_name(x)
                for x in self.trainer.get_accum_grad_list()
            ]
            # Global variables that have a matching gradient accumulator.
            global_net_vars = [
                x for x in global_network.get_vars()
                if self._get_accum_grad_name(x) in accum_grad_names
            ]

            self.apply_gradients = grad_applier.apply_gradients(
                global_net_vars, self.trainer.get_accum_grad_list())

        # Op copying the global weights into the local network.
        self.sync = self.local_network.sync_from(global_network)

        self.env = None  # created lazily by process()

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_length = 0
        self.episode_pi_sim = 0
        self.episode_loss = 0

        self.initial_diffidence_rate_seed = initial_diffidence_rate_seed

        self.oracle = None  # created lazily by process()
        self.mode = mode
        self.encourage_symmetry = encourage_symmetry

    def _local_var_name(self, var):
        return '/'.join(var.name.split('/')[1:])

    def _get_accum_grad_name(self, var):
        return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'

    def _anneal_rate(self, init_rate, global_time_step):
        time_step_to_go = max(self.max_global_time_step - global_time_step,
                              0.0)
        rate = init_rate * time_step_to_go / self.max_global_time_step
        return rate

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self._anneal_rate(self.initial_learning_rate,
                                          global_time_step)
        return learning_rate

    def _inverse_sigmoid_decay_rate(self, init_rate_seed, global_time_step):
        rate = init_rate_seed * np.exp(-global_time_step / init_rate_seed)
        rate = rate / (1. + rate)
        return rate

    def _anneal_diffidence_rate(self, global_time_step):
        if self.initial_diffidence_rate_seed == 0: return 0
        else:
            return self._inverse_sigmoid_decay_rate(
                self.initial_diffidence_rate_seed, global_time_step)

    # TODO: check
    def choose_action(self, smashnet_pi_values, oracle_pi_values,
                      confidence_rate):  #can change the action to take

        r = random.random()
        if r < confidence_rate: pi_values = oracle_pi_values
        else: pi_values = smashnet_pi_values

        r = random.random() * np.sum(pi_values)
        values = np.cumsum(pi_values)
        for i in range(len(values)):
            if values[i] >= r: return i

    def _record_score(self, sess, writer, summary_op, placeholders, values,
                      global_t):
        feed_dict = {}
        for k in placeholders:
            feed_dict[placeholders[k]] = values[k]
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        writer.add_summary(summary_str, global_t)
        # writer.flush()

    def _evaluate(self, sess, list_of_tasks, num_episodes, max_steps,
                  success_cutoff):

        scene_scopes = list_of_tasks.keys()
        results = {}

        for scene_scope in scene_scopes:

            for task_scope in list_of_tasks[scene_scope]:

                env = Environment({
                    'scene_name': scene_scope,
                    'terminal_state_id': int(task_scope)
                })
                ep_lengths = []
                ep_collisions = []
                oracle_lengths = []
                ep_successes = []

                scopes = [self.network_scope, scene_scope, task_scope]

                for i_episode in range(num_episodes):

                    env.reset()
                    oracle_lengths.append(env.shortest_path_distances[
                        env.current_state_id][env.terminal_state_id])

                    terminal = False
                    ep_length = 0
                    ep_collision = 0

                    while not terminal:

                        pi_values = self.local_network.run_policy(
                            sess, env.s_t, env.target, scopes)
                        action = sample_action(pi_values)
                        env.step(action)
                        env.update()

                        terminal = env.terminal
                        if ep_length == max_steps: break
                        if env.collided: ep_collision += 1
                        ep_length += 1

                    ep_lengths.append(ep_length)
                    ep_collisions.append(ep_collision)
                    ep_successes.append(int(ep_length < success_cutoff))

                results[scene_scope + task_scope] = [
                    np.mean(ep_lengths),
                    np.mean(ep_collisions),
                    np.mean(oracle_lengths),
                    np.mean(ep_successes)
                ]

        return results

    def _flip_policy(self, policy):

        flipped_policy = np.array([policy[3], policy[2], policy[1], policy[0]])
        return flipped_policy

    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):
        """Roll out up to LOCAL_T_MAX steps and, in train mode, apply one update.

        Every step queries the local network and the shortest-path oracle,
        picks an action with ``choose_action`` and records the
        (state, target, oracle policy) triple used for the update.  The rollout
        stops early when the episode ends; the episode summary is then written
        through ``_record_score`` and the environment is reset.

        Args:
            sess: session object used for every ``run`` call.
            global_t: global time step; drives the diffidence and learning-rate
                annealing, the validation schedule and the summary step.
            summary_writer: forwarded to ``_record_score``.
            summary_op: forwarded to ``_record_score``.
            summary_placeholders: forwarded to ``_record_score``.

        Returns:
            Number of local steps taken during this call.
        """
        if self.env is None:
            # Lazy construction; the sleep staggers start-up by thread index.
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })
            self.env.reset()
            self.oracle = ShortestPathOracle(self.env, ACTION_SIZE)

        # Compare by value: identity checks against string literals only work
        # by accident of interning.
        training = self.mode == "train"
        validating = self.mode == "val"

        states = []
        targets = []
        oracle_pis = []  # oracle (expert) policy recorded for each step

        if training:
            # reset accumulated gradients
            sess.run(self.reset_gradients)
            # copy weights from shared to local
            sess.run(self.sync)

        start_local_t = self.local_t

        # global_t does not change during this call, so the validation check
        # is evaluated once instead of once per rollout step.
        if (VALIDATE and global_t % VALIDATE_FREQUENCY == 0 and global_t > 0
                and self.thread_index == 0):
            results = self._evaluate(sess,
                                     list_of_tasks=VALID_TASK_LIST,
                                     num_episodes=NUM_VAL_EPISODES,
                                     max_steps=MAX_VALID_STEPS,
                                     success_cutoff=SUCCESS_CUTOFF)
            print("Thread %d" % (self.thread_index))
            print("Validation results: %s" % (results))

        # Also a function of global_t only.
        diffidence_rate = self._anneal_diffidence_rate(global_t)

        # t_max times loop
        for _ in range(LOCAL_T_MAX):

            flipped_run = self.encourage_symmetry and np.random.random() > 0.5

            # In a flipped run the observation and the target swap roles.
            if flipped_run:
                s_t, g = self.env.target, self.env.s_t
            else:
                s_t, g = self.env.s_t, self.env.target

            smashnet_pi = self.local_network.run_policy(
                sess, s_t, g, self.scopes)
            if flipped_run:
                smashnet_pi = self._flip_policy(smashnet_pi)

            oracle_pi = self.oracle.run_policy(self.env.current_state_id)

            action = self.choose_action(smashnet_pi, oracle_pi,
                                        diffidence_rate)

            states.append(s_t)
            targets.append(g)
            # The stored oracle policy must match the (possibly swapped)
            # state/target pair it is trained against.
            if flipped_run:
                oracle_pis.append(self._flip_policy(oracle_pi))
            else:
                oracle_pis.append(oracle_pi)

            self.env.step(action)

            is_terminal = self.env.terminal or self.episode_length > 5e3
            if validating and self.episode_length > 1e3:
                is_terminal = True

            self.episode_length += 1
            self.episode_pi_sim += 1. - cosine(smashnet_pi, oracle_pi)

            self.local_t += 1

            # s_t1 -> s_t
            self.env.update()

            if is_terminal:
                if validating:
                    sess.run(self.sync)
                    sys.stdout.write(
                        "time %d | thread #%d | scene %s | target %s | episode length = %d\n"
                        % (global_t, self.thread_index, self.scene_scope,
                           self.task_scope, self.episode_length))

                summary_values = {
                    "episode_length_input":
                    float(self.episode_length),
                    "episode_pi_sim_input":
                    self.episode_pi_sim / float(self.episode_length),
                    "episode_loss_input":
                    float(self.episode_loss)
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)
                self.episode_length = 0
                self.episode_pi_sim = 0
                self.episode_loss = 0
                self.env.reset()

                # Only an episode end cuts the rollout short; otherwise the
                # loop runs the full LOCAL_T_MAX steps.
                break

        if training:
            # All three lists are reversed together so that row i of every
            # input still belongs to the same step (targets differ per step
            # whenever flipped runs occur).
            feed_dict = {
                self.local_network.s: states[::-1],
                self.local_network.t: targets[::-1],
                self.local_network.opi: oracle_pis[::-1]
            }

            # compute and accumulate gradients
            sess.run(self.accum_gradients, feed_dict=feed_dict)

            self.episode_loss += sum(
                sess.run(self.local_network.loss, feed_dict=feed_dict))

            cur_learning_rate = self._anneal_learning_rate(global_t)
            sess.run(self.apply_gradients,
                     feed_dict={self.learning_rate_input: cur_learning_rate})

        # return advanced local step size
        return self.local_t - start_local_t
예제 #9
0
class A3CTrainingThread(object):
    """Worker that rolls out a thread-local actor-critic network.

    Each call to ``process`` copies the global weights into the local network,
    collects up to LOCAL_T_MAX environment steps, accumulates the gradients of
    the local loss and applies them to the matching global variables.  A new
    terminal state id is drawn at random for every episode.
    """

    # Exclusive upper bound for the randomly drawn terminal state id.
    # NOTE(review): hard-coded value; presumably the number of states in the
    # scene -- confirm against the Environment data.
    NUM_TERMINAL_STATES = 468

    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device,
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task"):
        """Build the local network and the accumulate/apply/sync operations.

        Args:
            thread_index: index of this worker; used to stagger start-up and
                to limit verbose logging to worker 0.
            global_network: network whose variables receive the gradients and
                from which the local network is synced.
            initial_learning_rate: learning rate at global step 0.
            learning_rate_input: placeholder fed with the annealed rate.
            grad_applier: object providing ``apply_gradients(vars, grads)``.
            max_global_time_step: step at which the learning rate reaches 0.
            device: device passed to the network and the trainer.
            network_scope: scope name of the local network.
            scene_scope: scene name; also passed to the Environment.
            task_scope: task name; used for scoping and log output only.
        """
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]
        # Terminal state id of the current episode; redrawn for each episode.
        self.task_scope_name = 1

        self.local_network = ActorCriticFFNetwork(
            action_size=ACTION_SIZE,
            device=device,
            network_scope=network_scope,
            scene_scopes=[scene_scope])
        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        # Keep only the global variables that have a matching accumulator.
        accum_grad_list = self.trainer.get_accum_grad_list()
        accum_grad_names = [self._local_var_name(x) for x in accum_grad_list]
        global_net_vars = [
            x for x in global_network.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names
        ]

        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        # Created lazily on the first call to process().
        self.env = None

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0
        self.episode_length = 0
        self.episode_max_q = -np.inf

    def _local_var_name(self, var):
        """Return ``var.name`` without its first '/'-separated component."""
        return '/'.join(var.name.split('/')[1:])

    def _get_accum_grad_name(self, var):
        """Return the accumulator name that corresponds to ``var``."""
        return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'

    def _anneal_learning_rate(self, global_time_step):
        """Linearly decay the learning rate to 0 at ``max_global_time_step``."""
        time_step_to_go = max(self.max_global_time_step - global_time_step, 0.0)
        return (self.initial_learning_rate * time_step_to_go
                / self.max_global_time_step)

    def choose_action(self, pi_values):
        """Sample an index with probability proportional to ``pi_values``.

        Args:
            pi_values: iterable of non-negative weights (need not sum to 1).

        Returns:
            The first index whose cumulative weight reaches the random draw;
            the last index if rounding prevents any from matching (-1 for an
            empty input).
        """
        cumulative = []
        total = 0.0
        for rate in pi_values:
            total = total + rate
            cumulative.append(total)

        r = random.random() * total
        for index, upper in enumerate(cumulative):
            if upper >= r:
                return index

        # fail safe
        return len(cumulative) - 1

    def _record_score(self, sess, writer, summary_op, placeholders, values,
                      global_t):
        """Evaluate ``summary_op`` with ``values`` and write it at ``global_t``.

        ``placeholders`` and ``values`` are dicts sharing the same keys.
        """
        feed_dict = {placeholders[k]: values[k] for k in placeholders}
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        if VERBOSE:
            print('writing to summary writer at time %d\n' % (global_t))
        writer.add_summary(summary_str, global_t)
        # writer.flush()

    def _new_environment(self):
        """Draw a random terminal state id and build a fresh, reset Environment."""
        self.task_scope_name = random.randint(1, self.NUM_TERMINAL_STATES) - 1
        self.env = Environment({
            'scene_name': self.scene_scope,
            'terminal_state_id': self.task_scope_name
        })
        self.env.reset()

    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):
        """Run one rollout of up to LOCAL_T_MAX steps and apply one update.

        Args:
            sess: session object used for every ``run`` call.
            global_t: global time step; drives learning-rate annealing and is
                the step under which summaries are written.
            summary_writer: forwarded to ``_record_score``.
            summary_op: forwarded to ``_record_score``.
            summary_placeholders: forwarded to ``_record_score``.

        Returns:
            Number of local steps taken during this call.

        Raises:
            NotImplementedError: if USE_LSTM is set.
        """
        if USE_LSTM:
            # The LSTM gradient feed needs a start-of-rollout LSTM state that
            # this class never captures, and __init__ builds a feed-forward
            # network.  Fail with a clear message before stepping the
            # environment instead of a NameError after the rollout.
            raise NotImplementedError(
                "USE_LSTM is not supported by A3CTrainingThread: no initial "
                "LSTM state is tracked for the rollout")

        if self.env is None:
            # lazy evaluation; the sleep staggers start-up by thread index
            time.sleep(self.thread_index * 1.0)
            self._new_environment()

        states = []
        actions = []
        rewards = []
        values = []
        targets = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        # t_max times loop
        for _ in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.env.s_t, self.env.target, self.scopes)
            action = self.choose_action(pi_)

            states.append(self.env.s_t)
            actions.append(action)
            values.append(value_)
            targets.append(self.env.target)

            if VERBOSE and (self.thread_index == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

            # process game
            self.env.step(action)

            terminal = self.env.terminal

            # ad-hoc reward for navigation (the environment's own reward is
            # not used)
            reward = 10.0 if terminal else -0.01
            if self.episode_length > 5e3:
                terminal = True

            self.episode_reward += reward
            self.episode_length += 1
            self.episode_max_q = max(self.episode_max_q, np.max(value_))

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.env.update()

            if terminal:
                terminal_end = True
                print('----------')
                print('real terminal id is {}'.format(self.task_scope_name))
                sys.stdout.write(
                    "time %d | thread #%d | scene %s | target #%s\n"
                    "%s %s episode reward = %.3f\n"
                    "%s %s episode length = %d\n"
                    "%s %s episode max Q  = %.3f\n"
                    % (global_t, self.thread_index, self.scene_scope,
                       self.task_scope, self.scene_scope, self.task_scope,
                       self.episode_reward, self.scene_scope, self.task_scope,
                       self.episode_length, self.scene_scope, self.task_scope,
                       self.episode_max_q))

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "episode_max_q_input": self.episode_max_q,
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)
                self.episode_reward = 0
                self.episode_length = 0
                self.episode_max_q = -np.inf
                # Next episode gets a new random terminal state.
                self._new_environment()
                print('init id is {}'.format(self.env.current_state_id))
                print('----------')
                break

        # Bootstrap the return from the value estimate unless the episode ended.
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.env.s_t,
                                             self.env.target, self.scopes)

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []
        batch_t = []

        # Walk the rollout newest-first so R accumulates the discounted return.
        # Every list is reversed, so each row keeps the state, action, value
        # and target of one and the same step.
        newest_first = zip(reversed(actions), reversed(rewards),
                           reversed(states), reversed(values),
                           reversed(targets))
        for ai, ri, si, Vi, ti in newest_first:
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1  # one-hot encoding of the chosen action

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)
            batch_t.append(ti)

        # compute and accumulate gradients
        sess.run(self.accum_gradients,
                 feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.t: batch_t,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R
                 })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("Local timestep %d\n" % self.local_t)

        # return advanced local step size
        return self.local_t - start_local_t
예제 #10
0
class SmashNetTrainingThread(object):
    """Worker that trains (or validates) a local SmashNet against an oracle.

    In ``"train"`` mode each call to ``process`` syncs the local network from
    the global one, rolls out up to LOCAL_T_MAX steps, accumulates gradients
    of the network loss on the recorded (state, target, oracle policy)
    triples and applies them to the matching global variables.  In any other
    mode no trainer is built; ``"val"`` mode only re-syncs and logs at episode
    end.
    """

    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device,
                 initial_diffidence_rate_seed,
                 mode="train",
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task",
                 encourage_symmetry=False):
        """Build the local network and, in train mode, the gradient operations.

        Args:
            thread_index: index of this worker; staggers start-up and selects
                the worker that runs validation (index 0).
            global_network: network that is synced from and updated.
            initial_learning_rate: learning rate at global step 0.
            learning_rate_input: placeholder fed with the annealed rate.
            grad_applier: object providing ``apply_gradients(vars, grads)``.
            max_global_time_step: step at which linear annealing reaches 0.
            device: device passed to the network and the trainer.
            initial_diffidence_rate_seed: seed ``k`` of the inverse-sigmoid
                decay of the diffidence rate; 0 disables oracle actions.
            mode: ``"train"`` or ``"val"``.
            network_scope: scope name of the local network.
            scene_scope: scene name passed to the Environment.
            task_scope: terminal state id as a string (converted with ``int``).
            encourage_symmetry: if true, roughly half of the steps are run
                with state and target swapped.
        """
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        # ["thread-n", "scene", "target"]
        self.scopes = [network_scope, scene_scope, task_scope]

        self.local_network = SmashNet(
            action_size=ACTION_SIZE,
            device=device,
            network_scope=network_scope,
            scene_scopes=[scene_scope])
        self.local_network.prepare_loss(self.scopes)

        # Compare by value; identity checks on string literals are unreliable.
        if mode == "train":
            self.trainer = AccumTrainer(device)
            self.trainer.prepare_minimize(self.local_network.loss,
                                          self.local_network.get_vars())

            self.accum_gradients = self.trainer.accumulate_gradients()
            self.reset_gradients = self.trainer.reset_gradients()

            # Keep only the global variables that have a matching accumulator.
            accum_grad_names = [
                self._local_var_name(x)
                for x in self.trainer.get_accum_grad_list()
            ]
            global_net_vars = [
                x for x in global_network.get_vars()
                if self._get_accum_grad_name(x) in accum_grad_names
            ]

            self.apply_gradients = grad_applier.apply_gradients(
                global_net_vars, self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        # Environment and oracle are created lazily in process().
        self.env = None
        self.oracle = None

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_length = 0
        self.episode_pi_sim = 0
        self.episode_loss = 0

        self.initial_diffidence_rate_seed = initial_diffidence_rate_seed

        self.mode = mode
        self.encourage_symmetry = encourage_symmetry

    def _local_var_name(self, var):
        """Return ``var.name`` without its first '/'-separated component."""
        return '/'.join(var.name.split('/')[1:])

    def _get_accum_grad_name(self, var):
        """Return the accumulator name that corresponds to ``var``."""
        return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'

    def _anneal_rate(self, init_rate, global_time_step):
        """Linearly decay ``init_rate`` to 0 at ``max_global_time_step``."""
        time_step_to_go = max(self.max_global_time_step - global_time_step, 0.0)
        return init_rate * time_step_to_go / self.max_global_time_step

    def _anneal_learning_rate(self, global_time_step):
        """Return the linearly annealed learning rate for ``global_time_step``."""
        return self._anneal_rate(self.initial_learning_rate, global_time_step)

    def _inverse_sigmoid_decay_rate(self, init_rate_seed, global_time_step):
        """Return ``k / (k + exp(t / k))`` for seed ``k`` and step ``t``.

        ``init_rate_seed`` must be non-zero.
        """
        # float() keeps the exponent a true division under Python 2 as well.
        rate = init_rate_seed * np.exp(
            -global_time_step / float(init_rate_seed))
        return rate / (1. + rate)

    def _anneal_diffidence_rate(self, global_time_step):
        """Return the probability of acting on the oracle policy at this step."""
        if self.initial_diffidence_rate_seed == 0:
            return 0
        return self._inverse_sigmoid_decay_rate(
            self.initial_diffidence_rate_seed, global_time_step)

    # TODO: check
    def choose_action(self, smashnet_pi_values, oracle_pi_values,
                      confidence_rate):
        """Sample an action index from the oracle or the network policy.

        Args:
            smashnet_pi_values: action weights produced by the network.
            oracle_pi_values: action weights produced by the oracle.
            confidence_rate: probability of sampling from the oracle weights
                (``process`` passes the annealed diffidence rate here).

        Returns:
            The first index whose cumulative weight reaches the random draw,
            or the last index when none does (rounding or NaN weights).
        """
        if random.random() < confidence_rate:
            pi_values = oracle_pi_values
        else:
            pi_values = smashnet_pi_values

        r = random.random() * np.sum(pi_values)
        cumulative = np.cumsum(pi_values)
        for index, upper in enumerate(cumulative):
            if upper >= r:
                return index

        # fail safe: never hand None to the environment
        return len(cumulative) - 1

    def _record_score(self, sess, writer, summary_op, placeholders, values,
                      global_t):
        """Evaluate ``summary_op`` with ``values`` and write it at ``global_t``.

        ``placeholders`` and ``values`` are dicts sharing the same keys.
        """
        feed_dict = {placeholders[k]: values[k] for k in placeholders}
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        writer.add_summary(summary_str, global_t)
        # writer.flush()

    def _evaluate(self, sess, list_of_tasks, num_episodes, max_steps,
                  success_cutoff):
        """Run evaluation episodes of the local network on every listed task.

        Args:
            sess: session object used to run the policy.
            list_of_tasks: dict mapping a scene name to its task ids (strings).
            num_episodes: episodes to run per (scene, task) pair.
            max_steps: an episode is abandoned once its length equals this.
            success_cutoff: an episode counts as a success when its length is
                strictly below this value.

        Returns:
            Dict keyed by ``scene + task`` with
            ``[mean length, mean collisions, mean oracle length, success rate]``.
        """
        results = {}

        for scene_scope in list_of_tasks.keys():
            for task_scope in list_of_tasks[scene_scope]:

                env = Environment({
                    'scene_name': scene_scope,
                    'terminal_state_id': int(task_scope)
                })
                ep_lengths = []
                ep_collisions = []
                oracle_lengths = []
                ep_successes = []

                scopes = [self.network_scope, scene_scope, task_scope]

                for _ in range(num_episodes):
                    env.reset()
                    oracle_lengths.append(
                        env.shortest_path_distances[env.current_state_id][
                            env.terminal_state_id])

                    terminal = False
                    ep_length = 0
                    ep_collision = 0

                    while not terminal:
                        pi_values = self.local_network.run_policy(
                            sess, env.s_t, env.target, scopes)
                        action = sample_action(pi_values)
                        env.step(action)
                        env.update()

                        terminal = env.terminal
                        if ep_length == max_steps:
                            break
                        if env.collided:
                            ep_collision += 1
                        ep_length += 1

                    ep_lengths.append(ep_length)
                    ep_collisions.append(ep_collision)
                    ep_successes.append(int(ep_length < success_cutoff))

                results[scene_scope + task_scope] = [
                    np.mean(ep_lengths),
                    np.mean(ep_collisions),
                    np.mean(oracle_lengths),
                    np.mean(ep_successes)
                ]

        return results

    def _flip_policy(self, policy):
        """Return the first four entries of ``policy`` in reverse order."""
        return np.array([policy[3], policy[2], policy[1], policy[0]])

    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):
        """Roll out up to LOCAL_T_MAX steps and, in train mode, apply one update.

        Args:
            sess: session object used for every ``run`` call.
            global_t: global time step; drives the diffidence and learning-rate
                annealing, the validation schedule and the summary step.
            summary_writer: forwarded to ``_record_score``.
            summary_op: forwarded to ``_record_score``.
            summary_placeholders: forwarded to ``_record_score``.

        Returns:
            Number of local steps taken during this call.
        """
        if self.env is None:
            # lazy evaluation; the sleep staggers start-up by thread index
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })
            self.env.reset()
            self.oracle = ShortestPathOracle(self.env, ACTION_SIZE)

        training = self.mode == "train"
        validating = self.mode == "val"

        states = []
        targets = []
        oracle_pis = []

        if training:
            # reset accumulated gradients
            sess.run(self.reset_gradients)
            # copy weights from shared to local
            sess.run(self.sync)

        start_local_t = self.local_t

        # global_t is fixed for the whole call, so validation runs at most
        # once per call instead of once per rollout step.
        if (VALIDATE and global_t % VALIDATE_FREQUENCY == 0 and global_t > 0
                and self.thread_index == 0):
            results = self._evaluate(sess,
                                     list_of_tasks=VALID_TASK_LIST,
                                     num_episodes=NUM_VAL_EPISODES,
                                     max_steps=MAX_VALID_STEPS,
                                     success_cutoff=SUCCESS_CUTOFF)
            print("Thread %d" % (self.thread_index))
            print("Validation results: %s" % (results))

        # Also a function of global_t only.
        diffidence_rate = self._anneal_diffidence_rate(global_t)

        # t_max times loop
        for _ in range(LOCAL_T_MAX):

            flipped_run = self.encourage_symmetry and np.random.random() > 0.5

            # In a flipped run the observation and the target swap roles.
            if flipped_run:
                s_t, g = self.env.target, self.env.s_t
            else:
                s_t, g = self.env.s_t, self.env.target

            smashnet_pi = self.local_network.run_policy(sess, s_t, g,
                                                        self.scopes)
            if flipped_run:
                smashnet_pi = self._flip_policy(smashnet_pi)

            oracle_pi = self.oracle.run_policy(self.env.current_state_id)

            action = self.choose_action(smashnet_pi, oracle_pi,
                                        diffidence_rate)

            states.append(s_t)
            targets.append(g)
            # The stored oracle policy must match the (possibly swapped)
            # state/target pair it is trained against.
            if flipped_run:
                oracle_pis.append(self._flip_policy(oracle_pi))
            else:
                oracle_pis.append(oracle_pi)

            self.env.step(action)

            is_terminal = self.env.terminal or self.episode_length > 5e3
            if validating and self.episode_length > 1e3:
                is_terminal = True

            self.episode_length += 1
            self.episode_pi_sim += 1. - cosine(smashnet_pi, oracle_pi)

            self.local_t += 1

            # s_t1 -> s_t
            self.env.update()

            if is_terminal:
                if validating:
                    sess.run(self.sync)
                    sys.stdout.write(
                        "time %d | thread #%d | scene %s | target %s | episode length = %d\n"
                        % (global_t, self.thread_index, self.scene_scope,
                           self.task_scope, self.episode_length))

                summary_values = {
                    "episode_length_input": float(self.episode_length),
                    "episode_pi_sim_input":
                    self.episode_pi_sim / float(self.episode_length),
                    "episode_loss_input": float(self.episode_loss)
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)
                self.episode_length = 0
                self.episode_pi_sim = 0
                self.episode_loss = 0
                self.env.reset()

                break

        if training:
            # All three lists are reversed together so that row i of every
            # input still belongs to the same step (targets differ per step
            # whenever flipped runs occur).
            feed_dict = {
                self.local_network.s: states[::-1],
                self.local_network.t: targets[::-1],
                self.local_network.opi: oracle_pis[::-1]
            }

            # compute and accumulate gradients
            sess.run(self.accum_gradients, feed_dict=feed_dict)

            self.episode_loss += sum(
                sess.run(self.local_network.loss, feed_dict=feed_dict))

            cur_learning_rate = self._anneal_learning_rate(global_t)
            sess.run(self.apply_gradients,
                     feed_dict={self.learning_rate_input: cur_learning_rate})

        # return advanced local step size
        return self.local_t - start_local_t