Example #1
    def __init__(self, cluster, server, worker_tasks):
        self.cluster = cluster
        self.server = server
        self.worker_tasks = worker_tasks
        self.num_workers = len(worker_tasks)
        self.aggregated_states = []
        self.aggregated_actions = []
        self.max_eps = 1000
        self.checkpoint_delta = 10
        self.checkpoint = self.checkpoint_delta
        self.learn_rate = 0.01
        self.regularization_lambda = 1e-4
        self.train_step = 0

        self.state_dim = Sender.state_dim
        self.action_cnt = Sender.action_cnt
        self.aug_state_dim = self.state_dim + self.action_cnt

        # Create the master network and training/sync queues
        with tf.variable_scope('global'):
            self.global_network = DaggerLSTM(state_dim=self.aug_state_dim,
                                             action_cnt=self.action_cnt)

        self.leader_device_cpu = '/job:ps/task:0/cpu:0'
        with tf.device(self.leader_device_cpu):
            with tf.variable_scope('global_cpu'):
                self.global_network_cpu = DaggerLSTM(
                    state_dim=self.aug_state_dim, action_cnt=self.action_cnt)

        cpu_vars = self.global_network_cpu.trainable_vars
        gpu_vars = self.global_network.trainable_vars
        self.sync_op = tf.group(
            *[v1.assign(v2) for v1, v2 in zip(cpu_vars, gpu_vars)])

        self.default_batch_size = 300
        self.default_init_state = self.global_network.zero_init_state(
            self.default_batch_size)

        # Each element is [[aug_state]], [action]
        self.train_q = tf.FIFOQueue(self.num_workers, [tf.float32, tf.int32],
                                    shared_name='training_feed')

        # Keys: worker indices, values: Tensorflow messaging queues
        # Queue Elements: Status message
        self.sync_queues = {}
        for idx in worker_tasks:
            queue_name = 'sync_q_%d' % idx
            self.sync_queues[idx] = tf.FIFOQueue(3, [tf.int16],
                                                 shared_name=queue_name)

        self.setup_tf_ops(server)

        self.sess = tf.Session(
            server.target, config=tf.ConfigProto(allow_soft_placement=True))
        self.sess.run(tf.global_variables_initializer())
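
For reference, the sync_op above is nothing more than a group of per-variable assign ops that copies the trained 'global' (GPU) weights into the 'global_cpu' mirror that workers read from. A minimal sketch of the same pattern, assuming TensorFlow 1.x graph mode (as in these snippets) and two toy variable scopes instead of the real DaggerLSTM networks:

import tensorflow as tf

with tf.variable_scope('global'):
    gpu_vars = [tf.get_variable('w', initializer=tf.constant([1.0, 2.0]))]
with tf.variable_scope('global_cpu'):
    cpu_vars = [tf.get_variable('w', initializer=tf.constant([0.0, 0.0]))]

# One grouped op that assigns every GPU variable to its CPU mirror.
sync_op = tf.group(*[cpu.assign(gpu) for cpu, gpu in zip(cpu_vars, gpu_vars)])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(sync_op)
    print(sess.run(cpu_vars[0]))  # -> [1. 2.]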
Example #2
class Learner(object):
    def __init__(self, state_dim, action_cnt, restore_vars):
        self.aug_state_dim = state_dim + action_cnt
        self.action_cnt = action_cnt
        self.prev_action = action_cnt - 1

        with tf.variable_scope('global'):
            self.model = DaggerLSTM(state_dim=self.aug_state_dim,
                                    action_cnt=action_cnt)

        self.lstm_state = self.model.zero_init_state(1)

        self.sess = tf.Session()

        # restore saved variables
        saver = tf.train.Saver(self.model.trainable_vars)
        saver.restore(self.sess, restore_vars)

        # init the remaining vars, especially those created by optimizer
        uninit_vars = set(tf.global_variables())
        uninit_vars -= set(self.model.trainable_vars)
        self.sess.run(tf.variables_initializer(uninit_vars))

        self.log = open(
            '/home/eric/Dev/DRL-IL/pantheon/third_party/indigo/logs.txt', 'w')

    def sample_action(self, state):
        norm_state = normalize(state)

        one_hot_action = one_hot(self.prev_action, self.action_cnt)
        aug_state = norm_state + one_hot_action

        btime = time.time()
        # Get probability of each action from the local network.
        pi = self.model
        feed_dict = {
            pi.input: [[aug_state]],
            pi.state_in: self.lstm_state,
        }
        ops_to_run = [pi.action_probs, pi.state_out]
        action_probs, self.lstm_state = self.sess.run(ops_to_run, feed_dict)

        # Choose an action to take
        action = np.argmax(action_probs[0][0])
        self.prev_action = action

        info = 'chose action {} (state[3] = {}) in {:.6f}s\n'.format(
            action, state[3], time.time() - btime)
        self.log.write(info)

        # action = np.argmax(np.random.multinomial(1, action_probs[0] - 1e-5))
        # temperature = 1.0
        # temp_probs = softmax(action_probs[0] / temperature)
        # action = np.argmax(np.random.multinomial(1, temp_probs - 1e-5))
        return action, aug_state
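
The aug_state used throughout these examples is simply the normalized state vector concatenated with a one-hot encoding of the previous action. A small self-contained sketch of that augmentation; normalize and one_hot here are hypothetical stand-ins for the repo's helper functions, and the feature scales and sizes (state_dim = 4, action_cnt = 5) are assumptions:

def one_hot(action, action_cnt):
    # 1.0 at the chosen action index, 0.0 elsewhere
    vec = [0.0] * action_cnt
    vec[action] = 1.0
    return vec

def normalize(state):
    # Hypothetical normalization: divide each feature by an assumed scale.
    scales = [200.0, 200.0, 200.0, 25000.0]
    return [s / m for s, m in zip(state, scales)]

state = [40.0, 12.0, 14.0, 30.0]  # e.g. delay ewma, delivery rate, send rate, cwnd
prev_action = 2
aug_state = normalize(state) + one_hot(prev_action, 5)
print(len(aug_state))  # aug_state_dim = state_dim + action_cnt = 9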
Example #3
    def __init__(self, sender, state_dim, restore_vars):
        self.aug_state_dim = state_dim + 1  #action_cnt
        self.prev_action = 0
        self.sender = sender
        with tf.variable_scope('global'):
            self.model = DaggerLSTM(state_dim=self.aug_state_dim,
                                    dwnd=Sender.dwnd)

        self.lstm_state = self.model.zero_init_state(1)

        self.sess = tf.Session()

        # restore saved variables
        saver = tf.train.Saver(self.model.trainable_vars)
        saver.restore(self.sess, restore_vars)

        # init the remaining vars, especially those created by optimizer
        uninit_vars = set(tf.global_variables())
        uninit_vars -= set(self.model.trainable_vars)
        self.sess.run(tf.variables_initializer(uninit_vars))
Example #4
class Learner(object):
    def __init__(self, sender, state_dim, restore_vars):
        self.aug_state_dim = state_dim + 1  #action_cnt
        self.prev_action = 0
        self.sender = sender
        with tf.variable_scope('global'):
            self.model = DaggerLSTM(state_dim=self.aug_state_dim,
                                    dwnd=Sender.dwnd)

        self.lstm_state = self.model.zero_init_state(1)

        self.sess = tf.Session()

        # restore saved variables
        saver = tf.train.Saver(self.model.trainable_vars)
        saver.restore(self.sess, restore_vars)

        # init the remaining vars, especially those created by optimizer
        uninit_vars = set(tf.global_variables())
        uninit_vars -= set(self.model.trainable_vars)
        self.sess.run(tf.variables_initializer(uninit_vars))

    def policy(self, state):
        """ Given a state buffer in the past step, returns an action
        to perform.

        Appends to the state/action buffers the state and the
        "correct" action to take according to the expert.
        """

        aug_state = state + [self.prev_action]
        self.sender.update_decision_window(aug_state)

        # Get the model's action outputs for the decision window.
        pi = self.model
        feed_dict = {
            pi.input: [self.sender.decision_window],
            pi.state_in: self.lstm_state,
        }
        ops_to_run = [pi.actions, pi.state_out]
        actions, self.lstm_state = self.sess.run(ops_to_run, feed_dict)

        # Choose an action to take and update current LSTM state
        if len(self.sender.decision_window) <= 1:
            action = actions
        else:
            action = actions[-1]
        # print("actions shape:" + str(actions.shape))
        # print("in policy(): action is: " + str(action))
        self.prev_action = action

        return action
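
Example #4 feeds the model the sender's whole decision window and acts on the last timestep's output. A rough sketch of what a bounded decision window could look like, assuming a hypothetical maximum length; the real Sender.update_decision_window may behave differently:

from collections import deque

class DecisionWindow(object):
    """Keeps only the most recent augmented states for the model to consume."""

    def __init__(self, max_len=20):  # assumed window length
        self.buf = deque(maxlen=max_len)

    def update(self, aug_state):
        self.buf.append(aug_state)

    def as_model_input(self):
        return list(self.buf)

window = DecisionWindow()
window.update([0.1, 0.2, 0.3, 0.4, 0])
window.update([0.2, 0.2, 0.3, 0.5, 1])
# Feed the whole window; act on the last timestep's output, as policy() does.
print(window.as_model_input()[-1])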
Example #5
class Learner(object):
    def __init__(self, state_dim, action_cnt, restore_vars):
        self.aug_state_dim = state_dim + action_cnt
        self.action_cnt = action_cnt
        self.prev_action = action_cnt - 1

        with tf.variable_scope('global'):
            self.model = DaggerLSTM(
                state_dim=self.aug_state_dim, action_cnt=action_cnt)

        self.lstm_state = self.model.zero_init_state(1)

        self.sess = tf.Session()

        logging.basicConfig(level=logging.WARNING, filename="/home/zyk/state.log")
        self.logger = logging.getLogger("state")

        # restore saved variables
        saver = tf.train.Saver(self.model.trainable_vars)
        saver.restore(self.sess, restore_vars)

        # init the remaining vars, especially those created by optimizer
        uninit_vars = set(tf.global_variables())
        uninit_vars -= set(self.model.trainable_vars)
        self.sess.run(tf.variables_initializer(uninit_vars))

    def sample_action(self, state):
        norm_state = normalize(state)

        one_hot_action = one_hot(self.prev_action, self.action_cnt)
        aug_state = norm_state + one_hot_action
        #  debug
        # print("entry")
        # print("dagger-runsender: aug_state: " + str(aug_state))
        #  debug
        # Get probability of each action from the local network.
        pi = self.model
        feed_dict = {
            pi.input: [[aug_state]],
            pi.state_in: self.lstm_state,
        }
        #debug
        self.logger.warning("RUN_SENDER: aug_state is: "+str(aug_state))
        #debug
        ops_to_run = [pi.action_probs, pi.state_out]
        action_probs, self.lstm_state = self.sess.run(ops_to_run, feed_dict)

        # Choose an action to take
        action = np.argmax(action_probs[0][0])
        self.prev_action = action
        return action
Example #6
    def setup_tf_ops(self):
        """ Sets up the shared Tensorflow operators and structures
        Refer to DaggerLeader for more information
        """

        # Set up the shared global network and local network.
        with tf.device(self.leader_device):
            with tf.variable_scope('global_cpu'):
                self.global_network_cpu = DaggerLSTM(
                    state_dim=self.aug_state_dim, action_cnt=self.action_cnt)

        with tf.device(self.worker_device):
            with tf.variable_scope('local'):
                self.local_network = DaggerLSTM(state_dim=self.aug_state_dim,
                                                action_cnt=self.action_cnt)

        self.init_state = self.local_network.zero_init_state(1)
        self.lstm_state = self.init_state

        # Build shared queues for training data and synchronization
        self.train_q = tf.FIFOQueue(self.num_workers, [tf.float32, tf.int32],
                                    shared_name='training_feed')

        self.sync_q = tf.FIFOQueue(3, [tf.int16],
                                   shared_name=('sync_q_%d' % self.task_idx))

        # Training data is [[aug_state]], [action]
        self.state_data = tf.placeholder(tf.float32,
                                         shape=(None, self.aug_state_dim))
        self.action_data = tf.placeholder(tf.int32, shape=(None))
        self.enqueue_train_op = self.train_q.enqueue(
            [self.state_data, self.action_data])

        # Sync local network to global network (CPU)
        local_vars = self.local_network.trainable_vars
        global_vars = self.global_network_cpu.trainable_vars
        self.sync_op = tf.group(
            *[v1.assign(v2) for v1, v2 in zip(local_vars, global_vars)])
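
The sync_q in these examples carries small int16 status tokens between each worker and the leader. The Status constants themselves are not shown in the snippets; a minimal sketch of what they could look like, with assumed values:

class Status(object):
    # Assumed token values; the real module defines its own constants.
    EP_DONE = 0       # worker finished an episode and enqueued its data
    WORKER_DONE = 1   # worker is shutting down for good
    WORKER_START = 2  # leader tells the worker to start the next episode
    PS_DONE = 3       # leader (parameter server) has finished training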
Example #7
    def __init__(self, state_dim, action_cnt, restore_vars):
        self.aug_state_dim = state_dim + action_cnt
        self.action_cnt = action_cnt
        self.prev_action = action_cnt - 1

        with tf.variable_scope('global'):
            self.model = DaggerLSTM(
                state_dim=self.aug_state_dim, action_cnt=action_cnt)

        self.lstm_state = self.model.zero_init_state(1)

        self.sess = tf.Session()

        logging.basicConfig(level=logging.WARNING, filename="/home/zyk/state.log")
        self.logger = logging.getLogger("state")

        # restore saved variables
        saver = tf.train.Saver(self.model.trainable_vars)
        saver.restore(self.sess, restore_vars)

        # init the remaining vars, especially those created by optimizer
        uninit_vars = set(tf.global_variables())
        uninit_vars -= set(self.model.trainable_vars)
        self.sess.run(tf.variables_initializer(uninit_vars))
Example #8
    def __init__(self, state_dim, action_cnt, restore_vars):
        self.aug_state_dim = state_dim + action_cnt
        self.action_cnt = action_cnt
        self.prev_action = action_cnt - 1

        with tf.variable_scope('global'):
            self.model = DaggerLSTM(state_dim=self.aug_state_dim,
                                    action_cnt=action_cnt)

        self.lstm_state = self.model.zero_init_state(1)

        self.sess = tf.Session()

        # restore saved variables
        saver = tf.train.Saver(self.model.trainable_vars)
        saver.restore(self.sess, restore_vars)

        # init the remaining vars, especially those created by optimizer
        uninit_vars = set(tf.global_variables())
        uninit_vars -= set(self.model.trainable_vars)
        self.sess.run(tf.variables_initializer(uninit_vars))

        self.log = open(
            '/home/eric/Dev/DRL-IL/pantheon/third_party/indigo/logs.txt', 'w')
Example #9
class Learner(object):
    def __init__(self, state_dim, action_cnt, restore_vars):
        self.aug_state_dim = state_dim + action_cnt
        self.action_cnt = action_cnt
        self.prev_action = action_cnt - 1

        with tf.variable_scope('global'):
            self.model = DaggerLSTM(state_dim=self.aug_state_dim,
                                    action_cnt=action_cnt)

        self.lstm_state = self.model.zero_init_state(1)

        self.sess = tf.Session()

        # restore saved variables
        saver = tf.train.Saver(self.model.trainable_vars)
        saver.restore(self.sess, restore_vars)

        # init the remaining vars, especially those created by optimizer
        uninit_vars = set(tf.global_variables())
        uninit_vars -= set(self.model.trainable_vars)
        self.sess.run(tf.variables_initializer(uninit_vars))

    def sample_action(self, state):
        norm_state = normalize(state)

        one_hot_action = one_hot(self.prev_action, self.action_cnt)
        aug_state = norm_state + one_hot_action

        # Get probability of each action from the local network.
        pi = self.model
        feed_dict = {
            pi.input: [[aug_state]],
            pi.state_in: self.lstm_state,
        }
        ops_to_run = [pi.action_probs, pi.state_out]
        action_probs, self.lstm_state = self.sess.run(ops_to_run, feed_dict)

        # Choose an action to take
        action = np.argmax(action_probs[0][0])
        self.prev_action = action

        # action = np.argmax(np.random.multinomial(1, action_probs[0] - 1e-5))
        # temperature = 1.0
        # temp_probs = softmax(action_probs[0] / temperature)
        # action = np.argmax(np.random.multinomial(1, temp_probs - 1e-5))
        return action
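
The commented-out lines at the end of sample_action describe an alternative to the greedy argmax: sample the action from a temperature-scaled softmax of the probabilities. A small numpy sketch of that sampling, assuming action_probs is already a probability vector:

import numpy as np

def softmax(x):
    e = np.exp(x - np.max(x))
    return e / e.sum()

def sample_with_temperature(action_probs, temperature=1.0):
    # Mirrors the commented-out alternative above: re-sharpen the
    # probabilities with a softmax over probs / temperature, then sample.
    temp_probs = softmax(np.asarray(action_probs) / temperature)
    # Subtract a tiny epsilon so the pvals never sum to > 1 due to
    # floating-point error (np.random.multinomial is strict about this).
    return int(np.argmax(np.random.multinomial(1, temp_probs - 1e-5)))

print(sample_with_temperature([0.1, 0.2, 0.6, 0.05, 0.05], temperature=0.5))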
Example #10
    def setup_tf_ops(self):  # called in __init__()
        """ Sets up the shared Tensorflow operators and structures
        Refer to DaggerLeader for more information
        """

        # Set up the shared global network and local network.
        with tf.device(self.leader_device):
            with tf.variable_scope('global_cpu'):
                self.global_network_cpu = DaggerLSTM(
                    state_dim=self.aug_state_dim, action_cnt=self.action_cnt)

        with tf.device(self.worker_device):
            with tf.variable_scope('local'):
                """
                Modify : change DaggerLSTM to DQN
                """
                self.local_network = DaggerLSTM(state_dim=self.aug_state_dim,
                                                action_cnt=self.action_cnt)

        self.init_state = self.local_network.zero_init_state(1)
        self.lstm_state = self.init_state

        # Build shared queues for training data and synchronization
        self.train_q = tf.FIFOQueue(self.num_workers, [tf.float32, tf.int32],
                                    shared_name='training_feed')

        self.sync_q = tf.FIFOQueue(3, [tf.int16],
                                   shared_name=('sync_q_%d' % self.task_idx))

        # Training data is now one transition: (prev_state, reward, action, cur_state)
        #
        # Changes from the original DaggerWorker:
        #   - removed self.action_data
        #   - added placeholders: self.prev_state_ph (list), self.action_ph (int),
        #     self.reward_ph (float), self.cur_state_ph (list)
        #   - self.enqueue_train_op now enqueues the full transition
        self.prev_state_ph = tf.placeholder(tf.float32,
                                            shape=(None, self.aug_state_dim))
        # self.action_data = tf.placeholder(tf.int32, shape=(None))
        self.reward_ph = tf.placeholder(tf.float32, shape=())
        self.action_ph = tf.placeholder(tf.int32, shape=())
        self.cur_state_ph = tf.placeholder(tf.float32,
                                           shape=(None, self.aug_state_dim))
        self.enqueue_train_op = self.train_q.enqueue([
            self.prev_state_ph, self.reward_ph, self.action_ph,
            self.cur_state_ph
        ])

        # Sync local network to global network (CPU)
        local_vars = self.local_network.trainable_vars
        global_vars = self.global_network_cpu.trainable_vars
        self.sync_op = tf.group(
            *[v1.assign(v2) for v1, v2 in zip(local_vars, global_vars)])
Example #11
class DaggerWorker(object):
    def __init__(self, cluster, server, task_idx, env):
        # Distributed tensorflow and logging related
        self.cluster = cluster
        self.env = env
        self.task_idx = task_idx
        self.leader_device = '/job:ps/task:0'
        self.worker_device = '/job:worker/task:%d' % task_idx
        self.num_workers = cluster.num_tasks('worker')

        # Buffers and parameters required to train
        self.curr_ep = 0
        self.state_buf = []
        self.action_buf = []
        self.reward_buf = []
        self.prev_state_buf = []

        self.state_dim = env.state_dim
        self.action_cnt = env.action_cnt

        self.aug_state_dim = self.state_dim + self.action_cnt
        self.prev_action = self.action_cnt - 1

        self.expert = TrueDaggerExpert(env)
        # Must call env.set_sample_action() before env.rollout()
        env.set_sample_action(self.sample_action)

        # Modified
        self.prev_utility = 0
        self.prev_state = [0] * 10
        self.memory_size = 200
        self.memory_counter = 0
        # initialize zero memory [s, a, r, s_]
        self.batch_size = 8

        self.memory = np.zeros(
            (self.memory_size,
             self.state_dim * 2 + 5 + 1))  # 5=actions;1=reward

        # Set up Tensorflow for synchronization, training
        self.setup_tf_ops()
        self.sess = tf.Session(
            server.target, config=tf.ConfigProto(allow_soft_placement=True))
        self.sess.run(tf.global_variables_initializer())

    def cleanup(self):
        self.env.cleanup()
        self.sess.run(self.sync_q.enqueue(Status.WORKER_DONE))

    def setup_tf_ops(self):  # called in __init__()
        """ Sets up the shared Tensorflow operators and structures
        Refer to DaggerLeader for more information
        """

        # Set up the shared global network and local network.
        with tf.device(self.leader_device):
            with tf.variable_scope('global_cpu'):
                self.global_network_cpu = DaggerLSTM(
                    state_dim=self.aug_state_dim, action_cnt=self.action_cnt)

        with tf.device(self.worker_device):
            with tf.variable_scope('local'):
                """
                Modify : change DaggerLSTM to DQN
                """
                self.local_network = DaggerLSTM(state_dim=self.aug_state_dim,
                                                action_cnt=self.action_cnt)

        self.init_state = self.local_network.zero_init_state(1)
        self.lstm_state = self.init_state

        # Build shared queues for training data and synchronization
        self.train_q = tf.FIFOQueue(self.num_workers, [tf.float32, tf.int32],
                                    shared_name='training_feed')

        self.sync_q = tf.FIFOQueue(3, [tf.int16],
                                   shared_name=('sync_q_%d' % self.task_idx))

        # Training data is now one transition: (prev_state, reward, action, cur_state)
        #
        # Changes from the original DaggerWorker:
        #   - removed self.action_data
        #   - added placeholders: self.prev_state_ph (list), self.action_ph (int),
        #     self.reward_ph (float), self.cur_state_ph (list)
        #   - self.enqueue_train_op now enqueues the full transition
        self.prev_state_ph = tf.placeholder(tf.float32,
                                            shape=(None, self.aug_state_dim))
        # self.action_data = tf.placeholder(tf.int32, shape=(None))
        self.reward_ph = tf.placeholder(tf.float32, shape=())
        self.action_ph = tf.placeholder(tf.int32, shape=())
        self.cur_state_ph = tf.placeholder(tf.float32,
                                           shape=(None, self.aug_state_dim))
        self.enqueue_train_op = self.train_q.enqueue([
            self.prev_state_ph, self.reward_ph, self.action_ph,
            self.cur_state_ph
        ])

        # Sync local network to global network (CPU)
        local_vars = self.local_network.trainable_vars
        global_vars = self.global_network_cpu.trainable_vars
        self.sync_op = tf.group(
            *[v1.assign(v2) for v1, v2 in zip(local_vars, global_vars)])

    def utility(self, state):
        # TODO: calculate the utility of a state
        # state layout: delay_ewma, delivery_rate_ewma, send_rate_ewma, cwnd

        return 1

    def sample_action(self, state):
        """ Given a state buffer in the past step, returns an action
        to perform.

        Appends to the state/action buffers the state and the
        "correct" action to take according to the expert.
        """

        cwnd = state[3]
        # expert_action = self.expert.sample_action(cwnd)

        # For decision-making, normalize.

        norm_state = normalize(state)

        one_hot_action = one_hot(self.prev_action, self.action_cnt)
        aug_state = norm_state + one_hot_action

        # Fill in state_buf, action_buf
        # self.state_buf.append(aug_state)

        r = self.utility(aug_state) - self.prev_utility
        transition = np.hstack((aug_state, [self.prev_action,
                                            r], self.prev_state))

        # replace the old memory with new memory
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition  # sample action
        self.memory_counter += 1
        # refresh previous state and utility
        self.prev_utility = self.utility(aug_state)
        self.prev_state = aug_state

        # sample batch memory from all memory
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size,
                                            size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter,
                                            size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        # todo : train current network

        # self.action_buf.append(expert_action)

        # Always use the expert on the first episode to get our bearings.
        #if self.curr_ep == 0:
        #   self.prev_action = expert_action
        #   return expert_action

        # Get probability of each action from the local network.
        pi = self.local_network

        feed_dict = {
            pi.input: [[aug_state]],
            pi.state_in: self.lstm_state,
        }
        ops_to_run = [pi.action_probs, pi.state_out]
        action_probs, self.lstm_state = self.sess.run(ops_to_run, feed_dict)

        # Choose an action to take and update current LSTM state
        # action = np.argmax(np.random.multinomial(1, action_probs[0][0] - 1e-5))
        action = np.argmax(action_probs[0][0])
        self.prev_action = action

        return action

    def rollout(self):
        """ Start an episode/flow with an empty dataset/environment. """
        self.state_buf = []
        self.reward_buf = []
        self.action_buf = []
        self.prev_state_buf = []

        self.prev_action = self.action_cnt - 1
        self.lstm_state = self.init_state

        self.env.reset()
        self.env.rollout()

    def run(self, debug=False):
        """Runs for max_ep episodes, each time sending data to the leader."""

        pi = self.local_network
        while True:
            if debug:
                sys.stderr.write('[WORKER %d Ep %d] Starting...\n' %
                                 (self.task_idx, self.curr_ep))

            # Reset local parameters to global
            self.sess.run(self.sync_op)

            print 'DaggerWorker:global_network_cpu:cnt', self.sess.run(
                self.global_network_cpu.cnt)
            print 'DaggerWorker:local_network:cnt', self.sess.run(
                self.local_network.cnt)
            sys.stdout.flush()

            # Start a single episode, populating state-action buffers.
            self.rollout()

            #################################

            if debug:
                queue_size = self.sess.run(self.train_q.size())
                sys.stderr.write(
                    '[WORKER %d Ep %d]: enqueueing a sequence of data '
                    'into queue of size %d\n' %
                    (self.task_idx, self.curr_ep, queue_size))
                sys.stderr.write(
                    'state buffer %s \n'
                    'action buffer: %s\n'
                    'state buffer size: %d\n'
                    'action buffer size %d \n' %
                    (self.state_buf, self.action_buf, self.state_buf.__len__(),
                     self.action_buf.__len__()))
            # Enqueue a sequence of data into the training queue.

            # TODO: preprocess state_buf before enqueueing.
            #
            # enqueue_train_op enqueues
            # [self.prev_state_ph, self.reward_ph, self.action_ph, self.cur_state_ph];
            # feed the data from self.memory into train_q.
            # Every element in a feed_dict should be a list
            self.sess.run(self.enqueue_train_op,
                          feed_dict={
                              self.prev_state_ph:
                              self.memory[:, :self.state_dim],
                              self.reward_ph: self.memory[:, self.state_dim],
                              self.action_ph: self.memory[:,
                                                          self.state_dim + 1],
                              self.cur_state_ph: self.memory[:,
                                                             -self.state_dim:]
                          })

            self.sess.run(self.sync_q.enqueue(Status.EP_DONE))

            if debug:
                queue_size = self.sess.run(self.train_q.size())
                sys.stderr.write('[WORKER %d Ep %d]: finished queueing data. '
                                 'queue size now %d\n' %
                                 (self.task_idx, self.curr_ep, queue_size))

            if debug:
                sys.stderr.write('[WORKER %d Ep %d]: waiting for server\n' %
                                 (self.task_idx, self.curr_ep))

            # Let the leader dequeue EP_DONE
            time.sleep(0.5)

            # Wait until pserver finishes training by blocking on sync_q
            # Only proceeds when it finds a message from the pserver.
            msg = self.sess.run(self.sync_q.dequeue())
            while (msg != Status.WORKER_START and msg != Status.PS_DONE):
                self.sess.run(self.sync_q.enqueue(msg))
                time.sleep(0.5)
                msg = self.sess.run(self.sync_q.dequeue())

            if msg == Status.PS_DONE:
                break

            self.curr_ep += 1
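
The modified worker above stores transitions in a fixed-size numpy array and overwrites the oldest row once memory_counter wraps around. A simplified, self-consistent sketch of that ring-buffer replay memory; the exact row layout here is an assumption (the worker above packs aug_state, action, reward and prev_state into one row):

import numpy as np

class ReplayMemory(object):
    """Fixed-size ring buffer of transitions, as in the modified worker."""

    def __init__(self, memory_size, aug_state_dim):
        # Row layout (assumed): [aug_state | action | reward | prev_state]
        self.memory = np.zeros((memory_size, aug_state_dim * 2 + 2))
        self.memory_size = memory_size
        self.memory_counter = 0

    def store(self, aug_state, action, reward, prev_state):
        transition = np.hstack((aug_state, [action, reward], prev_state))
        index = self.memory_counter % self.memory_size  # overwrite the oldest row
        self.memory[index, :] = transition
        self.memory_counter += 1

    def sample(self, batch_size):
        # Only sample from rows that have actually been filled in.
        high = min(self.memory_counter, self.memory_size)
        idx = np.random.choice(high, size=batch_size)
        return self.memory[idx, :]

mem = ReplayMemory(memory_size=200, aug_state_dim=9)
mem.store([0.1] * 9, 2, 0.5, [0.0] * 9)
print(mem.sample(batch_size=4).shape)  # (4, 20)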
Example #12
class DaggerLeader(object):
    # worker_tasks is a set of worker indices, from 0 to num_workers - 1
    def __init__(self, cluster, server, worker_tasks):
        self.cluster = cluster
        self.server = server
        self.worker_tasks = worker_tasks
        self.num_workers = len(worker_tasks)
        self.aggregated_states = []
        self.aggregated_actions = []
        self.max_eps = 1000
        self.checkpoint_delta = 10
        self.checkpoint = self.checkpoint_delta
        self.learn_rate = 0.01
        self.regularization_lambda = 1e-4
        self.train_step = 0

        self.state_dim = Sender.state_dim
        self.action_cnt = Sender.action_cnt
        self.aug_state_dim = self.state_dim + self.action_cnt

        # Create the master network and training/sync queues
        with tf.variable_scope('global'):
            self.global_network = DaggerLSTM(state_dim=self.aug_state_dim,
                                             action_cnt=self.action_cnt)

        self.leader_device_cpu = '/job:ps/task:0/cpu:0'
        with tf.device(self.leader_device_cpu):
            with tf.variable_scope('global_cpu'):
                self.global_network_cpu = DaggerLSTM(
                    state_dim=self.aug_state_dim, action_cnt=self.action_cnt)

        cpu_vars = self.global_network_cpu.trainable_vars
        gpu_vars = self.global_network.trainable_vars
        self.sync_op = tf.group(
            *[v1.assign(v2) for v1, v2 in zip(cpu_vars, gpu_vars)])

        self.default_batch_size = 300
        self.default_init_state = self.global_network.zero_init_state(
            self.default_batch_size)

        # Each element is [[aug_state]], [action]
        self.train_q = tf.FIFOQueue(self.num_workers, [tf.float32, tf.int32],
                                    shared_name='training_feed')

        # Keys: worker indices, values: Tensorflow messaging queues
        # Queue Elements: Status message
        self.sync_queues = {}
        for idx in worker_tasks:
            queue_name = 'sync_q_%d' % idx
            self.sync_queues[idx] = tf.FIFOQueue(3, [tf.int16],
                                                 shared_name=queue_name)

        self.setup_tf_ops(server)

        self.sess = tf.Session(
            server.target, config=tf.ConfigProto(allow_soft_placement=True))
        self.sess.run(tf.global_variables_initializer())

    def cleanup(self):
        """ Sends messages to workers to stop and saves the model. """
        for idx in self.worker_tasks:
            self.sess.run(self.sync_queues[idx].enqueue(Status.PS_DONE))
        self.save_model()

    def save_model(self, checkpoint=None):
        """ Takes care of saving/checkpointing the model. """
        if checkpoint is None:
            model_path = path.join(self.logdir, 'model')
        else:
            model_path = path.join(self.logdir, 'checkpoint-%d' % checkpoint)

        # save parameters to parameter server
        saver = tf.train.Saver(self.global_network.trainable_vars)
        saver.save(self.sess, model_path)
        sys.stderr.write('\nModel saved to param. server at %s\n' % model_path)

    def setup_tf_ops(self, server):
        """ Sets up Tensorboard operators and tools, such as the optimizer,
        summary values, Tensorboard, and Session.
        """

        self.actions = tf.placeholder(tf.int32, [None, None])

        reg_loss = 0.0
        for x in self.global_network.trainable_vars:
            if x.name == 'global/cnt:0':
                continue
            reg_loss += tf.nn.l2_loss(x)
        reg_loss *= self.regularization_lambda

        cross_entropy_loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.actions,  # todo: Q-target and Q-evaluation
                logits=self.global_network.action_scores))

        self.total_loss = cross_entropy_loss + reg_loss

        optimizer = tf.train.AdamOptimizer(self.learn_rate)
        self.train_op = optimizer.minimize(self.total_loss)

        tf.summary.scalar('reduced_ce_loss', cross_entropy_loss)
        tf.summary.scalar('reg_loss', reg_loss)
        tf.summary.scalar('total_loss', self.total_loss)
        self.summary_op = tf.summary.merge_all()

        git_commit = check_output('cd %s && git rev-parse @' %
                                  project_root.DIR,
                                  shell=True)
        date_time = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        log_name = date_time + '-%s' % git_commit.strip()
        self.logdir = path.join(project_root.DIR, 'dagger', 'logs', log_name)
        make_sure_path_exists(self.logdir)
        self.summary_writer = tf.summary.FileWriter(self.logdir)

    def wait_on_workers(self):
        """ Update which workers are done or dead. Stale tokens will
        eventually be cleaned out.
        Returns the number of workers that finished their episode.
        """
        workers_ep_done = 0
        while workers_ep_done < len(self.worker_tasks):
            # Let the workers dequeue their start tokens
            time.sleep(0.5)

            # check in each queue for worker messages and update workers
            workers_done = []
            for idx in self.worker_tasks:
                worker_queue = self.sync_queues[idx]
                # Status token used for worker-leader communication
                msg = self.sess.run(worker_queue.dequeue())

                if msg == Status.EP_DONE:
                    workers_ep_done += 1
                elif msg == Status.WORKER_DONE:
                    workers_done.append(idx)
                    self.sess.run(worker_queue.close())
                else:
                    self.sess.run(worker_queue.enqueue(msg))

            for worker in workers_done:
                self.worker_tasks.remove(worker)

        return workers_ep_done

    def run_one_train_step(self, batch_states, batch_actions):
        """ Runs one step of the training operator on the given data.
        At times will update Tensorboard and save a checkpointed model.
        Returns the total loss calculated.
        """

        summary = self.train_step % 10 == 0

        ops_to_run = [self.train_op, self.total_loss]

        if summary:
            ops_to_run.append(self.summary_op)

        pi = self.global_network

        start_ts = curr_ts_ms()
        ret = self.sess.run(ops_to_run,
                            feed_dict={
                                pi.input: batch_states,
                                self.actions: batch_actions,
                                pi.state_in: self.init_state
                            })

        elapsed = (curr_ts_ms() - start_ts) / 1000.0
        sys.stderr.write('train step %d: time %.2f\n' %
                         (self.train_step, elapsed))

        if summary:
            self.summary_writer.add_summary(ret[2], self.train_step)
        print "Dagger leader: ret"
        print ret
        return ret[1]

    def train(self):
        """ Runs the training operator until the loss converges.
        """
        curr_iter = 0

        min_loss = float('inf')
        iters_since_min_loss = 0

        batch_size = min(len(self.aggregated_states), self.default_batch_size)
        num_batches = len(self.aggregated_states) / batch_size

        if batch_size != self.default_batch_size:
            self.init_state = self.global_network.zero_init_state(batch_size)
        else:
            self.init_state = self.default_init_state

        while True:
            curr_iter += 1

            mean_loss = 0.0
            max_loss = 0.0

            for batch_num in xrange(num_batches):
                self.train_step += 1

                start = batch_num * batch_size
                end = start + batch_size

                batch_states = self.aggregated_states[start:end]
                batch_actions = self.aggregated_actions[start:end]

                loss = self.run_one_train_step(batch_states, batch_actions)

                mean_loss += loss
                max_loss = max(loss, max_loss)

            mean_loss /= num_batches

            sys.stderr.write('--- iter %d: max loss %.4f, mean loss %.4f\n' %
                             (curr_iter, max_loss, mean_loss))

            if max_loss < min_loss - 0.001:
                min_loss = max_loss
                iters_since_min_loss = 0
            else:
                iters_since_min_loss += 1

            if curr_iter > 50:
                break

            if iters_since_min_loss >= max(0.2 * curr_iter, 10):
                break

        self.sess.run(self.global_network.add_one)

        # copy trained variables from GPU to CPU
        self.sess.run(self.sync_op)

        print 'DaggerLeader:global_network:cnt', self.sess.run(
            self.global_network.cnt)
        print 'DaggerLeader:global_network_cpu:cnt', self.sess.run(
            self.global_network_cpu.cnt)
        sys.stdout.flush()

    def run(self, debug=False):
        for curr_ep in xrange(self.max_eps):
            if debug:
                sys.stderr.write('[PSERVER EP %d]: waiting for workers %s\n' %
                                 (curr_ep, self.worker_tasks))

            workers_ep_done = self.wait_on_workers()

            # If workers had data, dequeue ALL the samples and train
            if workers_ep_done > 0:
                while True:
                    num_samples = self.sess.run(self.train_q.size())
                    if num_samples == 0:
                        break
                    # Collect all data from train_q, aggregate them in aggregated_states/aggregated_actions
                    data = self.sess.run(self.train_q.dequeue())
                    self.aggregated_states.append(data[0])
                    self.aggregated_actions.append(data[1])

                if debug:
                    sys.stderr.write('[PSERVER]: start training\n')

                self.train()
            else:
                if debug:
                    sys.stderr.write('[PSERVER]: quitting...\n')
                break

            # Save the network model for testing every so often
            if curr_ep == self.checkpoint:
                self.save_model(curr_ep)
                self.checkpoint += self.checkpoint_delta

            # After training, tell workers to start another episode
            for idx in self.worker_tasks:
                worker_queue = self.sync_queues[idx]
                self.sess.run(worker_queue.enqueue(Status.WORKER_START))
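
The loss that DaggerLeader minimizes is a sparse softmax cross-entropy against the expert actions plus an L2 penalty (scaled by regularization_lambda) over every trainable variable except the global/cnt counter. A small numpy sketch of that computation on toy values, just to make the arithmetic concrete (shapes and numbers here are made up):

import numpy as np

def sparse_softmax_cross_entropy(logits, labels):
    # Cross-entropy of the softmax over logits against integer labels.
    logits = logits - logits.max(axis=-1, keepdims=True)
    log_probs = logits - np.log(np.exp(logits).sum(axis=-1, keepdims=True))
    return -log_probs[np.arange(len(labels)), labels]

logits = np.array([[2.0, 0.5, -1.0], [0.1, 0.2, 3.0]])  # toy action scores
labels = np.array([0, 2])                                # expert actions
weights = [np.ones((4, 3)), np.ones(3)]                  # toy trainable variables

ce_loss = sparse_softmax_cross_entropy(logits, labels).mean()
reg_loss = 1e-4 * sum(0.5 * (w ** 2).sum() for w in weights)  # tf.nn.l2_loss = sum(w^2) / 2
total_loss = ce_loss + reg_loss
print(total_loss)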
Example #13
class DaggerWorker(object):
    def __init__(self, cluster, server, task_idx, env):
        # Distributed tensorflow and logging related
        self.cluster = cluster
        self.env = env
        self.task_idx = task_idx
        self.leader_device = '/job:ps/task:0'
        self.worker_device = '/job:worker/task:%d' % task_idx
        self.num_workers = cluster.num_tasks('worker')

        # Buffers and parameters required to train
        self.curr_ep = 0
        self.state_buf = []
        self.action_buf = []
        self.state_dim = env.state_dim
        self.action_cnt = env.action_cnt

        self.aug_state_dim = self.state_dim + self.action_cnt
        self.prev_action = self.action_cnt - 1

        self.expert = TrueDaggerExpert(env)
        # Must call env.set_sample_action() before env.rollout()
        env.set_sample_action(self.sample_action)

        # Set up Tensorflow for synchronization, training
        self.setup_tf_ops()
        self.sess = tf.Session(
            server.target, config=tf.ConfigProto(allow_soft_placement=True))
        self.sess.run(tf.global_variables_initializer())

    def cleanup(self):
        self.env.cleanup()
        self.sess.run(self.sync_q.enqueue(Status.WORKER_DONE))

    def setup_tf_ops(self):
        """ Sets up the shared Tensorflow operators and structures
        Refer to DaggerLeader for more information
        """

        # Set up the shared global network and local network.
        with tf.device(self.leader_device):
            with tf.variable_scope('global_cpu'):
                self.global_network_cpu = DaggerLSTM(
                    state_dim=self.aug_state_dim, action_cnt=self.action_cnt)

        with tf.device(self.worker_device):
            with tf.variable_scope('local'):
                self.local_network = DaggerLSTM(state_dim=self.aug_state_dim,
                                                action_cnt=self.action_cnt)

        self.init_state = self.local_network.zero_init_state(1)
        self.lstm_state = self.init_state

        # Build shared queues for training data and synchronization
        self.train_q = tf.FIFOQueue(self.num_workers, [tf.float32, tf.int32],
                                    shared_name='training_feed')

        self.sync_q = tf.FIFOQueue(3, [tf.int16],
                                   shared_name=('sync_q_%d' % self.task_idx))

        # Training data is [[aug_state]], [action]
        self.state_data = tf.placeholder(tf.float32,
                                         shape=(None, self.aug_state_dim))
        self.action_data = tf.placeholder(tf.int32, shape=(None))
        self.enqueue_train_op = self.train_q.enqueue(
            [self.state_data, self.action_data])

        # Sync local network to global network (CPU)
        local_vars = self.local_network.trainable_vars
        global_vars = self.global_network_cpu.trainable_vars
        self.sync_op = tf.group(
            *[v1.assign(v2) for v1, v2 in zip(local_vars, global_vars)])

    def sample_action(self, state):
        """ Given a state buffer in the past step, returns an action
        to perform.

        Appends to the state/action buffers the state and the
        "correct" action to take according to the expert.
        """
        cwnd = state[self.state_dim - 1]
        expert_action = self.expert.sample_action(cwnd)

        # For decision-making, normalize.
        norm_state = normalize(state)

        one_hot_action = one_hot(self.prev_action, self.action_cnt)
        aug_state = norm_state + one_hot_action

        # Fill in state_buf, action_buf
        self.state_buf.append(aug_state)
        self.action_buf.append(expert_action)

        # Always use the expert on the first episode to get our bearings.
        if self.curr_ep == 0:
            self.prev_action = expert_action
            return expert_action

        # Get probability of each action from the local network.
        pi = self.local_network
        feed_dict = {
            pi.input: [[aug_state]],
            pi.state_in: self.lstm_state,
        }
        ops_to_run = [pi.action_probs, pi.state_out]
        action_probs, self.lstm_state = self.sess.run(ops_to_run, feed_dict)

        # Choose an action to take and update current LSTM state
        # action = np.argmax(np.random.multinomial(1, action_probs[0][0] - 1e-5))
        action = np.argmax(action_probs[0][0])
        self.prev_action = action

        return action

    def rollout(self):
        """ Start an episode/flow with an empty dataset/environment. """
        self.state_buf = []
        self.action_buf = []
        self.prev_action = self.action_cnt - 1
        self.lstm_state = self.init_state

        self.env.reset()
        self.env.rollout()

    def run(self, debug=False):
        """Runs for max_ep episodes, each time sending data to the leader."""

        pi = self.local_network
        while True:
            if debug:
                sys.stderr.write('[WORKER %d Ep %d] Starting...\n' %
                                 (self.task_idx, self.curr_ep))

            # Reset local parameters to global
            self.sess.run(self.sync_op)

            print 'DaggerWorker:global_network_cpu:cnt', self.sess.run(
                self.global_network_cpu.cnt)
            print 'DaggerWorker:local_network:cnt', self.sess.run(
                self.local_network.cnt)
            sys.stdout.flush()

            # Start a single episode, populating state-action buffers.
            self.rollout()

            if debug:
                queue_size = self.sess.run(self.train_q.size())
                sys.stderr.write(
                    '[WORKER %d Ep %d]: enqueueing a sequence of data '
                    'into queue of size %d\n' %
                    (self.task_idx, self.curr_ep, queue_size))

            # Enqueue a sequence of data into the training queue.
            self.sess.run(self.enqueue_train_op,
                          feed_dict={
                              self.state_data: self.state_buf,
                              self.action_data: self.action_buf
                          })
            self.sess.run(self.sync_q.enqueue(Status.EP_DONE))

            if debug:
                queue_size = self.sess.run(self.train_q.size())
                sys.stderr.write('[WORKER %d Ep %d]: finished queueing data. '
                                 'queue size now %d\n' %
                                 (self.task_idx, self.curr_ep, queue_size))

            if debug:
                sys.stderr.write('[WORKER %d Ep %d]: waiting for server\n' %
                                 (self.task_idx, self.curr_ep))

            # Let the leader dequeue EP_DONE
            time.sleep(0.5)

            # Wait until pserver finishes training by blocking on sync_q
            # Only proceeds when it finds a message from the pserver.
            msg = self.sess.run(self.sync_q.dequeue())
            while (msg != Status.WORKER_START and msg != Status.PS_DONE):
                self.sess.run(self.sync_q.enqueue(msg))
                time.sleep(0.5)
                msg = self.sess.run(self.sync_q.dequeue())

            if msg == Status.PS_DONE:
                break

            self.curr_ep += 1
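
Across the worker and leader, the DAgger flow is: always ask the expert to label the visited state, but only follow the expert on the first episode; afterwards the learned policy acts while the expert keeps providing labels, and the leader aggregates all labeled data and retrains. A condensed, single-process sketch of that labeling step, with a toy expert standing in for TrueDaggerExpert:

import random

class ToyExpert(object):
    """Hypothetical stand-in for TrueDaggerExpert: label states with the
    action that nudges cwnd toward a fixed target."""

    def __init__(self, target_cwnd=40):
        self.target_cwnd = target_cwnd

    def sample_action(self, cwnd):
        return 0 if cwnd < self.target_cwnd else 1  # 0 = increase, 1 = decrease

def dagger_step(state_buf, action_buf, state, expert, curr_ep, policy_action):
    """One DAgger step: always record the expert's label for the visited
    state, but only follow the expert on the first episode."""
    cwnd = state[-1]
    expert_action = expert.sample_action(cwnd)
    state_buf.append(state)
    action_buf.append(expert_action)
    return expert_action if curr_ep == 0 else policy_action

state_buf, action_buf = [], []
expert = ToyExpert()
for ep in range(2):
    act = dagger_step(state_buf, action_buf, [0.1, 0.2, 0.3, 30], expert,
                      curr_ep=ep, policy_action=random.randrange(5))
    print(ep, act, len(action_buf))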