Example #1
# Imports required to run this example; the project-specific modules
# (Network, Simulator, Property) are assumed to be importable from the
# surrounding package.
import random
import time
from collections import deque

import numpy as np
import tensorflow as tf


class Actor:
    def __init__(
            self,
            args,
            queues,
            number,
            sess,
            param_copy_interval=20,
            send_size=10,  # Number of transitions to send to the shared queue at a time
            no_op_steps=30,  # Maximum number of "do nothing" actions to be performed by the agent at the start of an episode
            epsilon=0.9,
            alpha=7,
            anealing=False,
            no_anealing_steps=100,
            anealing_steps=10000,
            initial_epsilon=1.0,
            final_epsilon=0.1):

        self.queue = queues[0]
        self.param_queue = queues[1]
        self.data_queue = queues[2]
        self.buffer_queue = queues[3]
        self.path = args.path
        self.num_episodes = args.num_episodes
        self.num_actors = args.num_actors
        self.frame_width = args.frame_width
        self.frame_height = args.frame_height
        self.state_length = args.state_length
        self.n_step = args.n_step
        self.gamma = args.gamma
        self.gamma_n = self.gamma**self.n_step
        self.param_copy_interval = param_copy_interval
        self.send_size = send_size
        self.no_op_steps = no_op_steps
        self.epsilon = epsilon
        self.alpha = alpha
        self.anealing = anealing
        self.no_anealing_steps = no_anealing_steps
        self.anealing_steps = anealing_steps
        self.prop = Property()
        self.env = Simulator(args, self.prop)
        self.num = number
        self.num_actions = args.n_actions
        self.a_buffer = [0] * self.num_actions
        self.t = 0
        self.repeated_action = 0
        self.total_reward = 0
        self.total_q_max = 0

        if not self.anealing:
            self.epsilon = self.epsilon**(
                1 + (self.num / (self.num_actors - 1)) *
                self.alpha) if self.num_actors != 1 else self.epsilon
        else:
            self.epsilon = initial_epsilon
            self.epsilon_step = (initial_epsilon -
                                 final_epsilon) / anealing_steps

        self.model = Network(args)
        self.local_memory = deque(maxlen=self.send_size * 2)
        self.env_memory = np.zeros((0, self.env.num_i))
        self.buffer_memory = np.zeros((0, 8))
        self.buffer = []
        self.R = 0

        #with tf.device("/cpu:0"):
        self.s, self.q_values, q_network = self.model.build_network()

        self.q_network_weights = self.bubble_sort_parameters(
            q_network.trainable_weights)
        #with tf.device("/cpu:0"):
        self.st, self.target_q_values, target_network = self.model.build_network(
        )
        self.target_network_weights = self.bubble_sort_parameters(
            target_network.trainable_weights)

        self.a, self.y, self.q, self.error = self.td_error_op()

        learner_params = self.param_queue.get()
        shapes = self.get_params_shape(learner_params)

        self.ph_list = [
            tf.placeholder(tf.float32, shape=shapes[i])
            for i in range(len(shapes))
        ]
        self.target_ph_list = [
            tf.placeholder(tf.float32, shape=shapes[i])
            for i in range(len(shapes))
        ]
        self.obtain_q_parameters = [
            self.q_network_weights[i].assign(self.ph_list[i])
            for i in range(len(self.q_network_weights))
        ]
        self.obtain_target_parameters = [
            self.target_network_weights[i].assign(self.target_ph_list[i])
            for i in range(len(self.target_network_weights))
        ]

        self.sess = sess

        self.sess.run(tf.global_variables_initializer())

        self.sess.run(
            [self.obtain_q_parameters, self.obtain_target_parameters],
            feed_dict=self.create_feed_dict(learner_params))

    def create_feed_dict(self, learner_params):
        feed_dict = {}
        for i in range(len(learner_params[0])):
            feed_dict[self.ph_list[i]] = learner_params[0][i]
            feed_dict[self.target_ph_list[i]] = learner_params[1][i]
        return feed_dict

    def get_params_shape(self, learner_params):
        shapes = []
        for p in learner_params[0]:
            shapes.append(p.shape)
        return shapes

    def bubble_sort_parameters(self, arr):
        change = True
        while change:
            change = False
            for i in range(len(arr) - 1):
                if arr[i].name > arr[i + 1].name:
                    arr[i], arr[i + 1] = arr[i + 1], arr[i]
                    change = True
        return arr
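    # Note: this sort only orders the trainable variables by name, presumably
    # so that the weight lists of the different networks line up index by
    # index; an equivalent one-liner (illustrative, not in the original code)
    # would be sorted(arr, key=lambda v: v.name).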

    def td_error_op(self):
        a = tf.placeholder(tf.int64, [None])
        y = tf.placeholder(tf.float32, [None])
        q = tf.placeholder(tf.float32, [None, None])
        #w = tf.placeholder(tf.float32, [None])

        # Convert action to one hot vector. shape=(BATCH_SIZE, num_actions)
        a_one_hot = tf.one_hot(a, self.num_actions, 1.0, 0.0)
        # shape = (BATCH_SIZE,)
        q_value = tf.reduce_sum(tf.multiply(q, a_one_hot), axis=1)

        # Absolute TD error, used as the priority of each transition
        error = tf.abs(y - q_value)

        return a, y, q, error

    def get_initial_state(self, observation):
        processed_observation = self.env.preprocess(observation)
        state = [processed_observation for _ in range(self.state_length)]
        return np.stack(state, axis=0)

    def get_action_and_q(self, state):
        q = self.q_values.eval(feed_dict={self.s: [np.float32(state)]},
                               session=self.sess)
        if self.epsilon >= random.random():
            action = random.randrange(self.num_actions)
        else:
            action = np.argmax(q[0])
        self.repeated_action = action

        return action, q[0], np.max(q)

    def get_action_at_test(self, state):
        action = self.repeated_action

        if random.random() <= 0.05:
            action = random.randrange(self.num_actions)
        else:
            action = np.argmax(
                self.q_values.eval(feed_dict={self.s: [np.float32(state)]}))
        self.repeated_action = action

        return action

    def get_sample(self, n):
        s, a, _, _, q = self.buffer[0]
        _, _, _, s_, q_ = self.buffer[n - 1]
        return s, a, self.R, s_, q, q_

    def calculate_R(self, reward):
        self.R = round((self.R + reward * self.gamma_n) / self.gamma, 3)

    def calculate_n_step_transition(self):
        if len(self.buffer) >= self.n_step:
            s, a, r, s_, q, q_ = self.get_sample(self.n_step)
            self.local_memory.append((s, a, r, s_, q, q_))
            self.R = self.R - self.buffer[0][2]
            self.buffer.pop(0)

    def add_experience_and_priority_to_remote_memory(self):
        # Add experience and priority to remote memory
        if len(self.local_memory) > self.send_size:
            state_batch = []
            action_batch = []
            reward_batch = []
            next_state_batch = []
            #terminal_batch = []
            q_batch = []
            qn_batch = []

            for _ in range(self.send_size):
                data = self.local_memory.popleft()
                state_batch.append(data[0])
                action_batch.append(data[1])
                reward_batch.append(data[2])
                #shape = (BATCH_SIZE, 4, 32, 32)
                next_state_batch.append(data[3])
                #terminal_batch.append(data[4])
                q_batch.append(data[4])
                qn_batch.append(data[5])

            # shape = (BATCH_SIZE, num_actions)
            target_q_values_batch = self.target_q_values.eval(
                feed_dict={self.st: np.float32(np.array(next_state_batch))},
                session=self.sess)
            # DDQN
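            # Double DQN target: the action is chosen from the online-network
            # Q-values stored in the local buffer (qn_batch) and its value is
            # read from the target network.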
            actions = np.argmax(qn_batch, axis=1)
            target_q_values_batch = np.array([
                target_q_values_batch[i][action]
                for i, action in enumerate(actions)
            ])
            # shape = (BATCH_SIZE,)
            y_batch = reward_batch + self.gamma_n * target_q_values_batch

            error_batch = self.error.eval(feed_dict={
                self.s:
                np.float32(np.array(state_batch)),
                self.a:
                action_batch,
                self.q:
                q_batch,
                self.y:
                y_batch
            },
                                          session=self.sess)

            send = [(state_batch[i], action_batch[i], reward_batch[i],
                     next_state_batch[i]) for i in range(self.send_size)]

            self.queue.put((send, error_batch))

    def copy_weight(self):
        if self.t % self.param_copy_interval == 0:
            while self.param_queue.empty():
                print('Actor {} is waiting for learner params'.format(
                    self.num))
                time.sleep(4)
            learner_params = self.param_queue.get()
            self.sess.run(
                [self.obtain_q_parameters, self.obtain_target_parameters],
                feed_dict=self.create_feed_dict(learner_params))

        if self.anealing and self.anealing_steps + self.no_anealing_steps > self.t >= self.no_anealing_steps:
            self.epsilon -= self.epsilon_step

    def start_loop(self, num_loop):
        self.env.setup_DAQmx()
        # initiate
        state = self.env.get_initial_state()
        # start read analog
        self.env.start_reading()

        # first loop
        for n in range(num_loop):
            observation = self.env.get_observation()
            processed_observation = self.env.preprocess(observation)
            next_state = np.append(state[1:, :, :],
                                   processed_observation,
                                   axis=0)
            # adopt output timing and action zero
            if n not in (10, 20, 40, 50, 60, 80, 90):
                self.env.write_daqmx_zero()
            cpte = np.average(observation[:, self.env.loc_100])
            m = [0, cpte, 0]
            m.extend(self.a_buffer)
            self.buffer_memory = np.append(self.buffer_memory, [m], axis=0)
            state = next_state
        return state

    def end_loop(self, num_loop, state):
        # third loop
        for _ in range(num_loop):
            observation = self.env.get_observation()
            processed_observation = self.env.preprocess(observation)
            next_state = np.append(state[1:, :, :],
                                   processed_observation,
                                   axis=0)
            # action
            self.env.write_daqmx_zero()
            cpte = np.average(observation[:, self.env.loc_100])
            self.env_memory = np.append(self.env_memory, observation, axis=0)
            m = [0, cpte, 0]
            m.extend(self.a_buffer)
            self.buffer_memory = np.append(self.buffer_memory, [m], axis=0)
            state = next_state
        # stop DAQmx
        self.env.stop_DAQmx()
        return state

    def run(self):
        for episode in range(self.num_episodes):
            # initialize
            self.R = 0
            self.total_reward = 0
            self.total_q_max = 0
            # simulation start
            state = self.start_loop(int(self.env.n_loop / 2))
            # measure time
            start = time.time()
            # interact with environment
            for n in range(self.env.n_loop):
                # action
                action, q, q_max = self.get_action_and_q(state)
                self.env.write_daqmx(action)

                observation = self.env.get_observation()
                processed_observation = self.env.preprocess(observation)
                next_state = np.append(state[1:, :, :],
                                       processed_observation,
                                       axis=0)
                cpte = np.average(observation[:, self.env.loc_100])
                reward = self.env.get_reward(cpte)
                self.buffer.append((state, action, reward, next_state, q))
                state = next_state
                # n-step transition
                self.calculate_n_step_transition()

                self.env_memory = np.append(self.env_memory,
                                            observation,
                                            axis=0)
                m = [0, cpte, 0]
                m.extend(q)
                self.buffer_memory = np.append(self.buffer_memory, [m], axis=0)
                self.total_reward += reward
                self.total_q_max += q_max
                self.t += 1
            # simulation end
            state = self.end_loop(int(self.env.n_loop / 2), state)
            # Add_experience_and_priority_to_remote_memory
            self.add_experience_and_priority_to_remote_memory()
            # copy weight
            self.copy_weight()
            # measure time
            elapsed = time.time() - start
            # write text
            text = 'EPISODE: {0:6d} / ACTOR: {1:3d} / EPSILON: {2:.5f} / TOTAL_REWARD: {3:3.0f} / MAX_Q_AVG: {4:2.4f} '.format(
                episode + 1, self.num, self.epsilon, self.total_reward,
                (self.total_q_max / float(self.env.n_loop)))

            print(text)

            with open(self.path + '/output.txt', 'a') as f:
                f.write(text + "\n")
            # send data to copy
            self.send_env_data()

        print("Actor", self.num, "is Over.")
        time.sleep(0.5)

    def send_env_data(self):
        self.data_queue.put(self.env_memory)
        self.buffer_queue.put(self.buffer_memory)
        self.buffer_memory = np.zeros((0, 8))
        self.env_memory = np.zeros((0, self.env.num_i))
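The Actor expects four shared queues (transitions, learner parameters, environment data, buffer data) plus a TensorFlow session. The following is a minimal launch sketch, assuming an args namespace with the fields read in __init__ and a hypothetical module named actor that contains the class above; none of these names are part of the original code.

import multiprocessing as mp

import tensorflow as tf


def run_actor(args, queues, actor_id):
    from actor import Actor  # hypothetical module holding the Actor class
    with tf.Session() as sess:  # each actor process owns its own session
        Actor(args, queues, actor_id, sess).run()


def launch_actors(args):
    # Queue order: transition queue, parameter queue, env-data queue, buffer queue.
    # The parameter queue must be bounded, because the Learner fills it until full().
    queues = [mp.Queue(), mp.Queue(maxsize=args.num_actors), mp.Queue(), mp.Queue()]
    workers = [
        mp.Process(target=run_actor, args=(args, queues, i))
        for i in range(args.num_actors)
    ]
    for w in workers:
        w.start()
    return queues, workers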
Example #2
# Imports required to run this example; the project-specific modules
# (Network, Simulator, Property) are assumed to be importable from the
# surrounding package.
import time

import numpy as np
import tensorflow as tf


class Agent:
    def __init__(self, args, queues, sess):

        self.queue = queues[0]
        self.param_queue = queues[1]
        self.data_queue = queues[2]
        self.buffer_queue = queues[3]
        self.load = args.load
        self.save_network_path = args.path + '/saved_networks/'
        self.path = args.path
        self.num_episodes = args.test_num
        self.frame_width = args.frame_width
        self.frame_height = args.frame_height
        self.state_length = args.state_length
        self.prop = Property()
        self.env = Simulator(args, self.prop)
        self.t = 0
        self.total_reward = 0
        self.total_q_max = 0
        self.model = Network(args)
        self.env_memory = np.zeros((0, self.env.num_i))
        self.buffer_memory = np.zeros((0, 8))
        self.buffer = []
        self.num_actions = args.n_actions
        self.a_buffer = [0] * self.num_actions

        with tf.variable_scope("learner_parameters", reuse=True):
            with tf.device("/cpu:0"):
                self.s, self.q_values, q_network = self.model.build_network()

        self.q_network_weights = self.bubble_sort_parameters(
            q_network.trainable_weights)

        self.sess = sess

        self.sess.run(tf.global_variables_initializer())

        with tf.device("/cpu:0"):
            self.saver = tf.train.Saver(self.q_network_weights)

        # Load network
        if self.load:
            self.load_network()

    def load_network(self):
        checkpoint = tf.train.get_checkpoint_state(self.save_network_path)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print('Successfully loaded: ' + checkpoint.model_checkpoint_path)
        else:
            print('Training new network...')

    def create_feed_dict(self, learner_params):
        feed_dict = {}
        for i in range(len(learner_params[0])):
            feed_dict[self.ph_list[i]] = learner_params[0][i]
            feed_dict[self.target_ph_list[i]] = learner_params[1][i]
        return feed_dict

    def get_params_shape(self, learner_params):
        shapes = []
        for p in learner_params[0]:
            shapes.append(p.shape)
        return shapes

    def bubble_sort_parameters(self, arr):
        change = True
        while change:
            change = False
            for i in range(len(arr) - 1):
                if arr[i].name > arr[i + 1].name:
                    arr[i], arr[i + 1] = arr[i + 1], arr[i]
                    change = True
        return arr

    def get_initial_state(self, observation):
        processed_observation = self.env.preprocess(observation)
        state = [processed_observation for _ in range(self.state_length)]
        return np.stack(state, axis=0)

    def get_action_at_test(self, state):
        q = self.q_values.eval(feed_dict={self.s: [np.float32(state)]},
                               session=self.sess)
        action = np.argmax(q[0])

        return action, q[0], np.max(q)

    def start_loop(self, num_loop):
        self.env.setup_DAQmx()
        # initiate
        state = self.env.get_initial_state()
        # start read analog
        self.env.start_reading()

        # first loop
        for n in range(num_loop):
            observation = self.env.get_observation()
            processed_observation = self.env.preprocess(observation)
            next_state = np.append(state[1:, :, :],
                                   processed_observation,
                                   axis=0)
            # adopt output timing and action zero
            if n not in (10, 20, 40, 50, 60, 80, 90):
                self.env.write_daqmx_zero()
            self.env_memory = np.append(self.env_memory, observation, axis=0)
            cpte = np.average(observation[:, self.env.loc_100])
            m = [0, cpte, 0]
            m.extend(self.a_buffer)
            self.buffer_memory = np.append(self.buffer_memory, [m], axis=0)
            state = next_state
        return state

    def end_loop(self, num_loop, state):
        # third loop
        for _ in range(num_loop):
            observation = self.env.get_observation()
            processed_observation = self.env.preprocess(observation)
            next_state = np.append(state[1:, :, :],
                                   processed_observation,
                                   axis=0)
            # action
            self.env.write_daqmx_zero()
            self.env_memory = np.append(self.env_memory, observation, axis=0)
            cpte = np.average(observation[:, self.env.loc_100])
            m = [0, cpte, 0]
            m.extend(self.a_buffer)
            self.buffer_memory = np.append(self.buffer_memory, [m], axis=0)
            state = next_state
        # stop DAQmx
        self.env.stop_DAQmx()
        return state

    def run(self):
        for episode in range(self.num_episodes):
            # initialize
            self.total_reward = 0
            self.total_q_max = 0
            # simulation start
            state = self.start_loop(int(self.env.n_loop / 2))
            # measure time
            start = time.time()
            # interact with environment
            for n in range(self.env.n_loop):
                # action
                action, q, q_max = self.get_action_at_test(state)
                self.env.write_daqmx(action)

                observation = self.env.get_observation()
                processed_observation = self.env.preprocess(observation)
                next_state = np.append(state[1:, :, :],
                                       processed_observation,
                                       axis=0)
                cpte = np.average(observation[:, self.env.loc_100])
                reward = self.env.get_reward(cpte)
                self.buffer.append((state, action, reward, next_state, q))
                state = next_state

                self.env_memory = np.append(self.env_memory,
                                            observation,
                                            axis=0)
                m = [0, cpte, 0]
                m.extend(self.a_buffer)
                self.buffer_memory = np.append(self.buffer_memory, [m], axis=0)

                self.total_reward += reward
                self.total_q_max += q_max
                self.t += 1
            # simulation end
            state = self.end_loop(int(self.env.n_loop / 2), state)
            # measure time
            elapsed = time.time() - start
            # write text
            text = 'EPISODE: {0:6d} / TOTAL_REWARD: {1:3.0f}'.format(
                episode + 1, self.total_reward)
            print(text)

            with open(self.path + '/test.txt', 'a') as f:
                f.write(text + "\n")

            self.send_env_data()

    def send_env_data(self):
        self.data_queue.put(self.env_memory)
        self.buffer_queue.put(self.buffer_memory)
        self.buffer_memory = np.zeros((0, 8))
        self.env_memory = np.zeros((0, self.env.num_i))
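The Agent is the greedy, test-time counterpart of the Actor: it takes the same queue layout but only writes to the env-data and buffer queues. A minimal evaluation sketch under the same assumptions as above (the module name agent is hypothetical):

import multiprocessing as mp

import tensorflow as tf


def evaluate(args):
    from agent import Agent  # hypothetical module holding the Agent class
    queues = [mp.Queue() for _ in range(4)]
    with tf.Session() as sess:
        Agent(args, queues, sess).run()  # runs args.test_num greedy episodes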
Example #3
# Imports required to run this example; the project-specific modules
# (Network, Memory) are assumed to be importable from the surrounding package.
import time

import numpy as np
import tensorflow as tf


class Learner:
    def __init__(self,
                 args,
                 queues,
                 sess,
                 target_update_interval=2500,
                 batch_size=512,
                 lr=0.00025 / 4,
                 save_interval=100,
                 print_interval=100,
                 max_queue_no_added=1000):

        self.path = args.path
        self.load = args.load
        self.save_path = args.path + '/saved_networks/'
        self.replay_memory_size = args.replay_memory_size
        self.initial_memory_size = args.initial_memory_size
        self.frame_width = args.frame_width
        self.frame_height = args.frame_height
        self.state_length = args.state_length
        self.gamma_n = args.gamma**args.n_step
        self.queue = queues[0]
        self.param_queue = queues[1]
        self.target_update_interval = target_update_interval
        self.batch_size = batch_size
        self.lr = lr
        self.save_interval = save_interval
        self.print_interval = print_interval
        self.max_queue_no_added = max_queue_no_added
        self.no_added_count = 0
        self.remote_memory = Memory(self.replay_memory_size)
        self.model = Network(args)
        self.num_actions = args.n_actions
        self.t = 0
        self.total_time = 0
        self.queue_not_changed_count = 0
        # Parameters used for summary
        self.total_reward = 0
        self.total_q_max = 0
        self.total_loss = 0
        self.duration = 0
        self.episode = 0
        self.start = 0

        #with tf.device('/gpu:0'):
        with tf.variable_scope("learner_parameters", reuse=True):
            self.s, self.q_values, q_network = self.model.build_network()

        self.q_network_weights = self.bubble_sort_parameters(
            q_network.trainable_weights)

        # Create target network
        with tf.variable_scope("learner_target_parameters", reuse=True):
            self.st, self.target_q_values, target_network = self.model.build_network(
            )
        self.target_network_weights = self.bubble_sort_parameters(
            target_network.trainable_weights)

        # Define target network update operation
        self.update_target_network = [
            self.target_network_weights[i].assign(self.q_network_weights[i])
            for i in range(len(self.target_network_weights))
        ]

        # Define loss and gradient update operation
        self.a, self.y, self.error_abs, self.loss, self.grad_update, self.gv, self.cl = self.build_training_op(
            self.q_network_weights)

        self.sess = sess
        self.sess.run(tf.global_variables_initializer())

        #with tf.device("/cpu:0"):
        self.saver = tf.train.Saver(self.q_network_weights)

        # Load network
        if self.load:
            self.load_network()

        params = self.sess.run(
            (self.q_network_weights, self.target_network_weights))
        while not self.param_queue.full():
            self.param_queue.put(params)

        # Initialize target network
        self.sess.run(self.update_target_network)

    def bubble_sort_parameters(self, arr):
        change = True
        while change:
            change = False
            for i in range(len(arr) - 1):
                if arr[i].name > arr[i + 1].name:
                    arr[i], arr[i + 1] = arr[i + 1], arr[i]
                    change = True
        return arr

    def huber_loss(self, x, delta=1.0):
        return tf.where(
            tf.abs(x) < delta,
            tf.square(x) * 0.5, delta * (tf.abs(x) - 0.5 * delta))
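    # Huber loss: 0.5 * x**2 where |x| < delta, and delta * (|x| - 0.5 * delta)
    # elsewhere, i.e. quadratic near zero and linear in the tails.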

    def build_training_op(self, q_network_weights):
        a = tf.placeholder(tf.int64, [None])
        y = tf.placeholder(tf.float32, [None])
        #w = tf.placeholder(tf.float32, [None])

        # Convert action to one hot vector. shape=(BATCH_SIZE, num_actions)
        a_one_hot = tf.one_hot(a, self.num_actions, 1.0, 0.0)
        # shape = (BATCH_SIZE,)
        q_value = tf.reduce_sum(tf.multiply(self.q_values, a_one_hot), axis=1)

        # Clip the error, the loss is quadratic when the error is in (-1, 1), and linear outside of that region
        td_error = tf.stop_gradient(y) - q_value
        errors = self.huber_loss(td_error)
        loss = tf.reduce_mean(errors)

        optimizer = tf.train.RMSPropOptimizer(self.lr,
                                              decay=0.95,
                                              epsilon=1.5e-7,
                                              centered=True)
        grads_and_vars = optimizer.compute_gradients(
            loss, var_list=q_network_weights)
        capped_gvs = [
            (grad if grad is None else tf.clip_by_norm(grad, clip_norm=40),
             var) for grad, var in grads_and_vars
        ]
        grad_update = optimizer.apply_gradients(capped_gvs)

        return a, y, tf.abs(
            td_error), loss, grad_update, grads_and_vars, capped_gvs

    def load_network(self):
        checkpoint = tf.train.get_checkpoint_state(self.save_path)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print('Successfully loaded: ' + checkpoint.model_checkpoint_path)
        else:
            print('Training new network...')

    def run(self):  #, server):

        if self.remote_memory.length() < self.initial_memory_size:
            print("Learner is Waiting... Replay memory have {} transitions".
                  format(self.remote_memory.length()))

            while not self.queue.empty():
                t_error = self.queue.get()
                for i in range(len(t_error[0])):
                    self.remote_memory.add(t_error[0][i], t_error[1][i])

            if not self.param_queue.full():
                params = self.sess.run(
                    (self.q_network_weights, self.target_network_weights))
                while not self.param_queue.full():
                    self.param_queue.put(params)

            time.sleep(4)
            return self.run()

        print("Learner Starts!")
        while self.no_added_count < self.max_queue_no_added:
            start = time.time()

            state_batch = []
            action_batch = []
            reward_batch = []
            next_state_batch = []
            w_batch = []

            if self.queue.empty():
                self.no_added_count += 1
            else:
                self.no_added_count = 0
                while not self.queue.empty():
                    t_error = self.queue.get()
                    for i in range(len(t_error[0])):
                        self.remote_memory.add(t_error[0][i], t_error[1][i])

            if not self.param_queue.full():
                params = self.sess.run(
                    (self.q_network_weights, self.target_network_weights))
                while not self.param_queue.full():
                    self.param_queue.put(params)

            minibatch, idx_batch = self.remote_memory.sample(self.batch_size)

            for data in minibatch:
                state_batch.append(data[0])
                action_batch.append(data[1])
                reward_batch.append(data[2])
                #shape = (BATCH_SIZE, 4, 32, 32)
                next_state_batch.append(data[3])

                self.total_q_max += np.max(
                    self.q_values.eval(
                        feed_dict={self.s: [np.float32(data[0])]},
                        session=self.sess))

            # shape = (BATCH_SIZE, num_actions)
            target_q_values_batch = self.target_q_values.eval(
                feed_dict={self.st: np.float32(np.array(next_state_batch))},
                session=self.sess)
            # DDQN
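            # Double DQN: the online network selects the greedy action on the
            # next states; its value is then read from the target network.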
            actions = np.argmax(self.q_values.eval(
                feed_dict={self.s: np.float32(np.array(next_state_batch))},
                session=self.sess),
                                axis=1)
            target_q_values_batch = np.array([
                target_q_values_batch[i][action]
                for i, action in enumerate(actions)
            ])
            # shape = (BATCH_SIZE,)
            y_batch = reward_batch + self.gamma_n * target_q_values_batch

            error_batch = self.error_abs.eval(feed_dict={
                self.s:
                np.float32(np.array(state_batch)),
                self.a:
                action_batch,
                self.y:
                y_batch
            },
                                              session=self.sess)

            loss, _ = self.sess.run(
                [self.loss, self.grad_update],
                feed_dict={
                    self.s: np.float32(np.array(state_batch)),
                    self.a: action_batch,
                    self.y: y_batch
                    #self.w: w_batch
                })

            self.total_loss += loss
            self.total_time += time.time() - start

            # Memory update
            [
                self.remote_memory.update(idx_batch[i], error_batch[i])
                for i in range(len(idx_batch))
            ]

            self.t += 1

            if self.t % self.print_interval == 0:
                text_l = 'AVERAGE LOSS: {0:.5F} / AVG_MAX_Q: {1:2.4F} / LEARN PER SECOND: {2:.1F} / NUM LEARN: {3:5d}'.format(
                    self.total_loss / self.print_interval,
                    self.total_q_max / (self.print_interval * self.batch_size),
                    self.print_interval / self.total_time, self.t)
                print(text_l)
                with open(self.path + '/_output.txt', 'a') as f:
                    f.write(text_l + "\n")
                #print("Average Loss: ", self.total_loss/PRINT_LOSS_INTERVAL, " / Learn Per Second: ", PRINT_LOSS_INTERVAL/self.total_time, " / AVG_MAX_Q", self.total_q_max/(PRINT_LOSS_INTERVAL*BATCH_SIZE))
                self.total_loss = 0
                self.total_time = 0
                self.total_q_max = 0

            # Update target network
            if self.t % self.target_update_interval == 0:
                self.sess.run(self.update_target_network)

            # Save network
            if self.t % self.save_interval == 0:
                with tf.device('/cpu:0'):
                    save_path = self.saver.save(self.sess,
                                                self.save_path,
                                                global_step=(self.t))
                print('Successfully saved: ' + save_path)

        print("The Learning is Over.")
        time.sleep(0.5)
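Putting the three examples together, one possible process layout is sketched below; the module name learner and the run_actor helper from the sketch after Example #1 are assumptions, not part of the original code.

import multiprocessing as mp

import tensorflow as tf


def run_learner(args, queues):
    from learner import Learner  # hypothetical module holding the Learner class
    with tf.Session() as sess:
        Learner(args, queues, sess).run()


def main(args):
    # Queue order: transition queue, parameter queue (bounded), env-data queue, buffer queue.
    queues = [mp.Queue(), mp.Queue(maxsize=args.num_actors), mp.Queue(), mp.Queue()]

    learner_proc = mp.Process(target=run_learner, args=(args, queues))
    actor_procs = [
        mp.Process(target=run_actor, args=(args, queues, i))
        for i in range(args.num_actors)
    ]

    learner_proc.start()
    for p in actor_procs:
        p.start()
    for p in actor_procs:
        p.join()
    learner_proc.join()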