Example #1
class Agent(BaseAgent):
    '''Deep Trading Agent based on Deep Q Learning.

    TODO:
        1. add `play` function to run tests in the simulated environment
    '''

    def __init__(self, sess, logger, config, env):
        super(Agent, self).__init__(config, logger)
        self.sess = sess
        self.logger = logger
        self.config = config
        params = DeepSenseParams(config)

        self.env = env
        self.history = History(logger, config)
        self.replay_memory = ReplayMemory(logger, config)

        with tf.variable_scope(STEPS):
            self.step_op = tf.Variable(0, trainable=False, name=STEP)
            self.step_input = tf.placeholder('int32', None, name=STEP_INPUT)
            self.step_assign_op = self.step_op.assign(self.step_input)

        self.build_dqn(params)

    @property
    def summary_writer(self):
        return self._summary_writer

    def train(self):
        start_step = self.sess.run(self.step_op)

        num_episodes, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        max_avg_ep_reward = 0
        ep_rewards, actions = [], []

        trade_rem = self.env.new_random_episode(self.history, self.replay_memory)

        for self.step in tqdm(range(start_step, self.max_step), ncols=70, initial=start_step):
            if self.step == self.learn_start:
                num_episodes, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_rewards, actions = [], []

            # 1. predict
            action = self.predict((self.history.history, trade_rem))
            # 2. act
            screen, reward, terminal, trade_rem = self.env.act(action)
            # 3. observe
            self.observe(screen, reward, action, terminal, trade_rem)

            if terminal:
                trade_rem = self.env.new_random_episode(self.history, self.replay_memory)
                num_episodes += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.

            else:
                ep_reward += reward

            actions.append(action)
            total_reward += reward
            
            if self.step >= self.learn_start:
                if self.step % self.test_step == self.test_step - 1:
                    avg_reward = total_reward / self.test_step
                    avg_loss = self.total_loss / self.update_count
                    avg_q = self.total_q / self.update_count

                    if ep_rewards:
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    else:
                        max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0

                    message = 'avg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d' \
                        % (avg_reward, avg_loss, avg_q, avg_ep_reward, max_ep_reward, min_ep_reward, num_episodes)
                    self.logger.info(message)

                    if max_avg_ep_reward * 0.9 <= avg_ep_reward:
                        self.sess.run(
                            fetches=self.step_assign_op,
                            feed_dict={self.step_input: self.step + 1}
                        )
                        self.save_model(self.step + 1)

                        max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward)

                    if self.step > 180:
                        self.inject_summary({
                            'average.reward': avg_reward,
                            'average.loss': avg_loss,
                            'average.q': avg_q,
                            'episode.max reward': max_ep_reward,
                            'episode.min reward': min_ep_reward,
                            'episode.avg reward': avg_ep_reward,
                            'episode.num of episodes': num_episodes,
                            'episode.rewards': ep_rewards,
                            'episode.actions': actions,
                            'training.learning_rate': self.sess.run(
                                fetches=self.learning_rate_op,
                                feed_dict={self.learning_rate_step: self.step}
                            )
                        }, self.step)

                    num_episodes = 0
                    total_reward = 0.
                    self.total_loss = 0.
                    self.total_q = 0.
                    self.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []
    
    def predict(self, state, test_ep=None):
        s_t = state[0]
        trade_rem_t = state[1]
        # test_ep may legitimately be 0.0, so compare against None explicitly;
        # epsilon anneals linearly from ep_start to ep_end over ep_end_t steps after learn_start
        ep = test_ep if test_ep is not None else (self.ep_end +
            max(0., (self.ep_start - self.ep_end) \
            * (self.ep_end_t - max(0., self.step - self.learn_start)) / self.ep_end_t))

        if random.random() < ep:
            action = random.randrange(self.config[NUM_ACTIONS])
        else:
            action = self.sess.run(
                fetches=self.q.action,
                feed_dict={
                    self.q.phase: 0,  
                    self.s_t: [s_t], 
                    self.trade_rem_t: [trade_rem_t],
                    self.q_conv_keep_prob: 1.0,
                    self.q_dense_keep_prob: 1.0,
                    self.q_gru_keep_prob: 1.0
                }
            )[0]

        return action

    def observe(self, screen, reward, action, terminal, trade_rem):
        #clip reward in the range min to max
        reward = max(self.min_reward, min(self.max_reward, reward))
        
        self.history.add(screen)
        self.replay_memory.add(screen, reward, action, terminal, trade_rem)

        if self.step > self.learn_start:
            if self.step % self.train_frequency == 0:
                self.q_learning_mini_batch()

            if self.step % self.target_q_update_step == self.target_q_update_step - 1:
                self.update_target_network()

    def q_learning_mini_batch(self):
        if self.replay_memory.count >= self.replay_memory.history_length:
            state_t, action, reward, state_t_plus_1, terminal = self.replay_memory.sample
            s_t, trade_rem_t = state_t[0], state_t[1]
            s_t_plus_1, trade_rem_t_plus_1 = state_t_plus_1[0], state_t_plus_1[1]
            
            q_t_plus_1 = self.sess.run(
                fetches=self.t_q.values,
                feed_dict={
                    self.t_q.phase: 0, 
                    self.t_s_t: s_t_plus_1, 
                    self.t_trade_rem_t: trade_rem_t_plus_1
                }
            )

            max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)

            terminal = np.array(terminal) + 0.
            target_q = reward + (1 - terminal) * max_q_t_plus_1

            _, q_t, loss, avg_q_summary = self.sess.run([self.optimizer, self.q.values, self.loss, self.q.avg_q_summary], {
                self.q.phase: 1,
                self.target_q: target_q,
                self.action: action,
                self.s_t: s_t,
                self.trade_rem_t: trade_rem_t,
                self.q_conv_keep_prob: self.config[CONV_KEEP_PROB],
                self.q_dense_keep_prob: self.config[DENSE_KEEP_PROB],
                self.q_gru_keep_prob: self.config[GRU_KEEP_PROB],
                self.learning_rate_step: self.step
            })

            self.summary_writer.add_summary(avg_q_summary, self.step)
            self.total_loss += loss
            self.total_q += q_t.mean()
            self.update_count += 1

    def build_dqn(self, params):
        with tf.variable_scope(PREDICTION):
            self.s_t = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self.replay_memory.history_length, 
                            self.replay_memory.num_channels],
                name=HISTORICAL_PRICES
            )
            self.trade_rem_t = tf.placeholder(
                dtype=tf.float32,
                shape=[None,],
                name=TRADE_REM
            )
            
            with tf.variable_scope(DROPOUT_KEEP_PROBS):
                self.q_conv_keep_prob = tf.placeholder(tf.float32)
                self.q_dense_keep_prob = tf.placeholder(tf.float32)
                self.q_gru_keep_prob = tf.placeholder(tf.float32)

        params.dropoutkeepprobs = DropoutKeepProbs(
                    self.q_conv_keep_prob,
                    self.q_dense_keep_prob,
                    self.q_gru_keep_prob
                )
        self.q = DeepSense(params, self.logger, self.sess, self.config, name=Q_NETWORK)
        self.q.build_model((self.s_t, self.trade_rem_t))

        with tf.variable_scope(TARGET):
            self.t_s_t = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self.replay_memory.history_length, 
                            self.replay_memory.num_channels],
                name=HISTORICAL_PRICES
            )
            self.t_trade_rem_t = tf.placeholder(
                dtype=tf.float32,
                shape=[None,],
                name=TRADE_REM
            )

        params.dropoutkeepprobs = DropoutKeepProbs()
        self.t_q = DeepSense(params, self.logger, self.sess, self.config, name=T_Q_NETWORK)
        self.t_q.build_model((self.t_s_t, self.t_trade_rem_t))

        with tf.variable_scope(UPDATE_TARGET_NETWORK):
            self.q_weights_placeholders = {}
            self.t_weights_assign_ops = {}

            for name in self.q.weights.keys():
                self.q_weights_placeholders[name] = tf.placeholder(
                            tf.float32,
                            self.q.weights[name].get_shape().as_list()
                        )
            for name in self.q.weights.keys():
                self.t_weights_assign_ops[name] = self.t_q.weights[name].assign(
                    self.q_weights_placeholders[name]
                )

        with tf.variable_scope(TRAINING):
            self.target_q = tf.placeholder(tf.float32, [None], name=TARGET_Q)
            self.action = tf.placeholder(tf.int64, [None], name=ACTION)
            
            action_one_hot = tf.one_hot(self.action, self.config[NUM_ACTIONS], 
                                            1.0, 0.0, name=ACTION_ONE_HOT)
            q_acted = tf.reduce_sum(self.q.values * action_one_hot, 
                                        reduction_indices=1, name=Q_ACTED)
                                        
            with tf.variable_scope(LOSS):
                self.delta = self.target_q - q_acted

                self.global_step = tf.Variable(0, trainable=False)

                self.loss = tf.reduce_mean(clipped_error(self.delta), name=LOSS)

            with tf.variable_scope(OPTIMIZER):
                self.learning_rate_step = tf.placeholder(tf.int64, None, name=LEARNING_RATE_STEP)
                self.learning_rate_op = tf.maximum(self.learning_rate_minimum,
                    tf.train.exponential_decay(
                        self.learning_rate,
                        self.learning_rate_step,
                        self.learning_rate_decay_step,
                        self.learning_rate_decay,
                        staircase=True))

                self.optimizer = tf.train.RMSPropOptimizer(
                    self.learning_rate_op, momentum=0.95, epsilon=0.01).minimize(self.loss)

        with tf.variable_scope(SUMMARY):
            scalar_summary_tags = ['average.reward', 'average.loss', 'average.q', \
                'episode.max reward', 'episode.min reward', 'episode.avg reward', \
                'episode.num of episodes', 'training.learning_rate']            

            self.summary_placeholders = {}
            self.summary_ops = {}

            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = \
                    tf.placeholder('float32', None, name=tag.replace(' ', '_'))
                self.summary_ops[tag] = \
                    tf.summary.scalar(
                        name="{}-{}".format(self.env_name, tag.replace(' ', '_')),
                        tensor=self.summary_placeholders[tag]
                    )

            histogram_summary_tags = ['episode.rewards', 'episode.actions']
            for tag in histogram_summary_tags:
                self.summary_placeholders[tag] = \
                    tf.placeholder('float32', None, name=tag)
                self.summary_ops[tag] = \
                    tf.summary.histogram(
                        tag,
                        self.summary_placeholders[tag]
                    )

        self.sess.run(tf.local_variables_initializer())
        self.sess.run(tf.global_variables_initializer())
        self._saver = tf.train.Saver(list(self.q.weights.values()) + [self.step_op], max_to_keep=30)
        
        self.load_model()
        self.update_target_network()

        self._summary_writer = tf.summary.FileWriter(self.config[TENSORBOARD_LOG_DIR])
        self._summary_writer.add_graph(self.sess.graph)

    def update_target_network(self):
        for name in self.q.weights.keys():
            self.sess.run(
                fetches=self.t_weights_assign_ops[name],
                feed_dict=
                {self.q_weights_placeholders[name]: self.sess.run(
                    fetches=self.q.weights[name]
                )}
            )
    
    def inject_summary(self, tag_dict, step):
        summary_str_lists = self.sess.run([self.summary_ops[tag] for tag in tag_dict.keys()], {
            self.summary_placeholders[tag]: value for tag, value in tag_dict.items()
        })
        for summary_str in summary_str_lists:
            self.summary_writer.add_summary(summary_str, step)
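
# ---------------------------------------------------------------------------
# Illustration added alongside the example (not part of the original agent):
# a minimal numpy sketch of the one-step Q-learning target built in
# q_learning_mini_batch above, with made-up batch values so the shapes and the
# terminal masking are easy to follow.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import numpy as np

    # target-network Q-values for a batch of 3 next-states, 3 actions each (made up)
    q_t_plus_1 = np.array([[0.1, 0.5, 0.2],
                           [0.0, -0.3, 0.4],
                           [0.2, 0.2, 0.2]])
    reward = np.array([1.0, -1.0, 0.5])
    terminal = np.array([0., 1., 0.])  # 1. marks transitions that end an episode

    # max over actions, then skip bootstrapping for terminal transitions
    # (a discount factor would usually scale max_q_t_plus_1; it is omitted here
    # to mirror the expression used in q_learning_mini_batch)
    max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)
    target_q = reward + (1 - terminal) * max_q_t_plus_1
    print(target_q)  # [ 1.5 -1.   0.7]
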
Example #2
class UserFile:
    DATA_FILE = os.path.expanduser("~/.cache/.wfclidata")
    root_node_id = "0"

    # SETUP METHODS
    def __init__(self):
        self.nds = NodeStore()
        self.cursor_y = 0
        self._load_data()
        self._update = True
        self.history = History(seed=self.nds)

    # Decorator for functions that need to force an update to our tree
    def update_visible_after(func):
        def do_update(self, *args, **kwargs):
            result = func(self, *args, **kwargs)
            self._update = True
            return result
        return do_update

    def update_visible_now(self):
        self._update = True

    def set_cursor_to_node(self, node_id):
        self.update_visible_now()
        for i, v in enumerate(self.visible):
            if v[0].uuid == node_id:
                self.cursor_y = i

    def current_node(self, depth=False):
        node_pair = self.visible[self.cursor_y]
        if depth:
            return node_pair
        else:
            return node_pair[0]

    # FILE METHODS
    def data_from_file_object(self, fo):
        data = json.load(fo)
        for node_def in data:
            node = Node(node_def=node_def)
            self.nds.add_node(node)

    @classmethod
    def _write_data_file(cls, data_obj):
        os.makedirs(os.path.dirname(cls.DATA_FILE), exist_ok=True)
        with open(cls.DATA_FILE, "x+") as f:
            json.dump(data_obj, f, indent=2)

    @classmethod
    def _create_empty_data_file(cls):
        empty_data = [
            {"id": cls.root_node_id, "pa": None, "ch": ["1"]},
            {"pa": cls.root_node_id, "id": "1", "nm": "Write down your thoughts"},
        ]
        cls._write_data_file(empty_data)

    def _load_data(self):
        if not os.path.exists(self.DATA_FILE):
            self._create_empty_data_file()

        with open(self.DATA_FILE) as f:
            self.data_from_file_object(f)

    def save(self):
        with open(self.DATA_FILE, "w") as f:
            json.dump(self.nds.flat_format, f, indent=2)

    def commit(self):
        self.history.add(self.nds)

    # TREE TRAVERSAL
    @property
    def visible(self):
        if self._update:
            self._update = False
            return self.load_visible()
        else:
            return self._visible

    def load_visible(self):
        """
        Returns a list of (node, depth) tuples for the currently visible nodes.
        """
        self._visible = []
        for node in self.nds.get_node(self.root_node_id).children:
            if node is not None:
                self._traverse_node(node, 0)
        return self._visible

    def _traverse_node(self, node, depth):
        current_node = self.nds.get_node(node)
        self._visible.append((current_node, depth))
        if not current_node.closed:
            for child in current_node.children:
                self._traverse_node(child, depth + 1)

    # NAVIGATION METHODS
    def nav_up(self):
        if self.cursor_y > 0:
            self.cursor_y -= 1

    def nav_down(self):
        if self.cursor_y < len(self.visible) - 1:
            self.cursor_y += 1

    def bottom(self):
        self.cursor_y = len(self.visible) - 1

    def top(self):
        self.cursor_y = 0

    # LINKING METHODS
    @update_visible_after
    def link_parent_child(self, parent, child, position=-1):
        self.nds.get_node(child).parent = parent
        if position >= 0:
            self.nds.get_node(parent).children.insert(position, child)
        else:
            self.nds.get_node(parent).children.append(child)

    @update_visible_after
    def unlink_relink(self, old_parent, child, new_parent, position):
        def unlink_parent_child(self, parent, child):
            assert child in self.nds
            assert parent in self.nds
            assert self.nds.get_node(child).parent == parent
            assert child in self.nds.get_node(parent).children
            self.nds.get_node(parent).children.remove(child)
            self.nds.get_node(child).parent = None
        unlink_parent_child(self, old_parent, child)
        self.link_parent_child(new_parent, child, position)

    # MANIPULATE NODES
    @update_visible_after
    def indent(self):
        current_node = self.current_node()
        parent_node = current_node.parent
        parents_child_list = self.nds.get_node(parent_node).children
        current_node_index = parents_child_list.index(current_node.uuid)
        if current_node_index == 0:
            raise ModelException("Indent of top child")
        else:
            new_parent = parents_child_list[current_node_index - 1]
            self.unlink_relink(parent_node, current_node.uuid, new_parent, -1)
            self.nds.get_node(new_parent).closed = False
            log.info("Nailed it")

    @update_visible_after
    def unindent(self):
        current_node = self.current_node()
        parent_id = current_node.parent
        if parent_id == self.root_node_id:
            raise ModelException("top level, can't unindent")
        else:
            super_parent_node = self.nds.get_node(self.nds.get_node(parent_id).parent)
            pos_in_parent_list = super_parent_node.children.index(parent_id)
            self.unlink_relink(
                parent_id,
                current_node.uuid,
                super_parent_node.uuid,
                pos_in_parent_list + 1,
            )
            log.info("nailed it")

    @update_visible_after
    def open_above(self):
        current_node = self.current_node()
        parent_node = self.nds.get_node(current_node.parent)
        new_node = self.create_node(parent_node.uuid)
        pos_in_parent_list = parent_node.children.index(current_node.uuid)
        self.link_parent_child(
            parent_node.uuid,
            new_node.uuid,
            pos_in_parent_list,
        )

    @update_visible_after
    def open_below(self):
        current_node = self.current_node()
        if current_node.state == "open":
            new_node = self.create_node(current_node.uuid)
            self.link_parent_child(
                current_node.uuid,
                new_node.uuid,
                0,
            )
        else:  # new node is sibling of current node
            parent_node = self.nds.get_node(current_node.parent)
            new_node = self.create_node(parent_node.uuid)
            pos_in_parent_list = parent_node.children.index(current_node.uuid)
            self.link_parent_child(
                parent_node.uuid,
                new_node.uuid,
                pos_in_parent_list + 1,
            )

    @update_visible_after
    def delete_item(self, node_id=None):
        current_node = self.current_node() if node_id is None else self.nds.get_node(node_id)
        for child_id in current_node.children[:]:
            self.delete_item(node_id=child_id)
        parent_id = current_node.parent
        self.nds.get_node(parent_id).children.remove(current_node.uuid)
        del self.nds[current_node.uuid]
        if node_id is None:  # this is our top-level delete
            self.cursor_y = max(0, self.cursor_y - 1)
            if len(self.nds.get_node(self.root_node_id).children) == 0:
                new_node = self.create_node(
                    self.root_node_id,
                    nm="Ooops, you deleted the last item on the list",
                )
                self.link_parent_child(
                    self.root_node_id,
                    new_node.uuid,
                    0,
                )

    @update_visible_after
    def move_down(self):
        current_node = self.current_node()
        parent_id = current_node.parent
        parents_child_list = self.nds.get_node(parent_id).children
        current_node_index = parents_child_list.index(current_node.uuid)
        if current_node_index < len(parents_child_list) - 1:
            # swap with the one behind
            parents_child_list[current_node_index] = parents_child_list[current_node_index + 1]
            parents_child_list[current_node_index + 1] = current_node.uuid
            self.set_cursor_to_node(current_node.uuid)

    @update_visible_after
    def move_up(self):
        current_node = self.current_node()
        parent_id = current_node.parent
        parents_child_list = self.nds.get_node(parent_id).children
        current_node_index = parents_child_list.index(current_node.uuid)
        if current_node_index > 0:
            # swap with the previous sibling
            parents_child_list[current_node_index] = parents_child_list[current_node_index - 1]
            parents_child_list[current_node_index - 1] = current_node.uuid
            self.set_cursor_to_node(current_node.uuid)

    @update_visible_after
    def complete(self):
        current_node = self.current_node()
        current_node.complete = not current_node.complete

    @update_visible_after
    def create_node(self, parent, **kwargs):
        node = Node(pa=parent, **kwargs)
        self.nds.add_node(node)
        return node

    @update_visible_after
    def collapse_node(self):
        self.visible[self.cursor_y][0].closed = True

    @update_visible_after
    def expand_node(self):
        self.visible[self.cursor_y][0].closed = False

    @update_visible_after
    def undo(self):
        ret = self.history.undo()
        if ret is not None:
            self.nds = ret

    @update_visible_after
    def redo(self):
        ret = self.history.redo()
        if ret is not None:
            self.nds = ret

    # EDIT TEXT
    @update_visible_after
    def add_char(self, char, cursor_x):
        current_node = self.current_node()
        name = current_node.name[0:cursor_x] + char + current_node.name[cursor_x:]
        current_node.name = name

    @update_visible_after
    def delete_char(self, num, cursor_x):
        current_node = self.current_node()
        if cursor_x > 0:
            name = current_node.name[0:cursor_x - num] + current_node.name[cursor_x:]
            current_node.name = name
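
# ---------------------------------------------------------------------------
# Illustration added alongside the example (not from the project): the
# dirty-flag caching pattern behind UserFile.update_visible_after and the
# `visible` property, reduced to a self-contained toy class. All names below
# are invented for the sketch.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    class TinyTree:
        def __init__(self):
            self.items = ["a", "b"]
            self._dirty = True
            self._cache = []

        # same idea as update_visible_after: any mutation marks the cached
        # view stale so it is rebuilt on the next access
        def mark_dirty_after(func):
            def wrapper(self, *args, **kwargs):
                result = func(self, *args, **kwargs)
                self._dirty = True
                return result
            return wrapper

        @property
        def visible(self):
            if self._dirty:
                self._cache = list(self.items)  # rebuild the cached view
                self._dirty = False
            return self._cache

        @mark_dirty_after
        def add(self, item):
            self.items.append(item)

    tree = TinyTree()
    print(tree.visible)   # ['a', 'b']
    tree.add("c")
    print(tree.visible)   # ['a', 'b', 'c'] -- rebuilt because add() marked the cache dirty
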
Example #3
class Agent(BaseAgent):
    '''Deep Trading Agent based on Deep Q Learning.

    TODO:
        1. play
    '''
    def __init__(self, sess, logger, config, env):
        super(Agent, self).__init__(config, logger)
        self.sess = sess
        self.logger = logger
        self.config = config
        params = DeepSenseParams(config)

        self.env = env
        self.history = History(logger, config)
        self.replay_memory = ReplayMemory(logger, config)

        with tf.variable_scope(STEPS):
            self.step_op = tf.Variable(0, trainable=False, name=STEP)
            self.step_input = tf.placeholder('int32', None, name=STEP_INPUT)
            self.step_assign_op = self.step_op.assign(self.step_input)

        self.build_dqn(params)

    @property
    def summary_writer(self):
        return self._summary_writer

    def train(self):
        start_step = self.step_op.eval()

        num_episodes, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        max_avg_ep_reward = 0
        ep_rewards, actions = [], []

        self.env.new_random_episode(self.history)

        for self.step in tqdm(range(start_step, self.max_step),
                              ncols=70,
                              initial=start_step):
            if self.step == self.learn_start:
                num_episodes, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_rewards, actions = [], []

            # 1. predict
            action = self.predict(self.history.get())
            # 2. act
            screen, reward, terminal = self.env.act(action)
            # 3. observe
            self.observe(screen, reward, action, terminal)

            if terminal:
                self.env.new_random_episode(self.history)
                num_episodes += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.

            else:
                ep_reward += reward

            actions.append(action)
            total_reward += reward

            if self.step >= self.learn_start:
                if self.step % self.test_step == self.test_step - 1:
                    avg_reward = total_reward / self.test_step
                    avg_loss = self.total_loss / self.update_count
                    avg_q = self.total_q / self.update_count

                    if ep_rewards:
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    else:
                        max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0

                    message = 'avg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d' \
                        % (avg_reward, avg_loss, avg_q, avg_ep_reward, max_ep_reward, min_ep_reward, num_episodes)
                    print_and_log_message(message, self.logger)

                    if max_avg_ep_reward * 0.9 <= avg_ep_reward:
                        self.step_assign_op.eval(
                            {self.step_input: self.step + 1})
                        self.save_model(self.step + 1)

                        max_avg_ep_reward = max(max_avg_ep_reward,
                                                avg_ep_reward)

                    if self.step > 180:
                        self.inject_summary({
                            'average.reward': avg_reward,
                            'average.loss': avg_loss,
                            'average.q': avg_q,
                            'episode.max reward': max_ep_reward,
                            'episode.min reward': min_ep_reward,
                            'episode.avg reward': avg_ep_reward,
                            'episode.num of game': num_episodes,
                            'episode.rewards': ep_rewards,
                            'episode.actions': actions,
                            'training.learning_rate': self.learning_rate_op.eval(
                                {self.learning_rate_step: self.step}),
                        }, self.step)

                    num_episodes = 0
                    total_reward = 0.
                    self.total_loss = 0.
                    self.total_q = 0.
                    self.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []

    def predict(self, s_t, test_ep=None):
        ep = test_ep if test_ep is not None else (self.ep_end +
            max(0., (self.ep_start - self.ep_end) \
            * (self.ep_end_t - max(0., self.step - self.learn_start)) / self.ep_end_t))

        if random.random() < ep:
            action = random.randrange(self.env.action_size)
        else:
            action = self.q.action.eval({self.s_t: [s_t]})[0]

        return action

    def observe(self, screen, reward, action, terminal):
        #clip reward in the range min to max
        reward = max(self.min_reward, min(self.max_reward, reward))

        self.history.add(screen)
        self.replay_memory.add(screen, reward, action, terminal)

        if self.step > self.learn_start:
            if self.step % self.train_frequency == 0:
                self.q_learning_mini_batch()

            if self.step % self.target_q_update_step == self.target_q_update_step - 1:
                self.update_target_network()

    def q_learning_mini_batch(self):
        if self.replay_memory.count >= self.replay_memory.history_length:
            s_t, action, reward, s_t_plus_1, terminal = self.replay_memory.sample()

            # evaluate the target network's Q-values and take the max over actions
            q_t_plus_1 = self.t_q.values.eval({self.t_s_t: s_t_plus_1})
            max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)
            terminal = np.array(terminal) + 0.
            target_q = reward + (1 - terminal) * max_q_t_plus_1

            _, q_t, loss, avg_q_summary = self.sess.run(
                [
                    self.optimizer, self.q.values, self.loss,
                    self.q.avg_q_summary
                ], {
                    self.target_q: target_q,
                    self.action: action,
                    self.s_t: s_t,
                    self.learning_rate_step: self.step,
                })

            self.summary_writer.add_summary(avg_q_summary, self.step)
            self.total_loss += loss
            self.total_q += q_t.mean()
            self.update_count += 1

    def build_dqn(self, params):
        with tf.variable_scope(PREDICTION):
            self.s_t = tf.placeholder(dtype=tf.float32,
                                      shape=[
                                          None,
                                          self.replay_memory.history_length,
                                          self.replay_memory.num_channels
                                      ])
        self.q = DeepSense(params,
                           self.logger,
                           self.sess,
                           self.config,
                           name=Q_NETWORK)
        self.q.build_model(self.s_t)

        with tf.variable_scope(TARGET):
            self.t_s_t = tf.placeholder(dtype=tf.float32,
                                        shape=[
                                            None,
                                            self.replay_memory.history_length,
                                            self.replay_memory.num_channels
                                        ])
        self.t_q = DeepSense(params,
                             self.logger,
                             self.sess,
                             self.config,
                             name=T_Q_NETWORK)
        self.t_q.build_model(self.t_s_t, train=False)

        with tf.variable_scope(UPDATE_TARGET_NETWORK):
            self.q_weights_placeholders = {}
            self.t_weights_assign_ops = {}

            for name in self.q.weights.keys():
                self.q_weights_placeholders[name] = tf.placeholder(
                    tf.float32, self.q.weights[name].get_shape().as_list())
            for name in self.q.weights.keys():
                self.t_weights_assign_ops[name] = self.t_q.weights[
                    name].assign(self.q_weights_placeholders[name])

        with tf.variable_scope(TRAINING):
            self.target_q = tf.placeholder(tf.float32, [None], name=TARGET_Q)
            self.action = tf.placeholder(tf.int64, [None], name=ACTION)

            action_one_hot = tf.one_hot(self.action,
                                        self.env.action_size,
                                        1.0,
                                        0.0,
                                        name=ACTION_ONE_HOT)
            q_acted = tf.reduce_sum(self.q.values * action_one_hot,
                                    reduction_indices=1,
                                    name=Q_ACTED)

            with tf.variable_scope(LOSS):
                self.delta = self.target_q - q_acted

                self.global_step = tf.Variable(0, trainable=False)

                self.loss = tf.reduce_mean(clipped_error(self.delta),
                                           name=LOSS)

            with tf.variable_scope(OPTIMIZER):
                self.learning_rate_step = tf.placeholder(
                    tf.int64, None, name=LEARNING_RATE_STEP)
                self.learning_rate_op = tf.maximum(
                    self.learning_rate_minimum,
                    tf.train.exponential_decay(self.learning_rate,
                                               self.learning_rate_step,
                                               self.learning_rate_decay_step,
                                               self.learning_rate_decay,
                                               staircase=True))

                self.optimizer = tf.train.RMSPropOptimizer(
                    self.learning_rate_op, momentum=0.95,
                    epsilon=0.01).minimize(self.loss)

        with tf.variable_scope(SUMMARY):
            scalar_summary_tags = ['average.reward', 'average.loss', 'average.q', \
                'episode.max reward', 'episode.min reward', 'episode.avg reward', \
                'episode.num of game', 'training.learning_rate']

            self.summary_placeholders = {}
            self.summary_ops = {}

            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = \
                    tf.placeholder('float32', None, name=tag.replace(' ', '_'))
                self.summary_ops[tag] = \
                    tf.summary.scalar(
                        name="{}-{}".format(self.env_name, tag),
                        tensor=self.summary_placeholders[tag]
                    )

            histogram_summary_tags = ['episode.rewards', 'episode.actions']
            for tag in histogram_summary_tags:
                self.summary_placeholders[tag] = \
                    tf.placeholder('float32', None, name=tag.replace(' ', '_'))
                self.summary_ops[tag] = \
                    tf.summary.histogram(
                        tag,
                        self.summary_placeholders[tag]
                    )

        self._summary_writer = tf.summary.FileWriter(
            self.config[TENSORBOARD_LOG_DIR])
        self._summary_writer.add_graph(self.sess.graph)

        tf.global_variables_initializer().run()
        self._saver = tf.train.Saver(list(self.q.weights.values()) + [self.step_op],
                                     max_to_keep=30)

        self.load_model()
        self.update_target_network()

    def update_target_network(self):
        for name in self.q.weights.keys():
            self.t_weights_assign_ops[name].eval({
                self.q_weights_placeholders[name]:
                self.q.weights[name].eval()
            })

    def inject_summary(self, tag_dict, step):
        summary_str_lists = self.sess.run(
            [self.summary_ops[tag] for tag in tag_dict.keys()], {
                self.summary_placeholders[tag]: value
                for tag, value in tag_dict.items()
            })
        for summary_str in summary_str_lists:
            self.summary_writer.add_summary(summary_str, step)
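
# ---------------------------------------------------------------------------
# Illustration added alongside the example (not part of the original agent):
# the linearly annealed exploration rate that predict() computes, written as a
# standalone function. The parameter values are invented; in the agent they
# come from the config (ep_start, ep_end, ep_end_t, learn_start).
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    def epsilon(step, ep_start=1.0, ep_end=0.1, ep_end_t=100000, learn_start=1000):
        # decays linearly from ep_start to ep_end over ep_end_t steps once the
        # agent has passed learn_start, then stays at ep_end
        progress = max(0., step - learn_start)
        return ep_end + max(0., (ep_start - ep_end) * (ep_end_t - progress) / ep_end_t)

    for s in (0, 1000, 51000, 101000, 500000):
        print(s, round(epsilon(s), 3))  # 1.0, 1.0, 0.55, 0.1, 0.1
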
Example #4
class Agent(BaseAgent):
    '''Deep Trading Agent based on Deep Q Learning.

    TODO:
        1. add summary ops
        2. timing and logging
        3. model saving
        4. increment self.step
    '''
    def __init__(self, sess, logger, config, env):
        super(Agent, self).__init__(config)
        self.sess = sess
        self.logger = logger
        self.config = config
        params = DeepSenseParams(config)

        self.env = env
        self.history = History(logger, config)
        self.replay_memory = ReplayMemory(logger, config)

        with tf.variable_scope(STEPS):
            self.step_op = tf.Variable(0, trainable=False, name=STEP)
            self.step_input = tf.placeholder('int32', None, name=STEP_INPUT)
            self.step_assign_op = self.step_op.assign(self.step_input)

        self.build_dqn(params)

    def train(self):
        start_step = self.step_op.eval()

        num_episodes, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        max_avg_ep_reward = 0
        ep_rewards, actions = [], []

        self.env.new_random_episode(self.history)

        for self.step in range(start_step, self.max_step):
            if self.step == self.learn_start:
                num_episodes, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_rewards, actions = [], []

            # 1. predict
            action = self.predict(self.history.get())
            # 2. act
            screen, reward, terminal = self.env.act(action)
            # 3. observe
            self.observe(screen, reward, action, terminal)

            if terminal:
                self.env.new_random_episode(self.history)
                num_episodes += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.

            else:
                ep_reward += reward

            actions.append(action)
            total_reward += reward

    def predict(self, s_t, test_ep=None):
        ep = test_ep if test_ep is not None else (self.ep_end +
            max(0., (self.ep_start - self.ep_end) \
            * (self.ep_end_t - max(0., self.step - self.learn_start)) / self.ep_end_t))

        if random.random() < ep:
            action = random.randrange(self.env.action_size)
        else:
            action = self.q.action.eval({self.s_t: [s_t]})[0]

        return action

    def observe(self, screen, reward, action, terminal):
        #clip reward in the range min to max
        reward = max(self.min_reward, min(self.max_reward, reward))

        self.history.add(screen)
        self.replay_memory.add(screen, reward, action, terminal)

        if self.step > self.learn_start:
            if self.step % self.train_frequency == 0:
                self.q_learning_mini_batch()

            if self.step % self.target_q_update_step == self.target_q_update_step - 1:
                self.update_target_network()

    def q_learning_mini_batch(self):
        if self.replay_memory.count >= self.replay_memory.history_length:
            s_t, action, reward, s_t_plus_1, terminal = self.replay_memory.sample()

            # evaluate the target network's Q-values and take the max over actions
            q_t_plus_1 = self.t_q.values.eval({self.t_s_t: s_t_plus_1})
            max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)
            terminal = np.array(terminal) + 0.
            target_q = reward + (1 - terminal) * max_q_t_plus_1

            _, q_t, loss = self.sess.run(
                [self.optimizer, self.q.values, self.loss], {
                    self.target_q: target_q,
                    self.action: action,
                    self.s_t: s_t,
                    self.learning_rate_step: self.step,
                })

            self.total_loss += loss
            self.total_q += q_t.mean()
            self.update_count += 1

    def build_dqn(self, params):
        with tf.variable_scope(PREDICTION):
            self.s_t = tf.placeholder(dtype=tf.float32,
                                      shape=[
                                          None,
                                          self.replay_memory.history_length,
                                          self.replay_memory.num_channels
                                      ])
        self.q = DeepSense(params,
                           self.logger,
                           self.sess,
                           self.config,
                           name=Q_NETWORK)
        self.q.build_model(self.s_t)

        with tf.variable_scope(TARGET):
            self.t_s_t = tf.placeholder(dtype=tf.float32,
                                        shape=[
                                            None,
                                            self.replay_memory.history_length,
                                            self.replay_memory.num_channels
                                        ])
        self.t_q = DeepSense(params,
                             self.logger,
                             self.sess,
                             self.config,
                             name=T_Q_NETWORK)
        self.t_q.build_model(self.t_s_t, train=False)

        with tf.variable_scope(UPDATE_TARGET_NETWORK):
            self.q_weights_placeholders = {}
            self.t_weights_assign_ops = {}

            for name in self.q.weights.keys():
                self.q_weights_placeholders[name] = tf.placeholder(
                    tf.float32, self.q.weights[name].get_shape().as_list())
            for name in self.q.weights.keys():
                self.t_weights_assign_ops[name] = self.t_q.weights[
                    name].assign(self.q_weights_placeholders[name])

        with tf.variable_scope(TRAINING):
            self.target_q = tf.placeholder(tf.float32, [None], name=TARGET_Q)
            self.action = tf.placeholder(tf.int64, [None], name=ACTION)

            action_one_hot = tf.one_hot(self.action,
                                        self.env.action_size,
                                        1.0,
                                        0.0,
                                        name=ACTION_ONE_HOT)
            q_acted = tf.reduce_sum(self.q.values * action_one_hot,
                                    reduction_indices=1,
                                    name=Q_ACTED)

            with tf.variable_scope(LOSS):
                self.delta = self.target_q - q_acted

                self.global_step = tf.Variable(0, trainable=False)

                self.loss = tf.reduce_mean(clipped_error(self.delta),
                                           name=LOSS)
            with tf.variable_scope(OPTIMIZER):
                self.learning_rate_step = tf.placeholder(
                    tf.int64, None, name=LEARNING_RATE_STEP)
                self.learning_rate_op = tf.maximum(
                    self.learning_rate_minimum,
                    tf.train.exponential_decay(self.learning_rate,
                                               self.learning_rate_step,
                                               self.learning_rate_decay_step,
                                               self.learning_rate_decay,
                                               staircase=True))

                self.optimizer = tf.train.RMSPropOptimizer(
                    self.learning_rate_op, momentum=0.95,
                    epsilon=0.01).minimize(self.loss)

        # tf.initialize_all_variables().run()
        #initialize the q network and the target network with the same weights
        # self.update_target_network()

    def update_target_network(self):
        for name in self.q.weights.keys():
            self.t_weights_assign_ops[name].eval({
                self.q_weights_placeholders[name]:
                self.q.weights[name].eval()
            })
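
# ---------------------------------------------------------------------------
# Illustration added alongside the example (not part of the original agent):
# the hard target-network sync that update_target_network performs, reduced to
# plain numpy dicts so the data flow is visible without TensorFlow. The weight
# names are invented for the sketch.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import numpy as np

    # online ("q") weights and stale target ("t_q") weights
    q_weights = {'dense/w': np.ones((2, 2)), 'dense/b': np.array([0.5, 0.5])}
    t_weights = {'dense/w': np.zeros((2, 2)), 'dense/b': np.zeros(2)}

    # hard update: copy every online weight into the target network, the same
    # per-name loop the agent runs every target_q_update_step steps
    for name in q_weights:
        t_weights[name] = q_weights[name].copy()

    assert all(np.array_equal(q_weights[n], t_weights[n]) for n in q_weights)
    print("target network synced")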