Example #1
    def __init__(self):
        self.agent = Framework()

        # TF1-style session: allocate GPU memory incrementally instead of all at once
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())
        # lock the graph so no new ops can be added after initialization
        self.sess.graph.finalize()
Example #2
class Agent(object):
    def __init__(self):
        self.agent = Framework()

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())
        self.sess.graph.finalize()

    def get_deterministic_policy(self, inputs):
        return self.agent.get_deterministic_policy(self.sess, inputs)

    def get_stochastic_policy(self, inputs, epsilon=0.9):
        return self.agent.get_stochastic_policy(self.sess, inputs, epsilon)

    def update_cache(self, state, action, reward, next_state, done):
        self.agent.update_cache(state, action, reward, next_state, done)

    def update_eval(self):
        self.agent.update_value_net(self.sess)

    def update_target(self):
        self.agent.update_target_net(self.sess)

    def save_model(self, path="model/ddqn.ckpt"):
        self.saver.save(self.sess, path)

    def restore_model(self, path="model/ddqn.ckpt"):
        self.saver.restore(self.sess, path)

    def close(self):
        self.sess.close()
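A minimal usage sketch for the Agent class above, assuming an Account-style environment with reset() and step(action); the update cadence (a value-net update each step, a periodic target-net sync) follows the usual double-DQN pattern and is not taken from the original example.

env = Account()      # assumed environment, as in the other examples on this page
agent = Agent()

state = env.reset()
for step in range(10000):
    action = agent.get_stochastic_policy(state)
    reward, next_state, done = env.step(action)   # step() return order is an assumption
    agent.update_cache(state, action, reward, next_state, done)
    state = env.reset() if done else next_state

    agent.update_eval()            # train the online value network
    if step % 500 == 0:
        agent.update_target()      # sync the target network

agent.save_model()
agent.close()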
Example #3
    def __init__(self, train=True):
        if train:
            self.agent = Framework(0.5)
        else:
            self.agent = Framework(1.0)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        # checkpoint only the trainable variables under the "value" scope
        trainable_variables = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, "value")
        # print(trainable_variables)
        self.saver = tf.train.Saver(trainable_variables)
        self.sess.run(tf.global_variables_initializer())
        self.sess.graph.finalize()
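A hedged sketch of how the train flag and the value-scope Saver above might be used at inference time; the enclosing class name Agent is an assumption, and the checkpoint path reuses Example #2's default.

agent = Agent(train=False)                           # builds Framework(1.0)
agent.saver.restore(agent.sess, "model/ddqn.ckpt")   # restores only the "value"-scope variables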
Example #4
class ExplorerFramework(object):
    def __init__(self, access, name, observation, action_size):
        self.Access = access
        self.AC = Framework(self.Access, observation, action_size, name)
        self.env = Account()
        self.name = name

    def get_bootstrap(self, done, sess, next_state):
        if done:
            terminal = 0
        else:
            terminal = self.AC.get_value(
                sess, np.expand_dims(next_state, axis=0))[0][0]
        return terminal

    def get_output(self, sess, inputs, actions, targets):
        return self.AC.get_losses(sess, inputs, actions, targets)

    def run(self, sess, max_episodes, t_max=32):
        episode = 0
        while episode < max_episodes:
            episode += 1
            _ = self.run_episode(sess, t_max)

    def run_episode(self, sess, t_max=32):
        t_start = t = 0
        episode_score = 0
        buffer_state = []
        buffer_action = []
        buffer_reward = []

        self.AC.init_network(sess)
        state = self.env.reset()
        while True:
            t += 1
            action = self.AC.get_stochastic_action(sess, state)
            reward, next_state, done = self.env.step(action)
            # buffer for loop
            episode_score += reward
            buffer_state.append(state)
            buffer_action.append(action)
            buffer_reward.append(reward)
            state = next_state

            if t - t_start == t_max or done:
                t_start = t
                terminal = self.get_bootstrap(done, sess, next_state)

                buffer_target = []
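                # n-step bootstrapped targets, computed backwards: R_t = r_t + GAMMA * R_{t+1}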
                for r in buffer_reward[::-1]:
                    terminal = r + GAMMA * terminal
                    buffer_target.append(terminal)
                buffer_target.reverse()

                inputs = np.stack(buffer_state, axis=0)
                actions = np.squeeze(np.vstack(buffer_action), axis=1)
                targets = np.squeeze(np.vstack(buffer_target), axis=1)
                buffer_state = []
                buffer_action = []
                buffer_reward = []
                # update Access gradients
                self.AC.train_step(sess, inputs, actions, targets)

                # update local network
                self.AC.init_network(sess)

            if done or t > MAX_EPISODE_LENGTH:
                if self.name == 'W0':
                    outputs = tuple(self.get_output(sess, inputs, actions, targets))
                    print('actor: %f, actor_grad: %f, policy mean: %f, policy: %f, entropy: %f, actor_norm: %f, '
                          'critic: %f, critic_grad: %f, value: %f, critic_norm: %f, value_mean: %f, advantage: %f'
                          % outputs)
                return episode_score
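The reversed loop in run_episode turns the reward buffer into n-step bootstrapped targets. A standalone sketch of the same computation with made-up numbers:

GAMMA = 0.99
buffer_reward = [1.0, 0.0, 2.0]    # rewards, oldest first (made-up)
terminal = 5.0                     # bootstrap value V(s_T) from the critic

buffer_target = []
for r in buffer_reward[::-1]:
    terminal = r + GAMMA * terminal
    buffer_target.append(terminal)
buffer_target.reverse()
# buffer_target ≈ [7.8117, 6.8805, 6.95],
# i.e. target_t = r_t + GAMMA * r_{t+1} + ... + GAMMA^(T-t) * V(s_T)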
Example #5
    def __init__(self, access, name, observation, action_size):
        self.Access = access
        self.AC = Framework(self.Access, observation, action_size, name)
        self.env = Account()
        self.name = name
Example #6
from agent.framework import Framework
from agent.access import Access

state_size = [50, 58, 5]
A = Access(state_size, 3)
F = Framework(A, state_size, 3, "W0")
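A hedged extension of the snippet above: in the usual A3C layout one shared Access instance holds the global parameters while each worker builds its own Framework under a distinct name; the worker count and names below are assumptions.

more_workers = [Framework(A, state_size, 3, "W%d" % i) for i in range(1, 4)]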
Example #7
    def __init__(self, name, access, batch_size, state_size, action_size):
        self.Access = access
        self.AC = Framework(name, self.Access, batch_size, state_size, action_size)
        self.env = Account()
        self.name = name
Example #8
class Agent(object):
    def __init__(self, name, access, batch_size, state_size, action_size):
        self.Access = access
        self.AC = Framework(name, self.Access, batch_size, state_size, action_size)
        self.env = Account()
        self.name = name

    def run(self, sess, max_episodes, t_max=8):
        buffer_score = []
        buffer_loss = []
        episode = 0
        while episode < max_episodes:
            episode += 1
            episode_score, outputs = self.run_episode(sess, t_max)
            buffer_score.append(episode_score)
            buffer_loss.append(outputs)
        return buffer_score, buffer_loss

    def run_episode(self, sess, t_max=8):
        t_start = t = 0
        episode_score = 1
        buffer_state = []
        buffer_action = []
        buffer_reward = []

        self.AC.init_or_update_local(sess)
        state = self.env.reset()
        while True:
            t += 1
            action = self.AC.get_stochastic_action(sess, state)
            next_state, reward, done = self.env.step(action)

            # buffer for loop
            episode_score *= (1 + reward / 100)
            buffer_state.append(state)
            buffer_action.append(action)
            buffer_reward.append(reward)
            state = next_state

            if t - t_start == t_max or done:
                t_start = t
                terminal = self.get_bootstrap(sess, next_state, done)

                buffer_target = []
                for r in buffer_reward[::-1]:
                    terminal = r + GAMMA * terminal
                    buffer_target.append(terminal)
                buffer_target.reverse()

                # stack
                inputs, gather_list = batch_stack(buffer_state)
                actions = np.vstack(buffer_action)
                targets = np.squeeze(np.vstack(buffer_target), axis=1)

                # empty buffer
                buffer_state = []
                buffer_action = []
                buffer_reward = []

                # update Access gradients
                self.AC.train_step(sess, inputs, actions, targets, gather_list)

                # update local network
                self.AC.init_or_update_local(sess)

            if done or t > MAX_EPISODE_LENGTH:
                outputs = self.get_losses(sess, inputs, actions, targets, gather_list)
                outputs = tuple(outputs)
                if self.name == 'W0':
                    print('actor: %f, actor_grad: %f, policy mean: %f, policy: %f, entropy: %f, '
                          'critic: %f, critic_grad: %f, value: %f, value_mean: %f, advantage: %f'
                          % outputs)
                return episode_score, outputs

    def get_bootstrap(self, sess, next_state, done):
        if done:
            terminal = 0
        else:
            terminal = self.AC.get_step_value(sess, next_state)
        return terminal

    def get_losses(self, sess, inputs, actions, targets, gather_list):
        return self.AC.get_losses(sess, inputs, actions, targets, gather_list)
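In run_episode above, episode_score compounds percentage returns instead of summing rewards: it starts at 1 and is multiplied by (1 + reward / 100) each step. A tiny standalone check with made-up rewards:

score = 1.0
for reward in (2.0, -1.0, 0.5):     # per-step returns, in percent (made-up)
    score *= (1 + reward / 100)
# score == 1.02 * 0.99 * 1.005 ≈ 1.0148, i.e. roughly +1.5% over the episode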
Example #9
import numpy as np
import tensorflow as tf
from agent.framework import Framework
from emulator_v0.main import Account

A = Account()
F = Framework()
# print(len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)))
state, universe = A.reset()

sess = tf.Session()
sess.run(tf.global_variables_initializer())

order = F.get_deterministic_policy(sess, state)
next_state, next_universe, reward, done, value, portfolio = \
    A.step(order, universe)

# prefill the replay cache (the same transition is pushed repeatedly here,
# presumably just to reach the minimum cache size before training)
for i in range(2048):
    F.update_cache(state, order, reward, next_state, done)

F.update_value_net(sess)
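A hedged continuation of the script above: keep interacting with the Account emulator, push real transitions into the cache, and periodically sync the target network. The method names come from the other examples on this page; the loop structure, epsilon value and sync interval are assumptions.

state, universe = A.reset()
for step in range(1000):
    order = F.get_stochastic_policy(sess, state, 0.9)   # exploratory action (epsilon mirrors Example #2)
    next_state, next_universe, reward, done, value, portfolio = \
        A.step(order, universe)
    F.update_cache(state, order, reward, next_state, done)
    state, universe = next_state, next_universe

    F.update_value_net(sess)                            # online-network update
    if step % 500 == 0:
        F.update_target_net(sess)                       # target-network sync
    if done:
        state, universe = A.reset()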