def __init__(self):
    self.agent = Framework()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)
    self.saver = tf.train.Saver()
    self.sess.run(tf.global_variables_initializer())
    self.sess.graph.finalize()
import tensorflow as tf

from agent.framework import Framework


class Agent(object):
    def __init__(self):
        self.agent = Framework()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())
        self.sess.graph.finalize()

    def get_deterministic_policy(self, inputs):
        return self.agent.get_deterministic_policy(self.sess, inputs)

    def get_stochastic_policy(self, inputs, epsilon=0.9):
        return self.agent.get_stochastic_policy(self.sess, inputs, epsilon)

    def update_cache(self, state, action, reward, next_state, done):
        self.agent.update_cache(state, action, reward, next_state, done)

    def update_eval(self):
        self.agent.update_value_net(self.sess)

    def update_target(self):
        self.agent.update_target_net(self.sess)

    def save_model(self, path="model/ddqn.ckpt"):
        self.saver.save(self.sess, path)

    def restore_model(self, path="model/ddqn.ckpt"):
        self.saver.restore(self.sess, path)

    def close(self):
        self.sess.close()
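# Usage sketch (not part of the original file): a minimal DDQN training loop
# built only from the Agent methods above and the Account emulator used by the
# test script at the bottom of this section. The environment signature, the
# warm-up size (batch_fill) and the target-sync interval are assumptions made
# for illustration.
def train_sketch(max_steps=10000, batch_fill=64, target_sync=32):
    from emulator_v0.main import Account  # assumed emulator, see test below

    env = Account()
    agent = Agent()
    state, universe = env.reset()
    for step in range(max_steps):
        order = agent.get_stochastic_policy(state)
        next_state, next_universe, reward, done, value, portfolio = \
            env.step(order, universe)
        agent.update_cache(state, order, reward, next_state, done)
        state, universe = next_state, next_universe
        if step > batch_fill:
            agent.update_eval()        # train the online value network
        if step % target_sync == 0:
            agent.update_target()      # sync the target network
        if done:
            state, universe = env.reset()
    agent.save_model()
    agent.close()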
def __init__(self, train=True):
    if train:
        self.agent = Framework(0.5)
    else:
        self.agent = Framework(1.0)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)
    trainable_variables = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, "value")
    # print(trainable_variables)
    self.saver = tf.train.Saver(trainable_variables)
    self.sess.run(tf.global_variables_initializer())
    self.sess.graph.finalize()
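# Hedged note (not in the original): the scoped Saver above tracks only the
# "value"-scope trainable variables, so an inference process can rebuild the
# graph with train=False and restore just those weights. Assuming this
# constructor belongs to an Agent class like the one above, restoring would
# look roughly like this; the checkpoint path is reused from the DDQN Agent
# and is an assumption.
agent = Agent(train=False)
agent.saver.restore(agent.sess, "model/ddqn.ckpt")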
import numpy as np

from agent.framework import Framework

# Account, GAMMA and MAX_EPISODE_LENGTH are expected to be provided by other
# modules of the original project; they are not shown in this snippet.


class ExplorerFramework(object):
    def __init__(self, access, name, observation, action_size):
        self.Access = access
        self.AC = Framework(self.Access, observation, action_size, name)
        self.env = Account()
        self.name = name

    def get_bootstrap(self, done, sess, next_state):
        # bootstrap the n-step return with the value estimate of the last state
        if done:
            terminal = 0
        else:
            terminal = self.AC.get_value(
                sess, np.expand_dims(next_state, axis=0))[0][0]
        return terminal

    def get_output(self, sess, inputs, actions, targets):
        return self.AC.get_losses(sess, inputs, actions, targets)

    def run(self, sess, max_episodes, t_max=32):
        episode = 0
        while episode < max_episodes:
            episode += 1
            _ = self.run_episode(sess, t_max)

    def run_episode(self, sess, t_max=32):
        t_start = t = 0
        episode_score = 0
        buffer_state = []
        buffer_action = []
        buffer_reward = []
        self.AC.init_network(sess)
        state = self.env.reset()
        while True:
            t += 1
            action = self.AC.get_stochastic_action(sess, state)
            reward, next_state, done = self.env.step(action)
            # buffer for loop
            episode_score += reward
            buffer_state.append(state)
            buffer_action.append(action)
            buffer_reward.append(reward)
            state = next_state
            if t - t_start == t_max or done:
                t_start = t
                terminal = self.get_bootstrap(done, sess, next_state)
                buffer_target = []
                for r in buffer_reward[::-1]:
                    terminal = r + GAMMA * terminal
                    buffer_target.append(terminal)
                buffer_target.reverse()
                inputs = np.stack(buffer_state, axis=0)
                actions = np.squeeze(np.vstack(buffer_action), axis=1)
                targets = np.squeeze(np.vstack(buffer_target), axis=1)
                buffer_state = []
                buffer_action = []
                buffer_reward = []
                # update Access gradients
                self.AC.train_step(sess, inputs, actions, targets)
                # update local network
                self.AC.init_network(sess)

            if done or t > MAX_EPISODE_LENGTH:
                if self.name == 'W0':
                    outputs = tuple(self.get_output(sess, inputs, actions, targets))
                    print('actor: %f, actor_grad: %f, policy mean: %f, policy: %f, entropy: %f, actor_norm: %f, '
                          'critic: %f, critic_grad: %f, value: %f, critic_norm: %f, value_mean: %f, advantage: %f'
                          % outputs)
                return episode_score
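# Launch sketch (not part of the original file): the usual A3C wiring for the
# ExplorerFramework above, with one shared Access parameter server and several
# explorer threads. NUM_WORKERS and the episode count are assumptions;
# state_size and action_size are copied from the smoke test below.
import threading

import tensorflow as tf

from agent.access import Access

NUM_WORKERS = 4
state_size = [50, 58, 5]
action_size = 3

with tf.Session() as sess:
    access = Access(state_size, action_size)
    workers = [ExplorerFramework(access, 'W%i' % i, state_size, action_size)
               for i in range(NUM_WORKERS)]
    sess.run(tf.global_variables_initializer())
    threads = []
    for worker in workers:
        t = threading.Thread(target=worker.run, args=(sess, 100))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()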
def __init__(self, access, name, observation, action_size):
    self.Access = access
    self.AC = Framework(self.Access, observation, action_size, name)
    self.env = Account()
    self.name = name
from agent.framework import Framework
from agent.access import Access

state_size = [50, 58, 5]
A = Access(state_size, 3)
F = Framework(A, state_size, 3, "W0")
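# Hedged continuation of the smoke test above: initialize the variables, pull
# the global (Access) weights into the local W0 network, and run one
# stochastic action on a random dummy state. The method names init_network and
# get_stochastic_action are taken from the ExplorerFramework code above; the
# random input is purely illustrative.
import numpy as np
import tensorflow as tf

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    F.init_network(sess)
    dummy_state = np.random.rand(*state_size).astype(np.float32)
    print(F.get_stochastic_action(sess, dummy_state))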
def __init__(self, name, access, batch_size, state_size, action_size):
    self.Access = access
    self.AC = Framework(name, self.Access, batch_size, state_size, action_size)
    self.env = Account()
    self.name = name
import numpy as np

# Framework, Account, batch_stack, GAMMA and MAX_EPISODE_LENGTH are expected
# to come from other modules of the original project; they are not shown in
# this snippet.


class Agent(object):
    def __init__(self, name, access, batch_size, state_size, action_size):
        self.Access = access
        self.AC = Framework(name, self.Access, batch_size, state_size, action_size)
        self.env = Account()
        self.name = name

    def run(self, sess, max_episodes, t_max=8):
        buffer_score = []
        buffer_loss = []
        episode = 0
        while episode < max_episodes:
            episode += 1
            episode_score, outputs = self.run_episode(sess, t_max)
            buffer_score.append(episode_score)
            buffer_loss.append(outputs)
        return buffer_score, buffer_loss

    def run_episode(self, sess, t_max=8):
        t_start = t = 0
        episode_score = 1
        buffer_state = []
        buffer_action = []
        buffer_reward = []
        self.AC.init_or_update_local(sess)
        state = self.env.reset()
        while True:
            t += 1
            action = self.AC.get_stochastic_action(sess, state)
            next_state, reward, done = self.env.step(action)
            # buffer for loop
            episode_score *= (1 + reward / 100)
            buffer_state.append(state)
            buffer_action.append(action)
            buffer_reward.append(reward)
            state = next_state
            if t - t_start == t_max or done:
                t_start = t
                terminal = self.get_bootstrap(sess, next_state, done)
                buffer_target = []
                for r in buffer_reward[::-1]:
                    terminal = r + GAMMA * terminal
                    buffer_target.append(terminal)
                buffer_target.reverse()
                # stack
                inputs, gather_list = batch_stack(buffer_state)
                actions = np.vstack(buffer_action)
                targets = np.squeeze(np.vstack(buffer_target), axis=1)
                # empty buffer
                buffer_state = []
                buffer_action = []
                buffer_reward = []
                # update Access gradients
                self.AC.train_step(sess, inputs, actions, targets, gather_list)
                # update local network
                self.AC.init_or_update_local(sess)

            if done or t > MAX_EPISODE_LENGTH:
                outputs = self.get_losses(sess, inputs, actions, targets, gather_list)
                outputs = tuple(outputs)
                if self.name == 'W0':
                    print('actor: %f, actor_grad: %f, policy mean: %f, policy: %f, entropy: %f, '
                          'critic: %f, critic_grad: %f, value: %f, value_mean: %f, advantage: %f'
                          % outputs)
                return episode_score, outputs

    def get_bootstrap(self, sess, next_state, done):
        if done:
            terminal = 0
        else:
            terminal = self.AC.get_step_value(sess, next_state)
        return terminal

    def get_losses(self, sess, inputs, actions, targets, gather_list):
        return self.AC.get_losses(sess, inputs, actions, targets, gather_list)
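# Clarifying helper (not in the original): the discounted n-step target
# computation used inside run_episode above, pulled out as a standalone
# function. It walks the reward buffer backwards, starting from the
# bootstrapped value of the last state.
def n_step_targets(rewards, bootstrap_value, gamma):
    targets = []
    running = bootstrap_value
    for r in reversed(rewards):
        running = r + gamma * running
        targets.append(running)
    targets.reverse()
    return targets

# Example: three rewards and a bootstrap value of 0.5 with gamma = 0.9
# n_step_targets([1.0, 0.0, 2.0], 0.5, 0.9) -> [2.9845, 2.205, 2.45]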
import numpy as np
import tensorflow as tf

from agent.framework import Framework
from emulator_v0.main import Account

A = Account()
F = Framework()
# print(len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)))

state, universe = A.reset()

sess = tf.Session()
sess.run(tf.global_variables_initializer())

order = F.get_deterministic_policy(sess, state)
next_state, next_universe, reward, done, value, portfolio = \
    A.step(order, universe)

for i in range(2048):
    F.update_cache(state, order, reward, next_state, done)

F.update_value_net(sess)
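# Hedged continuation of the DDQN smoke test above: after the value-network
# update, sync the target network and query the greedy policy for the next
# state. update_target_net mirrors the Agent.update_target wrapper earlier in
# this section; calling it here, and how often, is an assumption.
F.update_target_net(sess)
next_order = F.get_deterministic_policy(sess, next_state)
print(next_order)
sess.close()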