def __init__(self, state_size, window_size, trend, skip):
    self.state_size = state_size
    self.window_size = window_size
    self.half_window = window_size // 2
    self.trend = trend
    self.skip = skip
    tf.reset_default_graph()
    self.INITIAL_FEATURES = np.zeros((4, self.state_size))
    self.X = tf.placeholder(tf.float32, (None, None, self.state_size))
    self.Y = tf.placeholder(tf.float32, (None, self.OUTPUT_SIZE))
    cell = tf.nn.rnn_cell.LSTMCell(self.LAYER_SIZE, state_is_tuple=False)
    self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * self.LAYER_SIZE))
    self.rnn, self.last_state = tf.nn.dynamic_rnn(
        inputs=self.X, cell=cell, dtype=tf.float32,
        initial_state=self.hidden_layer)
    tensor_action, tensor_validation = tf.split(self.rnn[:, -1], 2, 1)
    feed_action = tf.layers.dense(tensor_action, self.OUTPUT_SIZE)
    feed_validation = tf.layers.dense(tensor_validation, 1)
    self.logits = feed_validation + tf.subtract(
        feed_action, tf.reduce_mean(feed_action, axis=1, keep_dims=True))
    self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
    self.optimizer = tf.train.AdamOptimizer(
        learning_rate=self.LEARNING_RATE).minimize(self.cost)
    self.sess = tf.InteractiveSession()
    self.sess.run(tf.global_variables_initializer())
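# Why the head above is split in two: this is a dueling Q-network, where one
# stream estimates the state value V(s) and the other the per-action advantage
# A(s, a), combined as Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)).
# Self-contained NumPy sketch of that aggregation (illustrative values only):
import numpy as np

value = np.array([[1.5]])                 # V(s), shape (batch, 1)
advantage = np.array([[0.2, -0.1, 0.5]])  # A(s, a), shape (batch, actions)
q_values = value + (advantage - advantage.mean(axis=1, keepdims=True))
# q_values == [[1.5, 1.2, 1.8]]; subtracting the mean keeps V and A identifiable.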
def __init__(self, state_size, window_size, trend, skip, batch_size):
    self.state_size = state_size
    self.window_size = window_size
    self.half_window = window_size // 2
    self.trend = trend
    self.skip = skip
    self.action_size = 3
    self.batch_size = batch_size
    self.memory = deque(maxlen=1000)
    self.inventory = []
    self.gamma = 0.95
    self.epsilon = 0.5
    self.epsilon_min = 0.01
    self.epsilon_decay = 0.999
    tf.reset_default_graph()
    self.sess = tf.InteractiveSession()
    self.X = tf.placeholder(tf.float32, [None, self.state_size])
    self.Y = tf.placeholder(tf.float32, [None, self.action_size])
    feed = tf.layers.dense(self.X, 512, activation=tf.nn.relu)
    tensor_action, tensor_validation = tf.split(feed, 2, 1)
    feed_action = tf.layers.dense(tensor_action, self.action_size)
    feed_validation = tf.layers.dense(tensor_validation, 1)
    self.logits = feed_validation + tf.subtract(
        feed_action, tf.reduce_mean(feed_action, axis=1, keep_dims=True))
    self.cost = tf.reduce_mean(tf.square(self.Y - self.logits))
    self.optimizer = tf.train.GradientDescentOptimizer(1e-5).minimize(self.cost)
    self.sess.run(tf.global_variables_initializer())
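# Sketch only: the helper methods below are assumptions, not shown in the
# source. Given the replay memory, epsilon schedule, and dueling graph defined
# above, an epsilon-greedy policy plus a one-batch experience-replay update
# could look like this (assumes `import random`, `numpy as np`, and that the
# memory stores (state, action, reward, next_state, done) tuples).
def act(self, state):
    # Explore with probability epsilon, otherwise act greedily on Q-values.
    if random.random() <= self.epsilon:
        return random.randrange(self.action_size)
    q = self.sess.run(self.logits, feed_dict={self.X: [state]})
    return np.argmax(q[0])

def replay(self, batch_size):
    # Regress Q(s, a) towards r + gamma * max_a' Q(s', a') on a sampled batch.
    batch = random.sample(self.memory, min(batch_size, len(self.memory)))
    states = np.array([b[0] for b in batch])
    next_states = np.array([b[3] for b in batch])
    q = self.sess.run(self.logits, feed_dict={self.X: states})
    q_next = self.sess.run(self.logits, feed_dict={self.X: next_states})
    for i, (state, action, reward, next_state, done) in enumerate(batch):
        q[i, action] = reward if done else reward + self.gamma * np.amax(q_next[i])
    cost, _ = self.sess.run([self.cost, self.optimizer],
                            feed_dict={self.X: states, self.Y: q})
    if self.epsilon > self.epsilon_min:
        self.epsilon *= self.epsilon_decay
    return cost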
def __init__(self, name, input_size, output_size, size_layer):
    with tf.variable_scope(name):
        self.X = tf.placeholder(tf.float32, (None, input_size))
        feed_actor = tf.layers.dense(self.X, size_layer, activation=tf.nn.relu)
        tensor_action, tensor_validation = tf.split(feed_actor, 2, 1)
        feed_action = tf.layers.dense(tensor_action, output_size)
        feed_validation = tf.layers.dense(tensor_validation, 1)
        self.logits = feed_validation + tf.subtract(
            feed_action, tf.reduce_mean(feed_action, axis=1, keep_dims=True))
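# Usage sketch (assumptions: the class above is named Actor and the sizes are
# toy values). Actor-critic setups typically keep an online and a target copy
# of the network under separate variable scopes and hard-copy weights between
# them, which the scoped construction above makes straightforward.
tf.reset_default_graph()
actor = Actor('actor-original', input_size=10, output_size=3, size_layer=256)
actor_target = Actor('actor-target', input_size=10, output_size=3, size_layer=256)
copy_ops = [
    tf.assign(t, o) for t, o in zip(
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'actor-target'),
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'actor-original'))
]
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
sess.run(copy_ops)  # hard update: target weights <- online weights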
def __init__(self, input_size, output_size, layer_size, learning_rate):
    self.X = tf.placeholder(tf.float32, (None, input_size))
    self.Y = tf.placeholder(tf.float32, (None, output_size))
    feed = tf.layers.dense(self.X, layer_size, activation=tf.nn.relu)
    tensor_action, tensor_validation = tf.split(feed, 2, 1)
    feed_action = tf.layers.dense(tensor_action, output_size)
    feed_validation = tf.layers.dense(tensor_validation, 1)
    self.logits = feed_validation + tf.subtract(
        feed_action, tf.reduce_mean(feed_action, axis=1, keep_dims=True))
    self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
    self.optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(self.cost)
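# Minimal training-step sketch (assumptions: the class above is named Model;
# toy shapes and random data stand in for real states and target Q-values).
# The class owns no session, so the caller creates one, initialises variables,
# and runs one optimisation step per (state, target-Q) batch.
import numpy as np
import tensorflow as tf

tf.reset_default_graph()
model = Model(input_size=10, output_size=3, layer_size=256, learning_rate=1e-3)
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
states = np.random.randn(32, 10).astype(np.float32)
target_q = np.random.randn(32, 3).astype(np.float32)
cost, _ = sess.run([model.cost, model.optimizer],
                   feed_dict={model.X: states, model.Y: target_q})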
def __init__(self, name, input_size, output_size, size_layer):
    with tf.variable_scope(name):
        self.X = tf.placeholder(tf.float32, (None, None, input_size))
        self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * size_layer))
        cell = tf.nn.rnn_cell.LSTMCell(size_layer, state_is_tuple=False)
        self.rnn, self.last_state = tf.nn.dynamic_rnn(
            inputs=self.X, cell=cell, dtype=tf.float32,
            initial_state=self.hidden_layer)
        tensor_action, tensor_validation = tf.split(self.rnn[:, -1], 2, 1)
        feed_action = tf.layers.dense(tensor_action, output_size)
        feed_validation = tf.layers.dense(tensor_validation, 1)
        self.logits = feed_validation + tf.subtract(
            feed_action, tf.reduce_mean(feed_action, axis=1, keep_dims=True))
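# Prediction sketch (assumptions: the recurrent class above is named Actor;
# toy sizes and random data). Because the LSTM state is a plain placeholder,
# callers start from zeros and feed self.last_state back in on the next call.
import numpy as np
import tensorflow as tf

tf.reset_default_graph()
actor = Actor('actor-rnn', input_size=10, output_size=3, size_layer=256)
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
window = np.random.randn(1, 20, 10).astype(np.float32)  # (batch, time, features)
hidden = np.zeros((1, 2 * 256), dtype=np.float32)       # zero initial LSTM state
logits, hidden = sess.run([actor.logits, actor.last_state],
                          feed_dict={actor.X: window, actor.hidden_layer: hidden})
# `hidden` now carries the recurrent memory into the next prediction.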
def __init__(self, name, input_size, output_size, size_layer, learning_rate):
    with tf.variable_scope(name):
        self.X = tf.placeholder(tf.float32, (None, input_size))
        self.Y = tf.placeholder(tf.float32, (None, output_size))
        self.REWARD = tf.placeholder(tf.float32, (None, 1))
        feed_critic = tf.layers.dense(self.X, size_layer, activation=tf.nn.relu)
        # Dueling-style combination of a value stream and an advantage stream.
        tensor_action, tensor_validation = tf.split(feed_critic, 2, 1)
        feed_action = tf.layers.dense(tensor_action, output_size)
        feed_validation = tf.layers.dense(tensor_validation, 1)
        feed_critic = feed_validation + tf.subtract(
            feed_action, tf.reduce_mean(feed_action, axis=1, keep_dims=True))
        # Mix in the actor's output (self.Y) before regressing onto the reward.
        feed_critic = tf.nn.relu(feed_critic) + self.Y
        feed_critic = tf.layers.dense(feed_critic, size_layer // 2,
                                      activation=tf.nn.relu)
        self.logits = tf.layers.dense(feed_critic, 1)
        self.cost = tf.reduce_mean(tf.square(self.REWARD - self.logits))
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
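# Training-feed sketch (assumptions: the class above is named Critic and it is
# paired with a 3-action actor; toy data stands in for real states, actor
# outputs, and rewards). The critic receives the actor's output through self.Y
# and regresses its value estimate onto the observed reward.
import numpy as np
import tensorflow as tf

tf.reset_default_graph()
critic = Critic('critic-original', input_size=10, output_size=3,
                size_layer=256, learning_rate=1e-3)
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
states = np.random.randn(16, 10).astype(np.float32)
actor_out = np.random.randn(16, 3).astype(np.float32)  # actor's action logits
rewards = np.random.randn(16, 1).astype(np.float32)
cost, _ = sess.run([critic.cost, critic.optimizer],
                   feed_dict={critic.X: states, critic.Y: actor_out,
                              critic.REWARD: rewards})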
def __init__(self, name, input_size, output_size, size_layer, learning_rate):
    with tf.variable_scope(name):
        self.X = tf.placeholder(tf.float32, (None, None, input_size))
        self.Y = tf.placeholder(tf.float32, (None, output_size))
        self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * size_layer))
        self.REWARD = tf.placeholder(tf.float32, (None, 1))
        cell = tf.nn.rnn_cell.LSTMCell(size_layer, state_is_tuple=False)
        self.rnn, self.last_state = tf.nn.dynamic_rnn(
            inputs=self.X, cell=cell, dtype=tf.float32,
            initial_state=self.hidden_layer)
        # Dueling-style combination on the last RNN output.
        tensor_action, tensor_validation = tf.split(self.rnn[:, -1], 2, 1)
        feed_action = tf.layers.dense(tensor_action, output_size)
        feed_validation = tf.layers.dense(tensor_validation, 1)
        feed_critic = feed_validation + tf.subtract(
            feed_action, tf.reduce_mean(feed_action, axis=1, keep_dims=True))
        # Mix in the actor's output (self.Y) before regressing onto the reward.
        feed_critic = tf.nn.relu(feed_critic) + self.Y
        feed_critic = tf.layers.dense(feed_critic, size_layer // 2,
                                      activation=tf.nn.relu)
        self.logits = tf.layers.dense(feed_critic, 1)
        self.cost = tf.reduce_mean(tf.square(self.REWARD - self.logits))
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
def __init__(self, state_size, window_size, trend, skip):
    self.state_size = state_size
    self.window_size = window_size
    self.half_window = window_size // 2
    self.trend = trend
    self.skip = skip
    tf.reset_default_graph()
    self.X = tf.placeholder(tf.float32, (None, self.state_size))
    self.Y = tf.placeholder(tf.float32, (None, self.state_size))
    self.ACTION = tf.placeholder(tf.float32, (None))
    self.REWARD = tf.placeholder(tf.float32, (None))
    self.batch_size = tf.shape(self.ACTION)[0]

    # Curiosity model: predict the next state from (state, action); its
    # prediction error is added to the extrinsic reward as an intrinsic bonus.
    with tf.variable_scope('curiosity_model'):
        action = tf.reshape(self.ACTION, (-1, 1))
        state_action = tf.concat([self.X, action], axis=1)
        save_state = tf.identity(self.Y)
        feed = tf.layers.dense(state_action, 32, activation=tf.nn.relu)
        self.curiosity_logits = tf.layers.dense(feed, self.state_size)
        self.curiosity_cost = tf.reduce_sum(
            tf.square(save_state - self.curiosity_logits), axis=1)
        self.curiosity_optimizer = tf.train.RMSPropOptimizer(
            self.LEARNING_RATE).minimize(tf.reduce_mean(self.curiosity_cost))

    total_reward = tf.add(self.curiosity_cost, self.REWARD)

    with tf.variable_scope("q_model"):
        # Online (eval) network, trained every step.
        with tf.variable_scope("eval_net"):
            x_action = tf.layers.dense(self.X, 128, tf.nn.relu)
            tensor_action, tensor_validation = tf.split(x_action, 2, 1)
            feed_action = tf.layers.dense(tensor_action, self.OUTPUT_SIZE)
            feed_validation = tf.layers.dense(tensor_validation, 1)
            self.logits = feed_validation + tf.subtract(
                feed_action, tf.reduce_mean(feed_action, axis=1, keep_dims=True))

        # Target network, only refreshed through target_replace_op below.
        with tf.variable_scope("target_net"):
            y_action = tf.layers.dense(self.Y, 128, tf.nn.relu)
            tensor_action, tensor_validation = tf.split(y_action, 2, 1)
            feed_action = tf.layers.dense(tensor_action, self.OUTPUT_SIZE)
            feed_validation = tf.layers.dense(tensor_validation, 1)
            y_q = feed_validation + tf.subtract(
                feed_action, tf.reduce_mean(feed_action, axis=1, keep_dims=True))

        q_target = total_reward + self.GAMMA * tf.reduce_max(y_q, axis=1)
        action = tf.cast(self.ACTION, tf.int32)
        action_indices = tf.stack(
            [tf.range(self.batch_size, dtype=tf.int32), action], axis=1)
        q = tf.gather_nd(params=self.logits, indices=action_indices)
        self.cost = tf.losses.mean_squared_error(labels=q_target, predictions=q)
        self.optimizer = tf.train.RMSPropOptimizer(self.LEARNING_RATE).minimize(
            self.cost,
            var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       "q_model/eval_net"))

    t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                 scope='q_model/target_net')
    e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                 scope='q_model/eval_net')
    self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

    self.sess = tf.InteractiveSession()
    self.sess.run(tf.global_variables_initializer())
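# Training-loop sketch (assumptions, not shown in the source: batches of
# (state, action, reward, next_state) have already been collected, and the
# sync interval of 500 steps is illustrative). Each step updates the curiosity
# model and the eval Q-network together; the target network is hard-synced
# periodically through target_replace_op.
# if step % 500 == 0:
#     self.sess.run(self.target_replace_op)
# cost, _, _ = self.sess.run(
#     [self.cost, self.optimizer, self.curiosity_optimizer],
#     feed_dict={self.X: states,        # current states
#                self.Y: next_states,   # next states (target net + curiosity label)
#                self.ACTION: actions,  # actions taken, fed as floats
#                self.REWARD: rewards}) # extrinsic rewards; curiosity bonus is added in-graph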