def __init__(self, num_actions=2, learning_rate=0.001):
    self.session = tf.Session()

    # Online Q-network
    self.s = tf.placeholder(tf.float32, [None, 10])
    net = tflearn.fully_connected(self.s, 10, activation='relu')
    self.q_values = tflearn.fully_connected(net, num_actions)
    network_params = tf.trainable_variables()

    # Target Q-network
    self.st = tf.placeholder(tf.float32, [None, 10])
    target_net = tflearn.fully_connected(self.st, 10, activation='relu')
    self.target_q_values = tflearn.fully_connected(target_net, num_actions)
    target_network_params = tf.trainable_variables()[len(network_params):]

    # Op for periodically copying online weights into the target network
    self.reset_target_network_params = [
        target_network_params[i].assign(network_params[i])
        for i in range(len(target_network_params))
    ]

    # Cost and gradient update op
    self.a = tf.placeholder(tf.float32, [None, num_actions])
    self.y = tf.placeholder(tf.float32, [None])
    action_q_values = tf.reduce_sum(tf.multiply(self.q_values, self.a),
                                    reduction_indices=1)
    cost = tflearn.mean_square(action_q_values, self.y)
    optimizer = tf.train.RMSPropOptimizer(learning_rate)
    self.grad_update = optimizer.minimize(cost, var_list=network_params)

    self.session.run(tf.global_variables_initializer())
    self.session.run(self.reset_target_network_params)
    self.t = 0

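# A minimal usage sketch (not from the original source) showing how the ops built
# above are typically driven: Bellman targets come from the target network and are
# fed into grad_update, and the target network is re-synced every few steps.
# `agent`, the batch arrays, `gamma`, and `target_update_freq` are illustrative
# names only, not part of the original code.
import numpy as np

def train_step(agent, batch_s, batch_a, batch_r, batch_s2, batch_done,
               gamma=0.99, target_update_freq=1000):
    # Q-values of the next states from the (frozen) target network
    target_q = agent.session.run(agent.target_q_values,
                                 feed_dict={agent.st: batch_s2})
    # y_i = r_i for terminal transitions, r_i + gamma * max_a' Q_target otherwise
    y = batch_r + gamma * np.max(target_q, axis=1) * (1.0 - batch_done)
    agent.session.run(agent.grad_update,
                      feed_dict={agent.s: batch_s, agent.a: batch_a, agent.y: y})
    agent.t += 1
    if agent.t % target_update_freq == 0:
        agent.session.run(agent.reset_target_network_params)
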
def __init__(self, sess, state_dim, action_dim, tau, num_actor_vars):
    self.sess = sess
    self.s_dim = state_dim
    self.a_dim = action_dim
    self.tau = tau

    # Create the critic network
    self.inputs, self.action, self.out = self.create_critic_network()
    self.network_params = tf.trainable_variables()[num_actor_vars:]

    self.learning_rate = tf.placeholder(tf.float32, [None])

    # Target Network
    self.target_inputs, self.target_action, self.target_out = self.create_critic_network()
    self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):]

    # Op for periodically updating target network with online network weights with regularization
    self.update_target_network_params = \
        [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) +
                                              tf.multiply(self.target_network_params[i], 1. - self.tau))
         for i in range(len(self.target_network_params))]

    # Network target (y_i)
    self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

    # Define loss and optimization Op
    self.loss = tflearn.mean_square(self.predicted_q_value, self.out)
    self.lr = tf.gather_nd(self.learning_rate, [0])
    self.optimize = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

    # Get the gradient of the net w.r.t. the action
    self.action_grads = tf.gradients(self.out, self.action)

def __init__(self, sess, state_dim, action_dim, learning_rate, tau, num_actor_vars):
    self.sess = sess
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.learning_rate = learning_rate
    self.tau = tau

    # Critic Network
    self._input, self._action, self._out = self.create_critic_network()
    self._network_params = tf.trainable_variables()[num_actor_vars:]

    self._input_clone, self._action_clone, self._out_clone = self.create_critic_network()
    self._network_clone_params = tf.trainable_variables()[num_actor_vars + len(self._network_params):]

    # Clone network update
    self._update_network_clone_params = \
        [self._network_clone_params[i].assign(
            tf.multiply(self._network_params[i], tau) +
            tf.multiply(self._network_clone_params[i], (1 - tau)))
         for i in range(len(self._network_clone_params))]

    # Network target (y_t)
    self._predicted_q_value = tf.placeholder(tf.float32, [None, 1])

    # Define critic loss
    self._loss = tflearn.mean_square(self._predicted_q_value, self._out)
    self._optimizer = tf.train.AdamOptimizer(
        learning_rate=self.learning_rate).minimize(self._loss)

    # Get the gradient w.r.t. the action
    self._action_grads = tf.gradients(self._out, self._action)

def __init__(self, session, dim_state, dim_action, learning_rate, tau=0.01):
    self._sess = session
    self._dim_s = dim_state
    self._dim_a = dim_action
    self._lr = learning_rate

    self._inputs = tflearn.input_data(shape=[None, self._dim_s])
    self._out, self._params = self.buildNetwork(self._inputs, 'dqn')
    self._out_target, self._params_target = self.buildNetwork(self._inputs, 'target')

    self._actions = tf.placeholder(tf.float32, [None, self._dim_a])
    self._y_values = tf.placeholder(tf.float32, [None])

    # Q-value of the action actually taken
    action_q_values = tf.reduce_sum(tf.multiply(self._out, self._actions),
                                    reduction_indices=1)

    # Soft update: target <- tau * online + (1 - tau) * target
    self._update_target = \
        [t_p.assign(tau * g_p + (1 - tau) * t_p)
         for g_p, t_p in zip(self._params, self._params_target)]

    self.loss = tflearn.mean_square(self._y_values, action_q_values)
    self.optimize = tf.train.AdamOptimizer(self._lr).minimize(self.loss)

def __init__(self, sess, state_dim, action_dim, learning_rate, tau, num_actor_vars):
    self.sess = sess
    self.s_dim = state_dim
    self.a_dim = action_dim
    self.learning_rate = learning_rate
    self.tau = tau

    # Create the critic network
    self.inputs, self.action, self.out = self.create_critic_network()
    self.network_params = tf.trainable_variables()[num_actor_vars:]

    # Target Network
    self.target_inputs, self.target_action, self.target_out = self.create_critic_network()
    self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):]

    # Op for periodically updating target network with online network weights with regularization
    self.update_target_network_params = \
        [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) +
                                              tf.multiply(self.target_network_params[i], 1. - self.tau))
         for i in range(len(self.target_network_params))]

    # Network target (y_i)
    self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

    # Define loss and optimization Op
    self.loss = tflearn.mean_square(self.predicted_q_value, self.out)
    self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    # Get the gradient of the net w.r.t. the action.
    # For each action in the minibatch (i.e., for each x in xs),
    # this will sum up the gradients of each critic output in the minibatch
    # w.r.t. that action (i.e., sum of dy/dx over all ys). We then divide
    # through by the minibatch size to scale the gradients down correctly.
    self.action_grads = tf.div(tf.gradients(self.out, self.action),
                               tf.constant(MINIBATCH_SIZE, dtype=tf.float32))

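# A minimal usage sketch (not from the original source) of the critic built above,
# following the usual DDPG loop: y_i comes from the target networks, the online
# critic is fit to y_i, the action gradients are handed to the actor, and the
# target weights are softly updated. `critic`, `actor`, and its attributes
# (actor.target_scaled_out, actor.target_inputs) are assumed names only.
def ddpg_critic_step(sess, critic, actor, s, a, r, s2, done, gamma=0.99):
    # y_i = r_i + gamma * Q_target(s2, mu_target(s2)); r and done shaped [batch, 1]
    target_a = sess.run(actor.target_scaled_out, feed_dict={actor.target_inputs: s2})
    target_q = sess.run(critic.target_out, feed_dict={critic.target_inputs: s2,
                                                      critic.target_action: target_a})
    y = r + gamma * target_q * (1.0 - done)
    # Fit the online critic to the Bellman targets
    sess.run(critic.optimize, feed_dict={critic.inputs: s, critic.action: a,
                                         critic.predicted_q_value: y})
    # Gradient of Q w.r.t. the action, used for the deterministic policy gradient
    grads = sess.run(critic.action_grads, feed_dict={critic.inputs: s, critic.action: a})
    # Soft target update
    sess.run(critic.update_target_network_params)
    return grads[0]  # tf.gradients returns a list
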
def __init__(self, sess, scope, mode, logger):
    self.sess = sess
    self.state_dim = pm.STATE_DIM
    self.action_dim = pm.ACTION_DIM
    self.scope = scope
    self.mode = mode
    self.logger = logger

    # Value network and its inputs
    self.input, self.output = self._create_nn()
    self.label = tf.placeholder(tf.float32, [None, self.action_dim])
    self.action = tf.placeholder(tf.float32, [None, None])
    self.entropy_weight = pm.ENTROPY_WEIGHT
    self.td_target = tf.placeholder(tf.float32, [None, 1])

    # Mean-squared TD error
    self.loss = tflearn.mean_square(self.output, self.td_target)

    self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope)
    self.gradients = tf.gradients(self.loss, self.weights)
    self.lr = pm.LEARNING_RATE

    if pm.OPTIMIZER == "Adam":
        self.optimize = tf.train.AdamOptimizer(learning_rate=self.lr).apply_gradients(
            zip(self.gradients, self.weights))
    elif pm.OPTIMIZER == "RMSProp":
        self.optimize = tf.train.RMSPropOptimizer(learning_rate=self.lr).apply_gradients(
            zip(self.gradients, self.weights))

    # Placeholders and ops for externally setting the network weights
    self.weights_phs = []
    for weight in self.weights:
        self.weights_phs.append(tf.placeholder(tf.float32, shape=weight.get_shape()))
    self.set_weights_op = []
    for idx, weights_ph in enumerate(self.weights_phs):
        self.set_weights_op.append(self.weights[idx].assign(weights_ph))

def __init__(self, sess, state_dim, action_dim, learning_rate, tau, num_actor_vars):
    self.sess = sess
    self.s_dim = state_dim
    self.a_dim = action_dim
    self.learning_rate = learning_rate
    self.tau = tau

    # Create the critic network
    self.inputs, self.action, self.out = self.create_critic_network()
    self.network_params = tf.trainable_variables()[num_actor_vars:]

    # Target Network
    self.target_inputs, self.target_action, self.target_out = self.create_critic_network()
    self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):]

    # Op for periodically updating target network with online network weights with regularization
    self.update_target_network_params = \
        [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) +
                                              tf.multiply(self.target_network_params[i], 1. - self.tau))
         for i in range(len(self.target_network_params))]

    # Network target (y_i)
    self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

    # Define loss and optimization Op
    self.loss = tflearn.mean_square(self.predicted_q_value, self.out)
    self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    # Get the gradient of the net w.r.t. the action
    self.action_grads = tf.gradients(self.out, self.action)

def __init__(self, sess, state_dim, action_dim, learning_rate, tau, gamma):
    self.sess = sess
    self.s_dim = state_dim
    self.a_dim = action_dim
    self.learning_rate = learning_rate
    self.tau = tau
    self.gamma = gamma

    variable_start = len(tf.trainable_variables())

    self.inputs, self.action, self.out = self.create_critic_network()
    self.network_params = tf.trainable_variables()[variable_start:]

    self.target_inputs, self.target_action, self.target_out = self.create_critic_network()
    self.target_network_params = tf.trainable_variables()[(len(self.network_params) + variable_start):]

    # Op for periodically updating target network with online network
    # weights with regularization
    self.update_target_network_params = \
        [self.target_network_params[i].assign(
            tf.multiply(self.network_params[i], self.tau) +
            tf.multiply(self.target_network_params[i], 1. - self.tau))
         for i in range(len(self.target_network_params))]

    # Network target (y_i)
    self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

    # Define loss and optimization Op
    self.loss = tflearn.mean_square(self.predicted_q_value, self.out)
    self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    self.action_grads = tf.gradients(self.out, self.action)  # shape == [batch_size, a_dim]

def build_graph(num_actions):
    inputs, q_values = build_deepQnetwork(num_actions, action_repeat)
    network_params = tf.trainable_variables()

    target_inputs, target_q_values = build_deepQnetwork(num_actions, action_repeat)
    target_network_params = tf.trainable_variables()[len(network_params):]

    reset_target_network_params = \
        [target_network_params[i].assign(network_params[i])
         for i in range(len(target_network_params))]

    a = tf.placeholder(tf.float32, [None, num_actions])
    y = tf.placeholder(tf.float32, [None])
    action_q_values = tf.reduce_sum(tf.multiply(q_values, a), reduction_indices=1)
    cost = tflearn.mean_square(action_q_values, y)
    optimizer = tf.train.RMSPropOptimizer(learning_rate)
    grad_update = optimizer.minimize(cost, var_list=network_params)

    graph_ops = {
        "inputs": inputs,
        "q_values": q_values,
        "target_inputs": target_inputs,
        "target_q_values": target_q_values,
        "reset_target_network_params": reset_target_network_params,
        "a": a,
        "y": y,
        "grad_update": grad_update
    }
    return graph_ops

def _create_optimizer(self):
    # Global step counter
    self._global_step = tf.Variable(0.0, trainable=False, name="global_step")

    # Network target (y_i)
    self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

    # Define loss
    self.loss = tflearn.mean_square(self.predicted_q_value, self.out)

    # Define optimization Op
    if self.opt == "adam":
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
    elif self.opt == "sgd":
        self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
    else:
        raise ValueError("{} is not a valid optimizer".format(self.opt))

    grad_and_vars = self.optimizer.compute_gradients(self.loss, self.network_params)
    grads = [g for g, _ in grad_and_vars]
    grads, self.grad_norm = tf.clip_by_global_norm(grads, 0.5)
    # Note: the clipped gradients are only used to report grad_norm; the update
    # below still applies the unclipped grad_and_vars.
    # grad_and_vars = [(g, v) for g, v in zip(grads, self.network_params)]
    self.optimize = self.optimizer.apply_gradients(grad_and_vars,
                                                   global_step=self._global_step)

    self.action_grads = tf.gradients(self.out, self.action)
    self._lr_decay = tf.assign(self.learning_rate,
                               self.learning_rate * self.learning_rate_decay)

def __init__(self, sess, s_dim, action_dim, learning_rate, tau, gamma, num_actor_vars):
    self.sess = sess
    self.s_dim = s_dim
    self.a_dim = action_dim
    self.learning_rate = learning_rate
    self.tau = tau
    self.gamma = gamma

    # Create the critic network
    self.inputs, self.action, self.out = self.create_critic_network()
    self.network_params = tf.trainable_variables()[num_actor_vars:]

    # Target Network
    self.target_inputs, self.target_action, self.target_out = self.create_critic_network()
    self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):]

    # Op for periodically updating target network with online network
    # weights with regularization
    self.update_target_network_params = \
        [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau)
                                              + tf.multiply(self.target_network_params[i], 1. - self.tau))
         for i in range(len(self.target_network_params))]

    # Network target (y_i)
    self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

    # Define loss and optimization Op
    self.loss = tflearn.mean_square(self.predicted_q_value, self.out)
    self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    self.action_grads = tf.gradients(self.out, self.action)

def model(self, model_type=None, out_embedding_dim=32, layer_size=32,
          tensorboard_verbose=3, batch_norm=2, n_layers=2, learning_rate=0.001):
    if self.data_encoding == 'one_hot':
        input_shape = [None, self.kmer, 20]
    else:
        input_shape = [None, self.kmer]

    # Adding layers based on model type
    net = tflearn.input_data(shape=input_shape)
    deep_layers_output = self.add_deep_layers(net, model_type, out_embedding_dim,
                                              layer_size, n_layers)
    net = tflearn.fully_connected(deep_layers_output, 100, activation='prelu')
    if batch_norm > 0:
        net = tflearn.layers.normalization.batch_normalization(net)
    net = tflearn.dropout(net, 0.4)
    net = tflearn.fully_connected(net, 1, activation='sigmoid')
    if batch_norm > 1:
        net = tflearn.layers.normalization.batch_normalization(net)

    with tf.name_scope("TargetsData"):
        # placeholder for target variable (i.e. trainY input)
        targetY = tf.placeholder(shape=[None, 1], dtype=tf.float32, name="Y")

    network = tflearn.regression(net,
                                 placeholder=targetY,
                                 optimizer=self.optimizer(learning_rate),
                                 learning_rate=learning_rate,
                                 loss=tflearn.mean_square(net, targetY),
                                 metric=self.accuracy(net, targetY))

    model = tflearn.DNN(network, tensorboard_verbose=tensorboard_verbose)
    return model

def model(self, type=None, mode="train", num_layers=2, state_size=32,
          learning_rate=0.001, tensorboard_verbose=3):
    net = tflearn.input_data(shape=[None, 9])
    net = tflearn.embedding(net, input_dim=21, output_dim=32, weights_init='xavier')

    if type == 'bi_rnn':
        out_rnn = tflearn.bidirectional_rnn(net,
                                            tflearn.BasicLSTMCell(32),
                                            tflearn.BasicLSTMCell(32))
    elif type == 'basic_lstm':
        for i in range(4):
            net = tflearn.lstm(net, n_units=40, return_seq=True)
        out_rnn = tflearn.lstm(net, n_units=40, return_seq=False)
    elif type == 'basic_rnn':
        out_rnn = tflearn.simple_rnn(net, 40)
    else:
        out_rnn = net

    net = tflearn.fully_connected(out_rnn, 100, activation='prelu')
    net = tflearn.layers.normalization.batch_normalization(net)
    net = tflearn.dropout(net, 0.1)
    net = tflearn.fully_connected(net, 1, activation='sigmoid')

    """
    single_cell = getattr(tf.contrib.rnn, cell_type)(cell_size, state_is_tuple=True)
    if num_layers == 1:
        cell = single_cell
    else:
        cell = tf.contrib.rnn.MultiRNNCell([single_cell] * num_layers)
    """

    with tf.name_scope("TargetsData"):
        # placeholder for target variable (i.e. trainY input)
        targetY = tf.placeholder(shape=[None, 1], dtype=tf.float32, name="Y")

    network = tflearn.regression(net,
                                 placeholder=targetY,
                                 optimizer=self.optimizer(learning_rate),
                                 learning_rate=learning_rate,
                                 loss=tflearn.mean_square(net, targetY),
                                 metric=self.accuracy(net, targetY),
                                 name='no_rnn')
    model = tflearn.DNN(network, tensorboard_verbose=tensorboard_verbose)
    return model

def __init__(self, session, dim_state, dim_action, learning_rate, net_name='pong_syr'):
    self.__sess = session
    self.__dim_s = dim_state
    self.__dim_a = dim_action
    self.__lr = learning_rate

    if net_name == 'pong_syr':
        self.__inputs, self.__out = build_cnn_pong(dim_state, dim_action)
    elif net_name == 'pong_org':
        self.__inputs, self.__out = build_simple_cnn(dim_state, dim_action)
    else:
        self.__inputs, self.__out = build_cnn_bird(dim_state, dim_action)

    self.__actions = tf.placeholder(tf.float32, [None, self.__dim_a])
    self.__y_values = tf.placeholder(tf.float32, [None])

    action_q_values = tf.reduce_sum(tf.multiply(self.__out, self.__actions),
                                    reduction_indices=1)
    self.loss = tflearn.mean_square(self.__y_values, action_q_values)
    self.optimize = tf.train.AdamOptimizer(self.__lr).minimize(self.loss)

def __init__(self, session, dim_state, dim_action, learning_rate, tau, num_actor_vars):
    self.__sess = session
    self.__dim_s = dim_state
    self.__dim_a = dim_action
    self.__learning_rate = learning_rate
    self.__tau = tau

    cur_para_num = len(tf.trainable_variables())
    self.__inputs, self.__action, self.__out = self.buildNetwork()
    # self.__paras = tf.trainable_variables()[num_actor_vars:]
    self.__paras = tf.trainable_variables()[cur_para_num:]

    self.__target_inputs, self.__target_action, self.__target_out = self.buildNetwork()
    # self.__target_paras = tf.trainable_variables()[(len(self.__paras) + num_actor_vars):]
    self.__target_paras = tf.trainable_variables()[(len(self.__paras) + cur_para_num):]

    self.__ops_update_target = []
    for i in range(len(self.__target_paras)):
        val = tf.add(tf.multiply(self.__paras[i], self.__tau),
                     tf.multiply(self.__target_paras[i], 1. - self.__tau))
        op = self.__target_paras[i].assign(val)
        self.__ops_update_target.append(op)

    self.__q_predicted = tf.placeholder(tf.float32, [None, 1])
    self.__is_weight = tf.placeholder(tf.float32, [None, 1])

    self.loss = tflearn.mean_square(self.__q_predicted, self.__out)
    self.loss = tf.multiply(self.loss, self.__is_weight)
    self.optimize = tf.train.AdamOptimizer(self.__learning_rate).minimize(self.loss)

    self.__gradient_action = tf.gradients(self.__out, self.__action)

def __init__(self, sess, state_dim, action_dim, learning_rate, scope):
    self.sess = sess
    self.s_dim = state_dim
    self.a_dim = action_dim
    self.learning_rate = learning_rate
    self.scope = scope

    # Create the critic network
    self.input, self.action, self.out = self.create_critic_network(scope=self.scope)

    # network parameters
    self.network_params = sorted(
        [t for t in tf.trainable_variables() if t.name.startswith(self.get_scope())],
        key=lambda v: v.name)

    self.target_q_value = tf.placeholder(tf.float32, [None, 1])

    # loss & optimize op
    self.loss = tflearn.mean_square(self.target_q_value, self.out)
    self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    # compute the partial derivatives of self.out with respect to self.action
    # Hint: tf.gradients() https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/gradients
    self.action_grads = tf.gradients(self.out, self.action)

def __init__(self, sess, state_dim, action_dim, learning_rate, tau, gamma, user_id):
    self.sess = sess
    self.s_dim = state_dim
    self.a_dim = action_dim
    self.learning_rate = learning_rate
    self.tau = tau
    self.gamma = gamma
    self.user_id = user_id

    # Create the critic network
    self.inputs, self.q_out = self.create_deep_q_network()
    self.network_params = tf.trainable_variables()

    # Target Network
    self.target_inputs, self.target_q_out = self.create_deep_q_network()
    self.target_network_params = tf.trainable_variables()[len(self.network_params):]

    # Op for periodically updating target network with online network
    # weights with regularization
    self.update_target_network_params = \
        [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau)
                                              + tf.multiply(self.target_network_params[i], 1. - self.tau))
         for i in range(len(self.target_network_params))]

    # Network target (y_i)
    self.target_Q = tf.placeholder(tf.float32, [None, self.a_dim])

    # Define loss and optimization Op
    self.loss = tflearn.mean_square(self.target_Q, self.q_out)
    self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

def __init__(self, sess, learning_rate=1e-4, obs_dim=None, num_actions=None,
             conv=False, name=None):
    self.sess = sess
    self.learning_rate = learning_rate
    self.num_actions = num_actions
    self.obs_dim = obs_dim
    self.conv = conv
    self.name = name

    self.obs, self.q_estm = self.q_network(scope=name + "_q")
    self.best_action = tf.argmax(self.q_estm, 1)

    self.predicted_q_value = tf.placeholder(tf.float32, [None, 1], name=name + "_pred_q")
    self.actions = tf.placeholder(tf.int32, [None], name=name + "_actions")
    actions_onehot = tf.one_hot(self.actions, self.num_actions, dtype=tf.float32)
    # keepdims keeps Q shaped [None, 1] so it lines up with predicted_q_value
    # instead of broadcasting to [None, None] inside the loss.
    Q = tf.reduce_sum(actions_onehot * self.q_estm, axis=1, keepdims=True)

    # TODO: add entropy loss
    self.loss = tflearn.mean_square(self.predicted_q_value, Q)
    self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
    self.optimize = self.optimizer.minimize(self.loss)

    self.network_params = tf.get_collection(tf.GraphKeys.VARIABLES, scope=name + "_q")
    self.initializer = tf.initializers.variables(self.network_params +
                                                 self.optimizer.variables())

def model(self, layer_size=None, tensorboard_verbose=3, learning_rate=0.001):
    input_shape = [None, self.x_train.shape[1]]
    net = tflearn.input_data(shape=input_shape)
    net = tflearn.fully_connected(net, layer_size[0], activation='prelu')
    net = tflearn.fully_connected(net, layer_size[1], activation='prelu')
    net = tflearn.fully_connected(net, layer_size[2], activation='prelu')
    net = tflearn.layers.normalization.batch_normalization(net)
    net = tflearn.fully_connected(net, 1, activation='sigmoid')

    with tf.name_scope("TargetsData"):
        # placeholder for target variable (i.e. trainY input)
        targetY = tf.placeholder(shape=[None, 1], dtype=tf.float32, name="Y")

    network = tflearn.regression(net,
                                 placeholder=targetY,
                                 optimizer=self.optimizer(learning_rate),
                                 learning_rate=learning_rate,
                                 loss=tflearn.mean_square(net, targetY),
                                 metric=self.accuracy(net, targetY))
    model = tflearn.DNN(network, tensorboard_verbose=tensorboard_verbose)

    self.populate_params(
        ['model_type', 'layer_size', 'tensorboard_verbose', 'learning_rate'],
        [self.model_type, layer_size, tensorboard_verbose, learning_rate])
    return model

def __init__(self, sess, state_dim, action_dim, learning_rate, tau, gamma, num_actor_vars):
    self.sess = sess
    self.s_dim = state_dim
    self.a_dim = action_dim
    self.lr = learning_rate
    self.tau = tau
    self.gamma = gamma

    # network parameters
    # (1) Critic network
    self.inputs, self.action, self.out = self.create_critic_network()
    self.network_params = tf.trainable_variables()[num_actor_vars:]

    # (2) Target network
    self.target_inputs, self.target_action, self.target_out = self.create_critic_network()
    self.target_network_params = tf.trainable_variables()[(num_actor_vars + len(self.network_params)):]

    # (3) update target network with online network parameters
    self.update_target_network_params = \
        [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) +
                                              tf.multiply(self.target_network_params[i], 1.0 - self.tau))
         for i in range(len(self.target_network_params))]

    # critic network update
    # (1) network target (y_i)
    self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

    # (2) define loss
    # self.loss = tf.reduce_mean(tf.square(self.predicted_q_value, self.out))
    self.loss = tflearn.mean_square(self.predicted_q_value, self.out)

    # (3) optimization op
    self.optimize = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

    # (4) action gradients
    self.action_grads = tf.gradients(self.out, self.action)

def __init__(self, sess, state_dim, action_dim, learning_rate):
    self.quality = 0
    self.s_dim = state_dim
    self.a_dim = action_dim
    self.lr_rate = learning_rate
    self.sess = sess

    self.outputs = tf.placeholder(tf.float32, [None, 1])
    self.inputs = tf.placeholder(tf.float32, [None, self.s_dim[0], self.s_dim[1]])
    self.acts = tf.placeholder(tf.float32, [None, self.a_dim])

    self.pi, self.val = self.CreateNetwork(inputs=self.inputs)
    self.real_out = tf.clip_by_value(self.pi, EPS, 1. - EPS)

    # Get all network parameters
    self.network_params = \
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor')

    # Set all network parameters
    self.input_network_params = []
    for param in self.network_params:
        self.input_network_params.append(
            tf.placeholder(tf.float32, shape=param.get_shape()))
    self.set_network_params_op = []
    for idx, param in enumerate(self.input_network_params):
        self.set_network_params_op.append(
            self.network_params[idx].assign(param))

    self.loss = 0.5 * tflearn.mean_square(self.val, self.outputs) \
        + tflearn.objectives.categorical_crossentropy(
            self.real_out, self.acts * (self.outputs - tf.stop_gradient(self.val))) \
        - 0.05 * tflearn.objectives.categorical_crossentropy(self.real_out, self.real_out)
    self.optimize = tf.train.AdamOptimizer(self.lr_rate).minimize(self.loss)

def build_graph(num_actions):
    # Create shared deep q network
    s, q_network = build_dqn(num_actions=num_actions,
                             screen_buffer_size=screen_buffer_size)
    network_params = tf.trainable_variables()
    q_values = q_network

    # Create shared target network
    st, target_q_network = build_dqn(num_actions=num_actions,
                                     screen_buffer_size=screen_buffer_size)
    target_network_params = tf.trainable_variables()[len(network_params):]
    target_q_values = target_q_network

    # Op for periodically updating target network with online network weights
    reset_target_network_params = \
        [target_network_params[i].assign(network_params[i])
         for i in range(len(target_network_params))]

    # Define cost and gradient update op
    a = tf.placeholder("float", [None, num_actions])
    y = tf.placeholder("float", [None])
    action_q_values = tf.reduce_sum(tf.multiply(q_values, a), reduction_indices=1)
    cost = tflearn.mean_square(action_q_values, y)
    optimizer = tf.train.RMSPropOptimizer(learning_rate)
    grad_update = optimizer.minimize(cost, var_list=network_params)

    graph_ops = {"s": s,
                 "q_values": q_values,
                 "st": st,
                 "target_q_values": target_q_values,
                 "reset_target_network_params": reset_target_network_params,
                 "a": a,
                 "y": y,
                 "grad_update": grad_update}

    return graph_ops

def __init__(self, sess, obs_dim=None, num_options=None, learning_rate=0.0001, name="high_ctrl"):
    self.sess = sess
    self.learning_rate = learning_rate
    self.num_options = num_options
    self.obs_dim = obs_dim
    self.name = name

    self.obs, self.q_estm = self.q_network(scope=name + "_q")
    self.applicable_options = tf.placeholder(tf.float32, shape=[None, self.num_options],
                                             name=name + "_q_applicable_options")

    # keepdims keeps qmin shaped [None, 1] so it broadcasts against the
    # [None, num_options] Q-estimates below.
    qmin = tf.reduce_min(self.q_estm, axis=1, keepdims=True) - 1.0
    # If an option is applicable, return its Q-estimate; otherwise return qmin.
    self.applicable_q_value = tf.multiply(self.q_estm, self.applicable_options) + \
        tf.multiply(qmin, tf.add(1.0, -self.applicable_options))
    # TODO: How do we make the agent choose the applicable options?
    # self.applicable_q_value = tf.multiply(self.applicable_options, self.q_estm)
    self.best_action = tf.argmax(self.applicable_q_value, 1)

    self.predicted_q_value = tf.placeholder(tf.float32, [None, 1], name=name + "_pred_q")
    self.options = tf.placeholder(tf.int32, [None], name=name + "_options")
    actions_onehot = tf.one_hot(self.options, self.num_options, dtype=tf.float32)
    # keepdims keeps Q shaped [None, 1] so it lines up with predicted_q_value.
    Q = tf.reduce_sum(actions_onehot * self.q_estm, axis=1, keepdims=True)

    # TODO: add entropy loss
    self.loss = tflearn.mean_square(self.predicted_q_value, Q)
    self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
    self.optimize = self.optimizer.minimize(self.loss)

    self.network_params = tf.get_collection(tf.GraphKeys.VARIABLES, scope=name + "_q")
    self.initializer = tf.initializers.variables(self.network_params +
                                                 self.optimizer.variables())

def __init__(self, session, action_dim, state_dim, learning_rate):
    self.sess, self.a_dim, self.s_dim, self.lr_rate = \
        session, action_dim, state_dim, learning_rate

    # Create the critic network
    self.create_critic_network()

    # [network_params] Get all network parameters
    self.network_params = \
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic')

    # [input_network_params, set_network_params_op] Set all network parameters
    self.input_network_params = []
    for param in self.network_params:
        self.input_network_params.append(
            tf.placeholder(tf.float32, shape=param.get_shape()))
    self.set_network_params_op = []
    for idx, param in enumerate(self.input_network_params):
        self.set_network_params_op.append(
            self.network_params[idx].assign(param))

    # [td_target] Network target V(s)
    self.td_target = tf.placeholder(tf.float32, [None, 1])

    # [td] Temporal Difference, will also be weights for actor_gradients
    self.td = tf.subtract(self.td_target, self.out)

    # [loss] Mean square error
    self.loss = tflearn.mean_square(self.td_target, self.out)

    # [gradient] Compute critic gradient
    self.critic_gradients = tf.gradients(self.loss, self.network_params)

    # Optimization Op
    self.optimize = tf.train.RMSPropOptimizer(self.lr_rate) \
        .apply_gradients(zip(self.critic_gradients, self.network_params))

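# A minimal usage sketch (not from the original source) of the get/set parameter ops
# above, as they are typically used to sync a worker's critic with a central copy of
# the weights. `critic`, `sess`, and `central_params` are illustrative names only.
def get_critic_params(sess, critic):
    # Returns a list of numpy arrays, one per variable in critic.network_params
    return sess.run(critic.network_params)

def sync_critic_params(sess, critic, central_params):
    # central_params: list of numpy arrays aligned with critic.network_params
    sess.run(critic.set_network_params_op,
             feed_dict={ph: val for ph, val in zip(critic.input_network_params,
                                                   central_params)})
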
def main(args):
    LEARNING_RATE = 0.0001

    manage_model_dir(args['model_name'], args['model_dir'])
    json = load_data(args['data_path'])
    states, actions, amp, mid = format_data(json)

    with tf.Session() as sess:
        with tf.variable_scope('actor_model'):
            # Assumes states/actions are 2-D arrays whose second dimension gives
            # the state and action sizes.
            inputs, out, scaled_out = create_actor_network(
                states.shape[1], actions.shape[1], amp, mid)

        # Saver
        actor_model_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                  scope="actor_model")
        saver = tf.train.Saver(actor_model_variables)

        ground_truth_actions = tf.placeholder(tf.float32, [None, actions.shape[1]])

        # Define loss and optimization Op
        loss = tflearn.mean_square(ground_truth_actions, out)
        optimize = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss)

        # Initialize after the graph (including optimizer slots) is built
        sess.run(tf.global_variables_initializer())

        sess.run([out, optimize], feed_dict={inputs: states,
                                             ground_truth_actions: actions})

        saver.save(sess, args['model_dir'] + '/' + args['model_name'] + '/' + args['model_name'])

def __init__(self, sess, state_dim, action_dim, learning_rate, tau, num_actor_vars):
    self.sess = sess
    self.s_dim = state_dim
    self.a_dim = action_dim
    self.learning_rate = learning_rate
    self.tau = tau
    self.num_actor_vars = num_actor_vars

    self.state, self.action, self.out = self.create_critic_network()
    self.network_params = tf.trainable_variables()[num_actor_vars:]

    self.target_state, self.target_action, self.target_out = self.create_critic_network()
    self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):]

    self.update_target_network_params = \
        [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) +
                                              tf.multiply(self.target_network_params[i], 1.0 - self.tau))
         for i in range(len(self.target_network_params))]

    self.td_value = tf.placeholder(tf.float32, [None, 1])

    self.loss = tflearn.mean_square(self.td_value, self.out)
    self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    self.action_grads = tf.gradients(self.out, self.action)

def __init__(self, sess, state_size, action_size, learning_rate, temperature, num_actor_vars):
    self.sess = sess
    self.state_size = state_size
    self.action_size = action_size
    self.learning_rate = learning_rate
    self.temperature = temperature

    self.inputs, self.action, self.outputs = self.createNetwork()
    self.params = tf.trainable_variables()[num_actor_vars:]

    self.target_inputs, self.target_action, self.target_outputs = self.createNetwork()
    self.target_params = tf.trainable_variables()[(len(self.params) + num_actor_vars):]

    self.update_target_params = [
        self.target_params[i].assign(
            tf.multiply(self.params[i], self.temperature) +
            tf.multiply(self.target_params[i], 1. - self.temperature))
        for i in range(len(self.target_params))
    ]

    self.predicted_q = tf.placeholder(tf.float32, [None, 1])

    # calculate the cost
    self.loss = tflearn.mean_square(self.predicted_q, self.outputs)

    # optimize to reduce the cost
    self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    # assemble the gradients: d(outputs)/d(action)
    self.action_gradient = tf.gradients(self.outputs, self.action)

def setupOptim(self, learningRate):
    qref = tf.placeholder(tf.float32, [None])
    loss = tflearn.mean_square(qref, self.qvalue)
    optim = tf.train.AdamOptimizer(learningRate).minimize(loss)

    self.qref = qref      # Reference Q-values
    self.optim = optim    # Optimizer
    return self

def setupOptim(self):
    x2ref = tf.placeholder(tf.float32, [None, NX])
    loss = tflearn.mean_square(x2ref, self.x2)
    optim = tf.train.AdamOptimizer(POLICY_LEARNING_RATE).minimize(loss)

    self.x2ref = x2ref    # Reference next state
    self.optim = optim    # Optimizer
    return self

def setupOptim(self):
    vref = tf.placeholder(tf.float32, [None, 1])
    loss = tflearn.mean_square(vref, self.value)
    optim = tf.train.AdamOptimizer(QVALUE_LEARNING_RATE).minimize(loss)

    self.vref = vref      # Reference values
    self.optim = optim    # Optimizer
    return self

def __init__(self, sess, obs_dim=None, learning_rate=0.001, training_steps=100,
             batch_size=32, n_units=16, beta=2.0, delta=0.1, feature=None,
             conv=False, name=NAME):
    # beta  : Lagrange multiplier. A higher beta makes the vectors more orthogonal.
    # delta : orthogonality parameter.
    self.sess = sess
    self.learning_rate = learning_rate
    self.obs_dim = obs_dim
    self.n_units = n_units
    # self.beta = 1000000.0
    self.beta = beta
    self.delta = 0.05
    # self.delta = delta
    self.feature = feature
    self.conv = conv
    self.name = name

    self.obs, self.f_value = self.network(scope=name + "_eval")
    self.next_f_value = tf.placeholder(tf.float32, [None, 1], name=name + "_next_f")

    # TODO: Is this what we are looking for?
    self.loss = tflearn.mean_square(self.f_value, self.next_f_value) + \
        self.beta * tf.reduce_mean(tf.multiply(self.f_value - self.delta,
                                               self.next_f_value - self.delta)) + \
        self.beta * tf.reduce_mean(self.f_value * self.f_value *
                                   self.next_f_value * self.next_f_value) + \
        self.beta * (self.f_value - self.next_f_value)  # This is to let f(s) <= f(s').

    # with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
    self.optimizer = tf.train.AdamOptimizer(learning_rate)
    self.optimize = self.optimizer.minimize(self.loss)

    self.network_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            scope=name + "_eval")
    self.initializer = tf.initializers.variables(
        self.network_params + self.optimizer.variables())
    # print('network param names for ', self.name)
    # for n in self.network_params:
    #     print(n.name)

    self.saver = tf.train.Saver(self.network_params)

def _build_training_graph(self, double_q=True):
    self.r_t = tf.placeholder(tf.float32)
    self.last_reward = tf.placeholder(tf.float32)
    self.a_t_selected = tf.placeholder(tf.float32, [None, self.num_actions])

    self.q_input, self.q_values, self.q_params, self.q_action = self._build_Q_network()
    self.qt_input, self.qt_values, self.qt_params, _ = self._build_Q_network()

    self.update_target_params = [self.qt_params[i].assign(self.q_params[i])
                                 for i in range(len(self.qt_params))]

    self.actor_fn = tf.multiply(tf.pow(0.99, self.last_reward),
                                self.r_t + tf.multiply(FLAGS.discount_factor, self.qt_values))
    self.selected_a_q = tf.reduce_sum(tf.multiply(self.q_values, self.a_t_selected),
                                      reduction_indices=1)
    self.selected_a_actor = tf.reduce_sum(tf.multiply(self.actor_fn, self.a_t_selected),
                                          reduction_indices=1)

    self.cost_fn = tflearn.mean_square(self.selected_a_q, self.selected_a_actor)
    optimizer = tf.train.RMSPropOptimizer(FLAGS.learning_rate,
                                          momentum=FLAGS.gradient_momentum)
    self.grad_fn = optimizer.minimize(self.cost_fn, var_list=self.q_params)

def build_graph(num_actions):
    # Create shared deep q network
    s, q_network = build_dqn(num_actions=num_actions, action_repeat=action_repeat)
    network_params = tf.trainable_variables()
    q_values = q_network

    # Create shared target network
    st, target_q_network = build_dqn(num_actions=num_actions, action_repeat=action_repeat)
    target_network_params = tf.trainable_variables()[len(network_params):]
    target_q_values = target_q_network

    # Op for periodically updating target network with online network weights
    reset_target_network_params = \
        [target_network_params[i].assign(network_params[i])
         for i in range(len(target_network_params))]

    # Define cost and gradient update op
    a = tf.placeholder("float", [None, num_actions])
    y = tf.placeholder("float", [None])
    action_q_values = tf.reduce_sum(tf.multiply(q_values, a), reduction_indices=1)
    cost = tflearn.mean_square(action_q_values, y)
    optimizer = tf.train.RMSPropOptimizer(learning_rate)
    grad_update = optimizer.minimize(cost, var_list=network_params)

    graph_ops = {"s": s,
                 "q_values": q_values,
                 "st": st,
                 "target_q_values": target_q_values,
                 "reset_target_network_params": reset_target_network_params,
                 "a": a,
                 "y": y,
                 "grad_update": grad_update}

    return graph_ops

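# A minimal usage sketch (not from the original source) for the graph_ops dict
# returned above: the target network produces the bootstrap values, the y
# placeholder receives the Bellman targets, and grad_update fits the online
# network. `session`, the batch arrays, and `gamma` are illustrative names only.
import numpy as np

def dqn_update(session, graph_ops, s_batch, a_batch, r_batch, s2_batch, done_batch,
               gamma=0.99):
    # Bootstrap values from the (periodically reset) target network
    target_q = session.run(graph_ops["target_q_values"],
                           feed_dict={graph_ops["st"]: s2_batch})
    y_batch = r_batch + gamma * np.max(target_q, axis=1) * (1.0 - done_batch)
    # One gradient step on the online network; a_batch is one-hot over actions
    session.run(graph_ops["grad_update"],
                feed_dict={graph_ops["s"]: s_batch,
                           graph_ops["a"]: a_batch,
                           graph_ops["y"]: y_batch})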