Example #1
    def __init__(self, num_actions=2, learning_rate=0.001):
        self.session = tf.Session()

        self.s = tf.placeholder(tf.float32, [None, 10])

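        # Online Q-network: two fully connected layers mapping the state to one Q-value per action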
        net = tflearn.fully_connected(self.s, 10, activation='relu')
        self.q_values = tflearn.fully_connected(net, num_actions)
        network_params = tf.trainable_variables()

        self.st = tf.placeholder(tf.float32, [None, 10])
        target_net = tflearn.fully_connected(self.st, 10, activation='relu')
        self.target_q_values = tflearn.fully_connected(target_net, num_actions)
        target_network_params = tf.trainable_variables()[len(network_params):]

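        # Op that copies the online-network weights into the target network (hard update)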
        self.reset_target_network_params = [
            target_network_params[i].assign(network_params[i])
            for i in range(len(target_network_params))
        ]

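        # Action mask, TD targets y, and the mean-squared loss on the selected actions' Q-values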
        self.a = tf.placeholder("float", [None, num_actions])
        self.y = tf.placeholder("float", [None])
        action_q_values = tf.reduce_sum(tf.multiply(self.q_values, self.a),
                                        reduction_indices=1)
        cost = tflearn.mean_square(action_q_values, self.y)
        optimizer = tf.train.RMSPropOptimizer(learning_rate)
        self.grad_update = optimizer.minimize(cost, var_list=network_params)

        self.session.run(tf.global_variables_initializer())
        self.session.run(self.reset_target_network_params)
        self.t = 0
Example #2
    def __init__(self, sess, state_dim, action_dim, tau, num_actor_vars):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.tau = tau

        # Create the critic network
        self.inputs, self.action, self.out = self.create_critic_network()

        self.network_params = tf.trainable_variables()[num_actor_vars:]
        self.learning_rate = tf.placeholder(tf.float32, [None,])

        # Target Network
        self.target_inputs, self.target_action, self.target_out = self.create_critic_network()
        
        self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):]

        # Op for periodically updating target network with online network weights with regularization
        self.update_target_network_params = \
            [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) + tf.multiply(self.target_network_params[i], 1. - self.tau))
                for i in range(len(self.target_network_params))]
    
        # Network target (y_i)
        self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

        # Define loss and optimization Op
        self.loss = tflearn.mean_square(self.predicted_q_value, self.out)
        self.lr = tf.gather_nd(self.learning_rate,[0])
        self.optimize = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

        # Get the gradient of the net w.r.t. the action
        self.action_grads = tf.gradients(self.out, self.action)
Example #3
File: ddpg.py  Project: ghif/gprop
    def __init__(self, sess, state_dim, action_dim, learning_rate, tau,
                 num_actor_vars):
        self.sess = sess
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau

        # Critic Network
        self._input, self._action, self._out = self.create_critic_network()
        self._network_params = tf.trainable_variables()[num_actor_vars:]

        self._input_clone, self._action_clone, self._out_clone = self.create_critic_network(
        )
        self._network_clone_params = tf.trainable_variables(
        )[num_actor_vars + (len(self._network_params)):]

        # Clone network update
        self._update_network_clone_params = \
         [self._network_clone_params[i].assign(
          tf.multiply(self._network_params[i], tau) + tf.multiply(self._network_clone_params[i], (1 - tau)))
          for i in range(len(self._network_clone_params))
         ]

        # network target (y_t)
        self._predicted_q_value = tf.placeholder(tf.float32, [None, 1])

        # define critic loss
        self._loss = tflearn.mean_square(self._predicted_q_value, self._out)
        self._optimizer = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate).minimize(self._loss)

        # Get the gradient w.r.t. the action
        self._action_grads = tf.gradients(self._out, self._action)
Example #4
    def __init__(self,
                 session,
                 dim_state,
                 dim_action,
                 learning_rate,
                 tau=0.01):
        self._sess = session
        self._dim_s = dim_state
        self._dim_a = dim_action
        self._lr = learning_rate

        self._inputs = tflearn.input_data(shape=[None, self._dim_s])

        self._out, self._params = self.buildNetwork(self._inputs, 'dqn')
        self._out_target, self._params_target = self.buildNetwork(
            self._inputs, 'target')

        self._actions = tf.placeholder(tf.float32, [None, self._dim_a])
        self._y_values = tf.placeholder(tf.float32, [None])  # changed here

        action_q_values = tf.reduce_sum(tf.multiply(self._out, self._actions),
                                        reduction_indices=1)  # here too

        # Soft target update: target <- tau * online + (1 - tau) * target
        self._update_target = \
            [t_p.assign(tau * g_p + (1 - tau) * t_p) for g_p, t_p in zip(self._params, self._params_target)]

        self.loss = tflearn.mean_square(self._y_values, action_q_values)
        self.optimize = tf.train.AdamOptimizer(self._lr).minimize(self.loss)
Example #5
    def __init__(self, sess, state_dim, action_dim, learning_rate, tau, num_actor_vars):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau

        # Create the critic network
        self.inputs, self.action, self.out = self.create_critic_network()

        self.network_params = tf.trainable_variables()[num_actor_vars:]

        # Target Network
        self.target_inputs, self.target_action, self.target_out = self.create_critic_network()

        self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):]

        # Op for periodically updating target network with online network weights with regularization
        self.update_target_network_params = \
            [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) + tf.multiply(self.target_network_params[i], 1. - self.tau))
                for i in range(len(self.target_network_params))]

        # Network target (y_i)
        self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

        # Define loss and optimization Op
        self.loss = tflearn.mean_square(self.predicted_q_value, self.out)
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

        # Get the gradient of the net w.r.t. the action.
        # For each action in the minibatch (i.e., for each x in xs),
        # this will sum up the gradients of each critic output in the minibatch
        # w.r.t. that action (i.e., sum of dy/dx over all ys). We then divide
        # through by the minibatch size to scale the gradients down correctly.
        self.action_grads = tf.div(tf.gradients(self.out, self.action), tf.constant(MINIBATCH_SIZE, dtype=tf.float32))
Example #6
File: network.py  Project: yxd886/scheduler
	def __init__(self, sess, scope, mode, logger):
		self.sess = sess
		self.state_dim = pm.STATE_DIM
		self.action_dim = pm.ACTION_DIM
		self.scope = scope
		self.mode = mode
		self.logger = logger

		self.input, self.output = self._create_nn()
		self.label = tf.placeholder(tf.float32, [None, self.action_dim])
		self.action = tf.placeholder(tf.float32, [None, None])

		self.entropy_weight = pm.ENTROPY_WEIGHT

		self.td_target = tf.placeholder(tf.float32, [None, 1])
		self.loss = tflearn.mean_square(self.output, self.td_target)

		self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope)
		self.gradients = tf.gradients(self.loss, self.weights)

		self.lr = pm.LEARNING_RATE
		if pm.OPTIMIZER == "Adam":
			self.optimize = tf.train.AdamOptimizer(learning_rate=self.lr).apply_gradients(zip(self.gradients, self.weights))
		elif pm.OPTIMIZER == "RMSProp":
			self.optimize = tf.train.RMSPropOptimizer(learning_rate=self.lr).apply_gradients(zip(self.gradients, self.weights))
		self.weights_phs = []
		for weight in self.weights:
			self.weights_phs.append(tf.placeholder(tf.float32, shape=weight.get_shape()))
		self.set_weights_op = []
		for idx, weights_ph in enumerate(self.weights_phs):
			self.set_weights_op.append(self.weights[idx].assign(weights_ph))
Example #7
File: ddpg.py  Project: ataitler/DQN
    def __init__(self, sess, state_dim, action_dim, learning_rate, tau, num_actor_vars):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau

        # Create the critic network
        self.inputs, self.action, self.out = self.create_critic_network()

        self.network_params = tf.trainable_variables()[num_actor_vars:]

        # Target Network
        self.target_inputs, self.target_action, self.target_out = self.create_critic_network()
        
        self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):]

        # Op for periodically updating target network with online network weights with regularization
        self.update_target_network_params = \
            [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) + tf.multiply(self.target_network_params[i], 1. - self.tau))
                for i in range(len(self.target_network_params))]
    
        # Network target (y_i)
        self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

        # Define loss and optimization Op
        self.loss = tflearn.mean_square(self.predicted_q_value, self.out)
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

        # Get the gradient of the net w.r.t. the action
        self.action_grads = tf.gradients(self.out, self.action)
Example #8
    def __init__(self, sess, state_dim, action_dim, learning_rate, tau, gamma):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau
        self.gamma = gamma

        variable_start = len(tf.trainable_variables())
        self.inputs, self.action, self.out = self.create_critic_network()
        self.network_params = tf.trainable_variables()[variable_start:]

        self.target_inputs, self.target_action, self.target_out = self.create_critic_network(
        )
        self.target_network_params = tf.trainable_variables()[(
            len(self.network_params) + variable_start):]

        # Op for periodically updating target network with online network
        # weights with regularization
        self.update_target_network_params = \
            [self.target_network_params[i].assign(
                tf.multiply(self.network_params[i], self.tau) +
                tf.multiply(self.target_network_params[i], 1. - self.tau))
                for i in range(len(self.target_network_params))]

        # Network target (y_i)
        self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

        # Define loss and optimization Op
        self.loss = tflearn.mean_square(self.predicted_q_value, self.out)
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(
            self.loss)

        self.action_grads = tf.gradients(
            self.out, self.action)  # shape === [batch_size, a_dim]
Example #9
def build_graph(num_actions):
    inputs, q_values = build_deepQnetwork(num_actions, action_repeat)
    network_params = tf.trainable_variables()

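    # Target network: a second copy of the same architecture with its own parameters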
    target_inputs, target_q_values = build_deepQnetwork(
        num_actions, action_repeat)
    target_network_params = tf.trainable_variables()[len(network_params):]

    reset_target_network_params = \
        [target_network_params[i].assign(network_params[i])
         for i in range(len(target_network_params))]

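    # Selected-action mask and Q-learning targets y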
    a = tf.placeholder(tf.float32, [None, num_actions])
    y = tf.placeholder(tf.float32, [None])

    action_q_values = tf.reduce_sum(tf.multiply(q_values, a),
                                    reduction_indices=1)

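    # Mean-squared TD error, minimized only with respect to the online-network parameters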
    cost = tflearn.mean_square(action_q_values, y)

    optimizer = tf.train.RMSPropOptimizer(learning_rate)
    grad_update = optimizer.minimize(cost, var_list=network_params)

    graph_ops = {
        "inputs": inputs,
        "q_values": q_values,
        "target_inputs": target_inputs,
        "target_q_values": target_q_values,
        "reset_target_network_params": reset_target_network_params,
        "a": a,
        "y": y,
        "grad_update": grad_update
    }
    return graph_ops
Example #10
    def _create_optimizer(self):
        # global counter
        self._global_step = tf.Variable(0.0,
                                        trainable=False,
                                        name="global_step")

        # Network target (y_i)
        self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

        # Define loss
        self.loss = tflearn.mean_square(self.predicted_q_value, self.out)

        # define optimization Op
        if self.opt == "adam":
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
        elif self.opt == "sgd":
            self.optimizer = tf.train.GradientDescentOptimizer(
                self.learning_rate)
        else:
            raise ValueError("{} is not a valid optimizer".format(self.opt))

        grad_and_vars = self.optimizer.compute_gradients(
            self.loss, self.network_params)

        grads = [g for g, _ in grad_and_vars]
        grads, self.grad_norm = tf.clip_by_global_norm(grads, 0.5)
        # Re-pair the clipped gradients with their variables before applying them
        grad_and_vars = [(g, v) for g, v in zip(grads, self.network_params)]
        self.optimize = self.optimizer.apply_gradients(
            grad_and_vars, global_step=self._global_step)

        self.action_grads = tf.gradients(self.out, self.action)

        self._lr_decay = tf.assign(
            self.learning_rate, self.learning_rate * self.learning_rate_decay)
Example #11
    def __init__(self, cecc, s_dim, action_dim, learning_rate, tau, gamma, num_actor_vars):
        self.cecc = cecc
        self.s_dim = s_dim
        self.a_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau
        self.gamma = gamma

        # Create the critic network
        self.inputs, self.action, self.out = self.create_critic_network()

        self.network_params = tf.trainable_variables()[num_actor_vars:]

        # Target Network
        self.target_inputs, self.target_action, self.target_out = self.create_critic_network()

        self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):]

        # Op for periodically updating target network with online network
        # weights with regularization
        self.update_target_network_params = \
            [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) \
            + tf.multiply(self.target_network_params[i], 1. - self.tau))
                for i in range(len(self.target_network_params))]

        # Network target (y_i)
        self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

        # Define loss and optimization Op
        self.loss = tflearn.mean_square(self.predicted_q_value, self.out)
        self.optimize = tf.train.AdamOptimizer(
            self.learning_rate).minimize(self.loss)

        self.action_grads = tf.gradients(self.out, self.action)
Example #12
    def model(self, model_type=None, out_embedding_dim=32, layer_size=32, tensorboard_verbose=3, batch_norm=2, n_layers=2, learning_rate=0.001):

        if self.data_encoding == 'one_hot':
            input_shape = [None, self.kmer, 20]

        else:
            input_shape = [None, self.kmer]

        # Adding layers based on model type
        net = tflearn.input_data(shape=input_shape)
        deep_layers_output = self.add_deep_layers(net, model_type, out_embedding_dim, layer_size, n_layers)
        net = tflearn.fully_connected(deep_layers_output, 100, activation='prelu')

        if batch_norm > 0:
            net = tflearn.layers.normalization.batch_normalization(net)
        net = tflearn.dropout(net, 0.4)
        net = tflearn.fully_connected(net, 1, activation='sigmoid')
        if batch_norm > 1:
            net = tflearn.layers.normalization.batch_normalization(net)

        with tf.name_scope("TargetsData"):  # placeholder for target variable (i.e. trainY input)
            targetY = tf.placeholder(shape=[None, 1], dtype=tf.float32, name="Y")

        network = tflearn.regression(net,
                                     placeholder=targetY,
                                     optimizer=self.optimizer(learning_rate),
                                     learning_rate=learning_rate,
                                     loss=tflearn.mean_square(net, targetY),
                                     metric=self.accuracy(net, targetY))

        model = tflearn.DNN(network, tensorboard_verbose=tensorboard_verbose)
        return model
Example #13
    def model(self,
              type=None,
              mode="train",
              num_layers=2,
              state_size=32,
              learning_rate=0.001,
              tensorboard_verbose=3):

        net = tflearn.input_data(shape=[None, 9])
        net = tflearn.embedding(net,
                                input_dim=21,
                                output_dim=32,
                                weights_init='xavier')

        if type == 'bi_rnn':
            out_rnn = tflearn.bidirectional_rnn(net, tflearn.BasicLSTMCell(32),
                                                tflearn.BasicLSTMCell(32))

        elif type == 'basic_lstm':
            for i in range(4):
                net = tflearn.lstm(net, n_units=40, return_seq=True)
            #net = tflearn.lstm(net, n_units=40, return_seq=True)
            out_rnn = tflearn.lstm(net, n_units=40, return_seq=False)

        elif type == 'basic_rnn':
            out_rnn = tflearn.simple_rnn(net, 40)

        else:
            out_rnn = net

        net = tflearn.fully_connected(out_rnn, 100, activation='prelu')
        net = tflearn.layers.normalization.batch_normalization(net)
        net = tflearn.dropout(net, 0.1)
        net = tflearn.fully_connected(net, 1, activation='sigmoid')
        """
        single_cell = getattr(tf.contrib.rnn, cell_type)(cell_size, state_is_tuple=True)

        if num_layers == 1:
            cell = single_cell
        else:
            cell = tf.contrib.rnn.MultiRNNCell([single_cell] * num_layers)
        """

        with tf.name_scope(
                "TargetsData"
        ):  # placeholder for target variable (i.e. trainY input)
            targetY = tf.placeholder(shape=[None, 1],
                                     dtype=tf.float32,
                                     name="Y")

        network = tflearn.regression(net,
                                     placeholder=targetY,
                                     optimizer=self.optimizer(learning_rate),
                                     learning_rate=learning_rate,
                                     loss=tflearn.mean_square(net, targetY),
                                     metric=self.accuracy(net, targetY),
                                     name='no rnn')

        model = tflearn.DNN(network, tensorboard_verbose=tensorboard_verbose)
        return model
Example #14
    def __init__(self,
                 session,
                 dim_state,
                 dim_action,
                 learning_rate,
                 net_name='pong_syr'):
        self.__sess = session
        self.__dim_s = dim_state
        self.__dim_a = dim_action
        self.__lr = learning_rate

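        # Pick the convolutional Q-network architecture by name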
        if net_name == 'pong_syr':
            self.__inputs, self.__out = build_cnn_pong(dim_state, dim_action)
        elif net_name == 'pong_org':
            self.__inputs, self.__out = build_simple_cnn(dim_state, dim_action)
        else:
            self.__inputs, self.__out = build_cnn_bird(dim_state, dim_action)

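        # Action mask and Q-learning targets y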
        self.__actions = tf.placeholder(tf.float32, [None, self.__dim_a])
        self.__y_values = tf.placeholder(tf.float32, [None])

        action_q_values = tf.reduce_sum(tf.multiply(self.__out,
                                                    self.__actions),
                                        reduction_indices=1)

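        # Mean-squared TD error on the Q-values of the selected actions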
        self.loss = tflearn.mean_square(self.__y_values, action_q_values)
        self.optimize = tf.train.AdamOptimizer(self.__lr).minimize(self.loss)
Example #15
    def __init__(self, session, dim_state, dim_action, learning_rate, tau,
                 num_actor_vars):
        self.__sess = session
        self.__dim_s = dim_state
        self.__dim_a = dim_action
        self.__learning_rate = learning_rate
        self.__tau = tau

        cur_para_num = len(tf.trainable_variables())
        self.__inputs, self.__action, self.__out = self.buildNetwork()
        #self.__paras = tf.trainable_variables()[num_actor_vars:]
        self.__paras = tf.trainable_variables()[cur_para_num:]

        self.__target_inputs, self.__target_action, self.__target_out = self.buildNetwork(
        )
        #self.__target_paras = tf.trainable_variables()[(len(self.__paras) + num_actor_vars):]
        self.__target_paras = tf.trainable_variables()[(len(self.__paras) +
                                                        cur_para_num):]

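        # Soft (Polyak) target update: target <- tau * online + (1 - tau) * target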
        self.__ops_update_target = []
        for i in range(len(self.__target_paras)):
            val = tf.add(tf.multiply(self.__paras[i], self.__tau),
                         tf.multiply(self.__target_paras[i], 1. - self.__tau))
            op = self.__target_paras[i].assign(val)
            self.__ops_update_target.append(op)

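        # TD target and per-sample loss weights (is_weight, e.g. importance-sampling weights from prioritized replay)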
        self.__q_predicted = tf.placeholder(tf.float32, [None, 1])
        self.__is_weight = tf.placeholder(tf.float32, [None, 1])

        self.loss = tflearn.mean_square(self.__q_predicted, self.__out)
        self.loss = tf.multiply(self.loss, self.__is_weight)
        self.optimize = tf.train.AdamOptimizer(self.__learning_rate).minimize(
            self.loss)

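        # Gradient of the critic output with respect to the action (used by the actor update in DDPG-style algorithms)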
        self.__gradient_action = tf.gradients(self.__out, self.__action)
Example #16
    def __init__(self, sess, state_dim, action_dim, learning_rate, scope):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.learning_rate = learning_rate
        self.scope = scope

        # Create the critic network
        self.input, self.action, self.out = self.create_critic_network(
            scope=self.scope)

        # network parameters
        self.network_params = sorted([
            t for t in tf.trainable_variables()
            if t.name.startswith(self.get_scope())
        ],
                                     key=lambda v: v.name)

        self.target_q_value = tf.placeholder(tf.float32, [None, 1])

        # loss & optimize op
        self.loss = tflearn.mean_square(self.target_q_value, self.out)
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(
            self.loss)

        # compute the partial derivatives of self.out with respect to self.action
        # Hint: tf.gradients()		https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/gradients
        self.action_grads = tf.gradients(self.out, self.action)
Example #17
    def __init__(self, sess, state_dim, action_dim, learning_rate, tau, gamma,
                 user_id):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau
        self.gamma = gamma
        self.user_id = user_id

        # Create the critic network
        self.inputs, self.q_out = self.create_deep_q_network()

        self.network_params = tf.trainable_variables()

        # Target Network
        self.target_inputs, self.target_q_out = self.create_deep_q_network()

        self.target_network_params = tf.trainable_variables(
        )[len(self.network_params):]

        # Op for periodically updating target network with online network
        # weights with regularization
        self.update_target_network_params = \
            [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) \
            + tf.multiply(self.target_network_params[i], 1. - self.tau))
                for i in range(len(self.target_network_params))]

        # Network target (y_i)
        self.target_Q = tf.placeholder(tf.float32, [None, self.a_dim])

        # Define loss and optimization Op
        self.loss = tflearn.mean_square(self.target_Q, self.q_out)
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(
            self.loss)
Example #18
    def __init__(self,
                 sess,
                 learning_rate=1e-4,
                 obs_dim=None,
                 num_actions=None,
                 conv=False,
                 name=None):
        self.sess = sess
        self.learning_rate = learning_rate
        self.num_actions = num_actions
        self.obs_dim = obs_dim
        self.conv = conv
        self.name = name
        self.obs, self.q_estm = self.q_network(scope=name + "_q")

        self.best_action = tf.argmax(self.q_estm, 1)

        self.predicted_q_value = tf.placeholder(tf.float32, [None, 1],
                                                name=name + "_pred_q")

        self.actions = tf.placeholder(tf.int32, [None], name=name + "_actions")
        actions_onehot = tf.one_hot(self.actions,
                                    self.num_actions,
                                    dtype=tf.float32)
        Q = tf.reduce_sum(actions_onehot * self.q_estm, axis=1)
        # TODO: add entropy loss
        self.loss = tflearn.mean_square(self.predicted_q_value, Q)
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.optimize = self.optimizer.minimize(self.loss)

        self.network_params = tf.get_collection(tf.GraphKeys.VARIABLES,
                                                scope=name + "_q")
        self.initializer = tf.initializers.variables(
            self.network_params + self.optimizer.variables())
Example #19
    def model(self, layer_size=None, tensorboard_verbose=3, learning_rate=0.001):

        input_shape = [None, self.x_train.shape[1]]

        net = tflearn.input_data(shape=input_shape)
        net = tflearn.fully_connected(net, layer_size[0], activation='prelu')
        net = tflearn.fully_connected(net, layer_size[1], activation='prelu')
        net = tflearn.fully_connected(net, layer_size[2], activation='prelu')
        net = tflearn.layers.normalization.batch_normalization(net)
        net = tflearn.fully_connected(net, 1, activation='sigmoid')

        with tf.name_scope("TargetsData"):  # placeholder for target variable (i.e. trainY input)
            targetY = tf.placeholder(shape=[None, 1], dtype=tf.float32, name="Y")

        network = tflearn.regression(net,
                                     placeholder=targetY,
                                     optimizer=self.optimizer(learning_rate),
                                     learning_rate=learning_rate,
                                     loss=tflearn.mean_square(net, targetY),
                                     metric=self.accuracy(net, targetY))

        model = tflearn.DNN(network, tensorboard_verbose=tensorboard_verbose)

        self.populate_params(['model_type', 'layer_size', 'tensorboard_verbose', 'learning_rate'],
                             [self.model_type, layer_size, tensorboard_verbose, learning_rate])
        return model
Example #20
    def __init__(self, sess, state_dim, action_dim, learning_rate, tau, gamma,
                 num_actor_vars):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.lr = learning_rate
        self.tau = tau
        self.gamma = gamma

        # network parameters
        # (1) Critic network
        self.inputs, self.action, self.out = self.create_critic_network()
        self.network_params = tf.trainable_variables()[num_actor_vars:]
        # (2) Target network
        self.target_inputs, self.target_action, self.target_out = self.create_critic_network(
        )
        self.target_network_params = tf.trainable_variables()[(
            num_actor_vars + len(self.network_params)):]
        # (3) update target network with online network parameters
        self.update_target_network_params = \
            [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau)+ \
                                                  tf.multiply(self.target_network_params[i], 1.0-self.tau))
                for i in range(len(self.target_network_params))]

        # critic network update
        # (1) network target (y_i)
        self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])
        # (2) define loss
        # self.loss = tf.reduce_mean(tf.square(self.predicted_q_value, self.out))
        self.loss = tflearn.mean_square(self.predicted_q_value, self.out)
        # (3) optimization op
        self.optimize = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
        # (4) action gradients
        self.action_grads = tf.gradients(self.out, self.action)
Example #21
File: network.py  Project: zchao520/Zwei
    def __init__(self, sess, state_dim, action_dim, learning_rate):
        self.quality = 0
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.lr_rate = learning_rate
        self.sess = sess
        self.outputs = tf.placeholder(tf.float32, [None, 1])
        self.inputs = tf.placeholder(tf.float32,
                                     [None, self.s_dim[0], self.s_dim[1]])
        self.acts = tf.placeholder(tf.float32, [None, self.a_dim])
        self.pi, self.val = self.CreateNetwork(inputs=self.inputs)
        self.real_out = tf.clip_by_value(self.pi, EPS, 1. - EPS)

        # Get all network parameters
        self.network_params = \
            tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor')

        # Set all network parameters
        self.input_network_params = []
        for param in self.network_params:
            self.input_network_params.append(
                tf.placeholder(tf.float32, shape=param.get_shape()))
        self.set_network_params_op = []
        for idx, param in enumerate(self.input_network_params):
            self.set_network_params_op.append(
                self.network_params[idx].assign(param))

        self.loss = 0.5 * tflearn.mean_square(self.val, self.outputs) \
            + tflearn.objectives.categorical_crossentropy(self.real_out, self.acts * (self.outputs - tf.stop_gradient(self.val))) \
            - 0.05 * tflearn.objectives.categorical_crossentropy(self.real_out, self.real_out)

        self.optimize = tf.train.AdamOptimizer(self.lr_rate).minimize(
            self.loss)
Example #22
def build_graph(num_actions):
    # Create shared deep q network
    s, q_network = build_dqn(num_actions=num_actions, screen_buffer_size=screen_buffer_size)
    network_params = tf.trainable_variables()
    q_values = q_network

    # Create shared target network
    st, target_q_network = build_dqn(num_actions=num_actions, screen_buffer_size=screen_buffer_size)
    target_network_params = tf.trainable_variables()[len(network_params):]
    target_q_values = target_q_network

    # Op for periodically updating target network with online network weights
    reset_target_network_params = \
        [target_network_params[i].assign(network_params[i])
         for i in range(len(target_network_params))]

    # Define cost and gradient update op
    a = tf.placeholder("float", [None, num_actions])
    y = tf.placeholder("float", [None])
    action_q_values = tf.reduce_sum(tf.multiply(q_values, a), reduction_indices=1)
    cost = tflearn.mean_square(action_q_values, y)
    optimizer = tf.train.RMSPropOptimizer(learning_rate)
    grad_update = optimizer.minimize(cost, var_list=network_params)

    graph_ops = {"s": s,
                 "q_values": q_values,
                 "st": st,
                 "target_q_values": target_q_values,
                 "reset_target_network_params": reset_target_network_params,
                 "a": a,
                 "y": y,
                 "grad_update": grad_update}

    return graph_ops
Example #23
    def __init__(self, sess, obs_dim=None, num_options=None, learning_rate=0.0001, name="high_ctrl"):
        self.sess = sess
        self.learning_rate = learning_rate
        self.num_options = num_options
        self.obs_dim = obs_dim
        self.name = name
        self.obs, self.q_estm = self.q_network(scope=name + "_q")

        self.applicable_options = tf.placeholder(tf.float32, shape=[None, self.num_options], name=name+"_q_applicable_options")

        qmin = tf.reduce_min(self.q_estm, axis=1) - 1.0

        # If an option is applicable, keep its Q-value estimate; otherwise fall back to qmin (one below the minimum Q-value)
        self.applicable_q_value = tf.multiply(self.q_estm, self.applicable_options) + tf.multiply(qmin, tf.add(1.0, -self.applicable_options))

        # TODO: How do we make the agent choose the applicable options.
        # self.applicable_q_value = tf.multiply(self.applicable_options, self.q_estm)
        self.best_action = tf.argmax(self.applicable_q_value, 1)

        self.predicted_q_value = tf.placeholder(tf.float32, [None, 1], name=name+"_pred_q")

        self.options = tf.placeholder(tf.int32, [None], name=name+"_options")
        actions_onehot = tf.one_hot(self.options, self.num_options, dtype=tf.float32)
        Q = tf.reduce_sum(actions_onehot * self.q_estm, axis=1)
        # TODO: add entropy loss
        self.loss = tflearn.mean_square(self.predicted_q_value, Q)
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.optimize = self.optimizer.minimize(self.loss)


        self.network_params = tf.get_collection(tf.GraphKeys.VARIABLES, scope=name + "_q")
        self.initializer = tf.initializers.variables(self.network_params + self.optimizer.variables())
Example #24
    def __init__(self, session, action_dim, state_dim, learning_rate):
        self.sess, self.a_dim, self.s_dim, self.lr_rate = \
            session, action_dim, state_dim, learning_rate

        # Create the critic network
        self.create_critic_network()

        # [network_params] Get all network parameters
        self.network_params = \
            tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic')

        # [input_network_params, set_network_params_op] Set all network parameters
        self.input_network_params = []
        for param in self.network_params:
            self.input_network_params.append(
                tf.placeholder(tf.float32, shape=param.get_shape()))
        self.set_network_params_op = []
        for idx, param in enumerate(self.input_network_params):
            self.set_network_params_op.append(
                self.network_params[idx].assign(param))

        # [td_target] Network target V(s)
        self.td_target = tf.placeholder(tf.float32, [None, 1])
        # [td] Temporal Difference, will also be weights for actor_gradients
        self.td = tf.subtract(self.td_target, self.out)
        # [loss] Mean square error
        self.loss = tflearn.mean_square(self.td_target, self.out)
        # [gradient] Compute critic gradient
        self.critic_gradients = tf.gradients(self.loss, self.network_params)

        # Optimization Op
        self.optimize = tf.train.RMSPropOptimizer(self.lr_rate) \
                        .apply_gradients(zip(self.critic_gradients, self.network_params))
        pass
Example #25
def main(args):

    LEARNING_RATE = 0.0001

    manage_model_dir(args['model_name'],args['model_dir'])
    json = load_data(args['data_path'])
    states,actions,amp,mid = format_data(json)


    with tf.Session() as sess:

        with tf.variable_scope('actor_model'):
            # state/action dimensions (assuming format_data returns NumPy arrays)
            inputs, out, scaled_out = create_actor_network(states.shape[1], actions.shape[1], amp, mid)

        # Saver
        actor_model_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="actor_model")
        saver = tf.train.Saver(actor_model_variables)

        ground_truth_actions = tf.placeholder(tf.float32, [None, actions.shape[1]])

        # Define loss and optimization Op
        loss = tflearn.mean_square(ground_truth_actions, out)
        optimize = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss)

        # Initialize variables only after the full graph (network + optimizer slots) is built
        sess.run(tf.global_variables_initializer())

        sess.run([out, optimize], feed_dict={
            inputs: states,
            ground_truth_actions: actions
        })

        saver.save(sess, args['model_dir']+'/'+args['model_name'] + '/' + args['model_name'])
Example #26
    def __init__(self, sess, state_dim, action_dim, learning_rate, tau,
                 num_actor_vars):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau
        self.num_actor_vars = num_actor_vars

        self.state, self.action, self.out = self.create_critic_network()
        self.network_params = tf.trainable_variables()[num_actor_vars:]

        self.target_state, self.target_action, self.target_out = self.create_critic_network(
        )
        self.target_network_params = tf.trainable_variables()[(
            len(self.network_params) + num_actor_vars):]

        self.update_target_network_params = \
            [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) +
                                                  tf.multiply(self.target_network_params[i], 1.0 - self.tau))
                for i in range(len(self.target_network_params))]

        self.td_value = tf.placeholder(tf.float32, [None, 1])

        self.loss = tflearn.mean_square(self.td_value, self.out)
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(
            self.loss)

        self.action_grads = tf.gradients(self.out, self.action)
Example #27
    def __init__(self, sess, state_size, action_size, learning_rate,
                 temperature, num_actor_vars):
        self.sess = sess
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.temperature = temperature

        self.inputs, self.action, self.outputs = self.createNetwork()
        self.params = tf.trainable_variables()[num_actor_vars:]

        self.target_inputs, self.target_action, self.target_outputs = self.createNetwork(
        )
        self.target_params = tf.trainable_variables()[(len(self.params) +
                                                       num_actor_vars):]

        self.update_target_params = [
            self.target_params[i].assign(
                tf.multiply(self.params[i], self.temperature) +
                tf.multiply(self.target_params[i], 1. - self.temperature))
            for i in range(len(self.target_params))
        ]

        self.predicted_q = tf.placeholder(tf.float32, [None, 1])

        # calculate the cost
        self.loss = tflearn.mean_square(self.predicted_q, self.outputs)

        # optimize to reduce the cost
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(
            self.loss)

        # assemble the gradients: d(outputs)/d(parameters)
        self.action_gradient = tf.gradients(self.outputs, self.action)
Example #28
    def setupOptim(self, learningRate):
        qref = tf.placeholder(tf.float32, [None])
        loss = tflearn.mean_square(qref, self.qvalue)
        optim = tf.train.AdamOptimizer(learningRate).minimize(loss)

        self.qref = qref  # Reference Q-values
        self.optim = optim  # Optimizer
        return self
Example #29
    def setupOptim(self):
        x2ref = tf.placeholder(tf.float32, [None, NX])
        loss = tflearn.mean_square(x2ref, self.x2)
        optim = tf.train.AdamOptimizer(POLICY_LEARNING_RATE).minimize(loss)

        self.x2ref = x2ref  # Reference next state
        self.optim = optim  # Optimizer
        return self
Example #30
    def setupOptim(self):
        vref = tf.placeholder(tf.float32, [None, 1])
        loss = tflearn.mean_square(vref, self.value)
        optim = tf.train.AdamOptimizer(QVALUE_LEARNING_RATE).minimize(loss)

        self.vref = vref  # Reference Q-values
        self.optim = optim  # Optimizer
        return self
Example #31
    def __init__(self,
                 sess,
                 obs_dim=None,
                 learning_rate=0.001,
                 training_steps=100,
                 batch_size=32,
                 n_units=16,
                 beta=2.0,
                 delta=0.1,
                 feature=None,
                 conv=False,
                 name=NAME):
        # Beta  : Lagrange multiplier. Higher beta would make the vector more orthogonal.
        # delta : Orthogonality parameter.
        self.sess = sess
        self.learning_rate = learning_rate
        self.obs_dim = obs_dim

        self.n_units = n_units

        # self.beta = 1000000.0
        self.beta = beta
        self.delta = 0.05
        # self.delta = delta

        self.feature = feature

        self.conv = conv

        self.name = name

        self.obs, self.f_value = self.network(scope=name + "_eval")

        self.next_f_value = tf.placeholder(tf.float32, [None, 1],
                                           name=name + "_next_f")

        # TODO: Is this what we are looking for?
        self.loss = tflearn.mean_square(self.f_value, self.next_f_value) + \
                    self.beta * tf.reduce_mean(tf.multiply(self.f_value - self.delta, self.next_f_value - self.delta)) + \
                    self.beta * tf.reduce_mean(self.f_value * self.f_value * self.next_f_value * self.next_f_value) + \
                    self.beta * (self.f_value - self.next_f_value) # This is to let f(s) <= f(s').

        # with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
        self.optimizer = tf.train.AdamOptimizer(learning_rate)

        self.optimize = self.optimizer.minimize(self.loss)

        self.network_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                scope=name + "_eval")
        self.initializer = tf.initializers.variables(
            self.network_params + self.optimizer.variables())

        # print('network param names for ', self.name)
        # for n in self.network_params:
        #     print(n.name)

        self.saver = tf.train.Saver(self.network_params)
Example #32
    def _build_training_graph(self, double_q=True):
        self.r_t = tf.placeholder(tf.float32)
        self.last_reward = tf.placeholder(tf.float32)
        self.a_t_selected = tf.placeholder(tf.float32, [None, self.num_actions])
        self.q_input, self.q_values, self.q_params, self.q_action = self._build_Q_network()
        self.qt_input, self.qt_values, self.qt_params, _ = self._build_Q_network()
        self.update_target_params = [self.qt_params[i].assign(self.q_params[i])
                                     for i in range(len(self.qt_params))]
        self.actor_fn = tf.multiply(tf.pow(0.99, self.last_reward), self.r_t + tf.multiply(FLAGS.discount_factor, self.qt_values))
        self.selected_a_q = tf.reduce_sum(tf.multiply(self.q_values, self.a_t_selected), reduction_indices=1)
        self.selected_a_actor = tf.reduce_sum(tf.multiply(self.actor_fn, self.a_t_selected), reduction_indices=1)
        self.cost_fn = tflearn.mean_square(self.selected_a_q, self.selected_a_actor)
        optimizer = tf.train.RMSPropOptimizer(FLAGS.learning_rate, momentum=FLAGS.gradient_momentum)
        self.grad_fn = optimizer.minimize(self.cost_fn, var_list=self.q_params)
Example #33
def build_graph(num_actions):
    # Create shared deep q network
    s, q_network = build_dqn(num_actions=num_actions,
                             action_repeat=action_repeat)
    network_params = tf.trainable_variables()
    q_values = q_network

    # Create shared target network
    st, target_q_network = build_dqn(num_actions=num_actions,
                                     action_repeat=action_repeat)
    target_network_params = tf.trainable_variables()[len(network_params):]
    target_q_values = target_q_network

    # Op for periodically updating target network with online network weights
    reset_target_network_params = \
        [target_network_params[i].assign(network_params[i])
         for i in range(len(target_network_params))]

    # Define cost and gradient update op
    a = tf.placeholder("float", [None, num_actions])
    y = tf.placeholder("float", [None])
    action_q_values = tf.reduce_sum(tf.multiply(q_values, a), reduction_indices=1)
    cost = tflearn.mean_square(action_q_values, y)
    optimizer = tf.train.RMSPropOptimizer(learning_rate)
    grad_update = optimizer.minimize(cost, var_list=network_params)

    graph_ops = {"s": s,
                 "q_values": q_values,
                 "st": st,
                 "target_q_values": target_q_values,
                 "reset_target_network_params": reset_target_network_params,
                 "a": a,
                 "y": y,
                 "grad_update": grad_update}

    return graph_ops