Example #1
    def __init__(self,num_states,num_actions):
        tf.reset_default_graph()
        self.g=tf.Graph()
        with self.g.as_default():
            self.sess = tf.InteractiveSession()
            
            #actor network model parameters:
            self.actor_state_in = tf.placeholder("float",[None,num_states]) 
            self.W1_a = tf.Variable(tf.random_uniform([num_states,N_HIDDEN_1],-1/math.sqrt(num_states),1/math.sqrt(num_states)))
            self.B1_a=tf.Variable(tf.random_uniform([N_HIDDEN_1],-1/math.sqrt(num_states),1/math.sqrt(num_states)))
            self.W2_a = tf.Variable(tf.random_uniform([N_HIDDEN_1,N_HIDDEN_2],-1/math.sqrt(N_HIDDEN_1),1/math.sqrt(N_HIDDEN_1)))
            self.B2_a=tf.Variable(tf.random_uniform([N_HIDDEN_2],-1/math.sqrt(N_HIDDEN_1),1/math.sqrt(N_HIDDEN_1)))
            self.W3_a = tf.Variable(tf.random_uniform([N_HIDDEN_2,num_actions],-0.003,0.003))
            self.B3_a = tf.Variable(tf.random_uniform([num_actions],-0.003,0.003))
            
            self.is_training = tf.placeholder(tf.bool, [])
            self.H1_t= tf.matmul(self.actor_state_in,self.W1_a)
            self.H1_a_bn = batch_norm(self.H1_t,N_HIDDEN_1, self.is_training, self.sess)
            self.H1_a = tf.nn.softplus(self.H1_a_bn.bnorm) + self.B1_a
            
            self.H2_t=tf.matmul(self.H1_a,self.W2_a)
            self.H2_a_bn = batch_norm(self.H2_t,N_HIDDEN_2,self.is_training,self.sess)
            self.H2_a = tf.nn.tanh(self.H2_a_bn.bnorm) + self.B2_a
            self.actor_model=tf.matmul(self.H2_a,self.W3_a) + self.B3_a
            
                                   
            #target actor network model parameters:
            self.t_actor_state_in = tf.placeholder("float",[None,num_states]) 
            self.t_W1_a = tf.Variable(tf.random_uniform([num_states,N_HIDDEN_1],-1/math.sqrt(num_states),1/math.sqrt(num_states)))
            self.t_B1_a=tf.Variable(tf.random_uniform([N_HIDDEN_1],-1/math.sqrt(num_states),1/math.sqrt(num_states)))
            self.t_W2_a = tf.Variable(tf.random_uniform([N_HIDDEN_1,N_HIDDEN_2],-1/math.sqrt(N_HIDDEN_1),1/math.sqrt(N_HIDDEN_1)))
            self.t_B2_a=tf.Variable(tf.random_uniform([N_HIDDEN_2],-1/math.sqrt(N_HIDDEN_1),1/math.sqrt(N_HIDDEN_1)))
            self.t_W3_a = tf.Variable(tf.random_uniform([N_HIDDEN_2,num_actions],-0.003,0.003))
            self.t_B3_a = tf.Variable(tf.random_uniform([num_actions],-0.003,0.003))
            
            self.t_is_training = tf.placeholder(tf.bool, [])
            self.t_H1_t= tf.matmul(self.t_actor_state_in,self.t_W1_a)
            self.t_H1_a_bn = batch_norm(self.t_H1_t,N_HIDDEN_1, self.t_is_training, self.sess,self.H1_a_bn)
            self.t_H1_a = tf.nn.softplus(self.t_H1_a_bn.bnorm) + self.t_B1_a
            
            self.t_H2_t=tf.matmul(self.t_H1_a,self.t_W2_a)
            self.t_H2_a_bn = batch_norm(self.t_H2_t,N_HIDDEN_2,self.t_is_training,self.sess,self.H2_a_bn)
            self.t_H2_a = tf.nn.tanh(self.t_H2_a_bn.bnorm) + self.t_B2_a
            self.t_actor_model=tf.matmul(self.t_H2_a,self.t_W3_a) + self.t_B3_a
            
            #cost of actor network:
            self.q_gradient_input = tf.placeholder("float",[None,num_actions]) #gets input from action_gradient computed in critic network file
            self.actor_parameters = [self.W1_a, self.B1_a, self.W2_a, self.B2_a,self.W3_a, self.B3_a, self.H1_a_bn.scale,self.H1_a_bn.beta,self.H2_a_bn.scale,self.H2_a_bn.beta]
            self.parameters_gradients = tf.gradients(self.actor_model,self.actor_parameters,-self.q_gradient_input) # d(actor output)/d(parameters), weighted by -dQ/da supplied by the critic
            
            self.optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE,epsilon=1e-08).apply_gradients(zip(self.parameters_gradients,self.actor_parameters))  
            #initialize all tensor variable parameters:
            self.sess.run(tf.initialize_all_variables())    
            
            #To make sure actor and target start with the same initial parameters, copy them across:
            self.sess.run([
                self.t_W1_a.assign(self.W1_a),
                self.t_B1_a.assign(self.B1_a),
                self.t_W2_a.assign(self.W2_a),
                self.t_B2_a.assign(self.B2_a),
                self.t_W3_a.assign(self.W3_a),
                self.t_B3_a.assign(self.B3_a)])
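
These listings assume a few module-level definitions that do not appear on this page: the TensorFlow and math imports, the hyper-parameters N_HIDDEN_1, N_HIDDEN_2, LEARNING_RATE, BATCH_SIZE and TAU, and a batch_norm helper exposing bnorm, scale, beta and updateTarget. The block below is only a minimal sketch of that assumed interface; the constant values, the simplified normalization (is_training is ignored here), and the tau default are illustrative, not code from the original repository.

import math
import tensorflow as tf

# Assumed hyper-parameters (placeholder values, not taken from the examples):
N_HIDDEN_1 = 400
N_HIDDEN_2 = 300
LEARNING_RATE = 0.0001
BATCH_SIZE = 64
TAU = 0.001

class batch_norm(object):
    """Minimal stand-in for the helper used above.

    Exposes .bnorm (normalized output), .scale and .beta (trainable
    parameters), and .updateTarget (soft-update op, built only when a
    parForTarget companion is given). The moving-average statistics and
    the is_training switch of the real helper are omitted for brevity.
    """
    def __init__(self, inputs, size, is_training, sess, parForTarget=None, tau=TAU):
        self.sess = sess
        self.scale = tf.Variable(tf.ones([size]))
        self.beta = tf.Variable(tf.zeros([size]))
        # normalize with the current batch statistics only (simplification)
        mean, variance = tf.nn.moments(inputs, [0])
        self.bnorm = tf.nn.batch_normalization(inputs, mean, variance,
                                               self.beta, self.scale, 1e-3)
        if parForTarget is not None:
            # soft-update of this (target) copy's scale/beta toward the online network's
            self.updateTarget = tf.group(
                self.scale.assign(tau*parForTarget.scale + (1-tau)*self.scale),
                self.beta.assign(tau*parForTarget.beta + (1-tau)*self.beta))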
Example #2

    def __init__(self,num_states,num_actions):
        tf.reset_default_graph()
        self.g=tf.Graph()
        with self.g.as_default():
            self.sess = tf.InteractiveSession()
            
            #Critic Q Network:
            self.critic_state_in =  tf.placeholder("float",[None,num_states])
            self.critic_action_in = tf.placeholder("float",[None,num_actions]) 
            self.W1_c = tf.Variable(tf.random_uniform([num_states,N_HIDDEN_1],-1.0/math.sqrt(num_states),1.0/math.sqrt(num_states)))
            self.B1_c = tf.Variable(tf.random_uniform([N_HIDDEN_1],-1/math.sqrt(num_states),1/math.sqrt(num_states)))
            self.W2_c = tf.Variable(tf.random_uniform([N_HIDDEN_1,N_HIDDEN_2],-1/math.sqrt(N_HIDDEN_1+num_actions),1/math.sqrt(N_HIDDEN_1+num_actions)))  
            self.B2_c= tf.Variable(tf.random_uniform([N_HIDDEN_2],-1/math.sqrt(N_HIDDEN_1+num_actions),1/math.sqrt(N_HIDDEN_1+num_actions)))
            self.W2_action_c = tf.Variable(tf.random_uniform([num_actions,N_HIDDEN_2],-1/math.sqrt(N_HIDDEN_1+num_actions),1/math.sqrt(N_HIDDEN_1+num_actions)))
            self.W3_c = tf.Variable(tf.random_uniform([N_HIDDEN_2,1],-0.003,0.003))
            self.B3_c = tf.Variable(tf.random_uniform([1],-0.003,0.003))
            
            self.is_training = tf.placeholder(tf.bool, [])
            self.H1_t = tf.matmul(self.critic_state_in,self.W1_c)
            self.H1_c_bn = batch_norm(self.H1_t,N_HIDDEN_1,self.is_training,self.sess)
            
            self.H1_c = tf.nn.softplus(self.H1_c_bn.bnorm) + self.B1_c

        
            self.H2_t = tf.matmul(self.H1_c,self.W2_c)+tf.matmul(self.critic_action_in,self.W2_action_c)
            self.H2_c_bn = batch_norm(self.H2_t,N_HIDDEN_2,self.is_training,self.sess)
            self.H2_c = tf.nn.tanh(self.H2_c_bn.bnorm) + self.B2_c
            
            self.critic_q_model = tf.matmul(self.H2_c,self.W3_c)+self.B3_c
            
           # Target Critic Q Network:
            self.t_critic_state_in =  tf.placeholder("float",[None,num_states])
            self.t_critic_action_in = tf.placeholder("float",[None,num_actions])
            self.t_W1_c = tf.Variable(tf.random_uniform([num_states,N_HIDDEN_1],-1/math.sqrt(num_states),1/math.sqrt(num_states)))
            self.t_B1_c = tf.Variable(tf.random_uniform([N_HIDDEN_1],-1/math.sqrt(num_states),1/math.sqrt(num_states)))
            self.t_W2_c = tf.Variable(tf.random_uniform([N_HIDDEN_1,N_HIDDEN_2],-1/math.sqrt(N_HIDDEN_1+num_actions),1/math.sqrt(N_HIDDEN_1+num_actions)))  
            self.t_W2_action_c = tf.Variable(tf.random_uniform([num_actions,N_HIDDEN_2],-1/math.sqrt(N_HIDDEN_1+num_actions),1/math.sqrt(N_HIDDEN_1+num_actions)))
            self.t_B2_c= tf.Variable(tf.random_uniform([N_HIDDEN_2],-1/math.sqrt(N_HIDDEN_1+num_actions),1/math.sqrt(N_HIDDEN_1+num_actions)))
            self.t_W3_c = tf.Variable(tf.random_uniform([N_HIDDEN_2,1],-0.003,0.003))
            self.t_B3_c = tf.Variable(tf.random_uniform([1],-0.003,0.003))
            
            self.t_H1_t = tf.matmul(self.t_critic_state_in,self.t_W1_c)
            self.t_H1_c_bn = batch_norm(self.t_H1_t,N_HIDDEN_1,self.is_training,self.sess,self.H1_c_bn)        
            self.t_H1_c = tf.nn.softplus(self.t_H1_c_bn.bnorm) + self.t_B1_c

            self.t_H2_t = tf.matmul(self.t_H1_c,self.t_W2_c)+tf.matmul(self.t_critic_action_in,self.t_W2_action_c)
            self.t_H2_c_bn = batch_norm(self.t_H2_t,N_HIDDEN_2,self.is_training,self.sess,self.H2_c_bn)
            self.t_H2_c = tf.nn.tanh(self.t_H2_c_bn.bnorm) + self.t_B2_c
            
            self.t_critic_q_model = tf.matmul(self.t_H2_c,self.t_W3_c)+self.t_B3_c
            
            
            self.q_value_in=tf.placeholder("float",[None,1]) # target Q value (the supervised training signal)
            #self.l2_regularizer_loss = tf.nn.l2_loss(self.W1_c)+tf.nn.l2_loss(self.W2_c)+ tf.nn.l2_loss(self.W2_action_c) + tf.nn.l2_loss(self.W3_c)+tf.nn.l2_loss(self.B1_c)+tf.nn.l2_loss(self.B2_c)+tf.nn.l2_loss(self.B3_c)
            self.l2_regularizer_loss = 0.0001*tf.reduce_sum(tf.pow(self.W2_c,2))
            self.cost=tf.pow(self.critic_q_model-self.q_value_in,2)/BATCH_SIZE + self.l2_regularizer_loss # squared TD error (scaled by 1/BATCH_SIZE) plus L2 penalty on W2_c
            self.optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(self.cost)
            self.act_grad_v = tf.gradients(self.critic_q_model, self.critic_action_in)
            self.action_gradients = [self.act_grad_v[0]/tf.to_float(tf.shape(self.act_grad_v[0])[0])] #this is just divided by batch size
            #from simple actor net:
            self.check_fl = self.action_gradients             
            
            #initialize all tensor variable parameters:
            self.sess.run(tf.initialize_all_variables())
            
            #To initialize critic and target with the same values:
            # copy target parameters
            self.sess.run([
                self.t_W1_c.assign(self.W1_c),
                self.t_B1_c.assign(self.B1_c),
                self.t_W2_c.assign(self.W2_c),
                self.t_W2_action_c.assign(self.W2_action_c),
                self.t_B2_c.assign(self.B2_c),
                self.t_W3_c.assign(self.W3_c),
                self.t_B3_c.assign(self.B3_c)
            ])
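
The critic listing stops after the initial parameter copy. In the DDPG scheme it would normally be paired with a soft target-update op, analogous to update_target_actor_op at the end of Example #3 below; a sketch of that assumed continuation (not present in the original listing) would be:

            # assumed continuation: soft (Polyak) target update for the critic
            self.update_target_critic_op = [
                self.t_W1_c.assign(TAU*self.W1_c+(1-TAU)*self.t_W1_c),
                self.t_B1_c.assign(TAU*self.B1_c+(1-TAU)*self.t_B1_c),
                self.t_W2_c.assign(TAU*self.W2_c+(1-TAU)*self.t_W2_c),
                self.t_W2_action_c.assign(TAU*self.W2_action_c+(1-TAU)*self.t_W2_action_c),
                self.t_B2_c.assign(TAU*self.B2_c+(1-TAU)*self.t_B2_c),
                self.t_W3_c.assign(TAU*self.W3_c+(1-TAU)*self.t_W3_c),
                self.t_B3_c.assign(TAU*self.B3_c+(1-TAU)*self.t_B3_c),
                self.t_H1_c_bn.updateTarget,
                self.t_H2_c_bn.updateTarget,
            ]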
Example #3
    def __init__(self,num_states,num_actions):
        tf.reset_default_graph()
        self.g=tf.Graph()
        with self.g.as_default():
            self.sess = tf.InteractiveSession()
            
            #actor network model parameters:
            self.actor_state_in = tf.placeholder("float",[None,num_states]) 
            self.W1_a = tf.Variable(tf.random_uniform([num_states,N_HIDDEN_1],-1/math.sqrt(num_states),1/math.sqrt(num_states)))
            self.B1_a=tf.Variable(tf.random_uniform([N_HIDDEN_1],-1/math.sqrt(num_states),1/math.sqrt(num_states)))
            self.W2_a = tf.Variable(tf.random_uniform([N_HIDDEN_1,N_HIDDEN_2],-1/math.sqrt(N_HIDDEN_1),1/math.sqrt(N_HIDDEN_1)))
            self.B2_a=tf.Variable(tf.random_uniform([N_HIDDEN_2],-1/math.sqrt(N_HIDDEN_1),1/math.sqrt(N_HIDDEN_1)))
            self.W3_a = tf.Variable(tf.random_uniform([N_HIDDEN_2,num_actions],-0.003,0.003))
            self.B3_a = tf.Variable(tf.random_uniform([num_actions],-0.003,0.003))
            
            self.is_training = tf.placeholder(tf.bool, [])
            self.H1_t= tf.matmul(self.actor_state_in,self.W1_a)
            self.H1_a_bn = batch_norm(self.H1_t,N_HIDDEN_1, self.is_training, self.sess)
            self.H1_a = tf.nn.softplus(self.H1_a_bn.bnorm) + self.B1_a
            
            self.H2_t=tf.matmul(self.H1_a,self.W2_a)
            self.H2_a_bn = batch_norm(self.H2_t,N_HIDDEN_2,self.is_training,self.sess)
            self.H2_a = tf.nn.tanh(self.H2_a_bn.bnorm) + self.B2_a
            self.actor_model=tf.matmul(self.H2_a,self.W3_a) + self.B3_a
            
                                   
            #target actor network model parameters:
            self.t_actor_state_in = tf.placeholder("float",[None,num_states]) 
            self.t_W1_a = tf.Variable(tf.random_uniform([num_states,N_HIDDEN_1],-1/math.sqrt(num_states),1/math.sqrt(num_states)))
            self.t_B1_a=tf.Variable(tf.random_uniform([N_HIDDEN_1],-1/math.sqrt(num_states),1/math.sqrt(num_states)))
            self.t_W2_a = tf.Variable(tf.random_uniform([N_HIDDEN_1,N_HIDDEN_2],-1/math.sqrt(N_HIDDEN_1),1/math.sqrt(N_HIDDEN_1)))
            self.t_B2_a=tf.Variable(tf.random_uniform([N_HIDDEN_2],-1/math.sqrt(N_HIDDEN_1),1/math.sqrt(N_HIDDEN_1)))
            self.t_W3_a = tf.Variable(tf.random_uniform([N_HIDDEN_2,num_actions],-0.003,0.003))
            self.t_B3_a = tf.Variable(tf.random_uniform([num_actions],-0.003,0.003))
            
            self.t_is_training = tf.placeholder(tf.bool, [])
            self.t_H1_t= tf.matmul(self.t_actor_state_in,self.t_W1_a)
            self.t_H1_a_bn = batch_norm(self.t_H1_t,N_HIDDEN_1, self.t_is_training, self.sess,self.H1_a_bn)
            self.t_H1_a = tf.nn.softplus(self.t_H1_a_bn.bnorm) + self.t_B1_a
            
            self.t_H2_t=tf.matmul(self.t_H1_a,self.t_W2_a)
            self.t_H2_a_bn = batch_norm(self.t_H2_t,N_HIDDEN_2,self.t_is_training,self.sess,self.H2_a_bn)
            self.t_H2_a = tf.nn.tanh(self.t_H2_a_bn.bnorm) + self.t_B2_a
            self.t_actor_model=tf.matmul(self.t_H2_a,self.t_W3_a) + self.t_B3_a
            
            #cost of actor network:
            self.q_gradient_input = tf.placeholder("float",[None,num_actions]) #gets input from action_gradient computed in critic network file
            self.actor_parameters = [self.W1_a, self.B1_a, self.W2_a, self.B2_a,self.W3_a, self.B3_a, self.H1_a_bn.scale,self.H1_a_bn.beta,self.H2_a_bn.scale,self.H2_a_bn.beta]
            self.parameters_gradients = tf.gradients(self.actor_model,self.actor_parameters,-self.q_gradient_input) # d(actor output)/d(parameters), weighted by -dQ/da supplied by the critic
            
            self.optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE,epsilon=1e-08).apply_gradients(zip(self.parameters_gradients,self.actor_parameters))  
            #initialize all tensor variable parameters:
            self.sess.run(tf.initialize_all_variables())    
            
            #To make sure actor and target start with the same initial parameters, copy them across:
            self.sess.run([
                self.t_W1_a.assign(self.W1_a),
                self.t_B1_a.assign(self.B1_a),
                self.t_W2_a.assign(self.W2_a),
                self.t_B2_a.assign(self.B2_a),
                self.t_W3_a.assign(self.W3_a),
                self.t_B3_a.assign(self.B3_a)])

            # soft (Polyak-averaged) target update with mixing rate TAU; the batch_norm
            # helpers contribute their own updateTarget ops for scale and beta
            self.update_target_actor_op = [
                self.t_W1_a.assign(TAU*self.W1_a+(1-TAU)*self.t_W1_a),
                self.t_B1_a.assign(TAU*self.B1_a+(1-TAU)*self.t_B1_a),  
                self.t_W2_a.assign(TAU*self.W2_a+(1-TAU)*self.t_W2_a),
                self.t_B2_a.assign(TAU*self.B2_a+(1-TAU)*self.t_B2_a),  
                self.t_W3_a.assign(TAU*self.W3_a+(1-TAU)*self.t_W3_a),
                self.t_B3_a.assign(TAU*self.B3_a+(1-TAU)*self.t_B3_a),
                self.t_H1_a_bn.updateTarget,
                self.t_H2_a_bn.updateTarget,
            ]
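
The examples only show __init__. An actor class built this way is usually driven through a handful of thin wrapper methods that feed the placeholders and run the ops defined above. The methods below are an assumed, typical completion; their names and signatures are not taken from this page:

    def evaluate_actor(self, state_batch):
        # forward pass through the online actor (batch norm in inference mode)
        return self.sess.run(self.actor_model,
                             feed_dict={self.actor_state_in: state_batch,
                                        self.is_training: False})

    def evaluate_target_actor(self, state_batch):
        # forward pass through the target actor, used to form the critic's targets
        return self.sess.run(self.t_actor_model,
                             feed_dict={self.t_actor_state_in: state_batch,
                                        self.t_is_training: False})

    def train_actor(self, state_batch, q_gradient_batch):
        # one ascent step along dQ/da; the sign flip is already applied in
        # parameters_gradients inside __init__
        self.sess.run(self.optimizer,
                      feed_dict={self.actor_state_in: state_batch,
                                 self.q_gradient_input: q_gradient_batch,
                                 self.is_training: True})

    def update_target_actor(self):
        # apply the soft target update defined in __init__
        self.sess.run(self.update_target_actor_op)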