def __init__(self, agent, policy_model, total_reward):
    self.agent = agent
    self.policy_model = policy_model
    self.total_reward = total_reward

    # Compute MLE loss function. MLE is used to initialize parameters for policy gradient
    self.mle_policy_gradient = MaximumLikelihoodEstimation(agent, policy_model)

    # Compute loss function
    loss, entropy_penalty = self.calc_loss(
        self.policy_model.model_output,
        self.policy_model.model_output_indices,
        self.policy_model.target)

    optimizer = tf.train.AdamOptimizer(AbstractLearning.rl_learning_rate)

    using_grad_clip = True
    grad_clip_val = 5.0
    if not using_grad_clip:
        train_step = optimizer.minimize(loss)
    else:
        gvs = optimizer.compute_gradients(loss)
        capped_gvs = [(tf.clip_by_norm(grad, grad_clip_val), var)
                      if grad is not None else (grad, var)
                      for grad, var in gvs]
        train_step = optimizer.apply_gradients(capped_gvs)

    # Create summaries for training
    summary_loss = tf.scalar_summary("Loss", loss)
    summary_target_min = tf.scalar_summary("Target Min", tf.reduce_min(self.policy_model.target))
    summary_target_max = tf.scalar_summary("Target Max", tf.reduce_max(self.policy_model.target))
    summary_target_mean = tf.scalar_summary("Target Mean", tf.reduce_mean(self.policy_model.target))
    summary_entropy_penalty = tf.scalar_summary("Entropy Penalty", entropy_penalty)
    update_summaries = [summary_loss, summary_target_min, summary_target_max,
                        summary_target_mean, summary_entropy_penalty]

    AbstractLearning.__init__(self, policy_model, loss, train_step, update_summaries)
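# --- Illustration only (not part of the class above): the per-gradient
# clip-by-norm pattern used in that constructor, shown in isolation on a toy
# least-squares objective. This is a minimal sketch assuming the pre-1.0
# TensorFlow graph API used throughout this file; the names `w` and `toy_loss`
# are hypothetical and do not appear in the codebase.
import tensorflow as tf

w = tf.Variable([3.0, -4.0])
toy_loss = tf.reduce_sum(tf.square(w))  # gradient is 2*w, with L2 norm 10

opt = tf.train.AdamOptimizer(0.001)
gvs = opt.compute_gradients(toy_loss)   # list of (gradient, variable) pairs
# Clip each gradient tensor to L2 norm <= 5.0, leaving None gradients untouched
# (variables not reachable from the loss have gradient None).
capped = [(tf.clip_by_norm(g, 5.0), v) if g is not None else (g, v)
          for g, v in gvs]
toy_train_step = opt.apply_gradients(capped)

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    sess.run(toy_train_step)  # updates w with the clipped gradient [3, -4]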
def __init__(self, train_alg, config, constants):
    # Initialize logger
    logger.Log.open("./log_" + str(datetime.now()) + ".txt")

    self.config = config

    # Connect to simulator
    if len(sys.argv) < 2:
        logger.Log.info("IP not given. Using localhost i.e. 0.0.0.0")
        self.unity_ip = "0.0.0.0"
    else:
        self.unity_ip = sys.argv[1]

    if len(sys.argv) < 3:
        logger.Log.info("PORT not given. Using 11000")
        self.PORT = 11000
    else:
        self.PORT = int(sys.argv[2])

    # Size of image
    image_dim = self.config.screen_size
    self.connection = rc.ReliableConnect(self.unity_ip, self.PORT, image_dim)
    self.connection.connect()

    # Dataset specific parameters
    self.num_block = 20
    self.num_direction = 4
    use_stop = True
    if use_stop:
        self.num_actions = self.num_block * self.num_direction + 1  # 1 for stopping
    else:
        self.num_actions = self.num_block * self.num_direction

    # Create toolkit of message protocol between simulator and agent
    self.message_protocol_kit = mpu.MessageProtocolUtil(
        self.num_direction, self.num_actions, use_stop)

    # Test policy
    self.test_policy = gp.GenericPolicy.get_argmax_action

    # MDP details
    self.gamma = 1.0

    # Training algorithm behaviour
    self.train_alg = train_alg

    # Define model and learning algorithm
    if self.train_alg == SUPERVISEDMLE:
        self.model = PolicyNetwork(image_dim, self.num_actions, constants)
        self.learning_alg = MaximumLikelihoodEstimation(self, self.model)
    elif self.train_alg == REINFORCE:
        self.model = PolicyNetwork(image_dim, self.num_actions, constants)
        self.learning_alg = PolicyGradient(self, self.model, total_reward=True)
    elif self.train_alg == CONTEXTUALBANDIT:
        self.model = PolicyNetwork(image_dim, self.num_actions, constants)
        self.learning_alg = PolicyGradient(self, self.model, total_reward=False)
    elif self.train_alg == PGADVANTAGE:
        self.model = PolicyNetwork(image_dim, self.num_actions, constants)
        self.state_value_model = StateValueFunctionModel(250, image_dim, 200, 24, 32)
        self.learning_alg = PolicyGradientWithAdvantage(
            self, self.model, self.state_value_model, total_reward=True)
    elif self.train_alg == SIMPLEQLEARNING:
        self.model = ActionValueFunctionNetwork(250, image_dim, 200, 24, 32)
        self.target_q_network = ActionValueFunctionNetwork(
            250, image_dim, 200, 24, 32, scope_name="Target_Q_Network")
        self.learning_alg = QLearning(self, self.model, self.target_q_network)
    else:
        raise AssertionError("Training algorithm " + str(self.train_alg)
                             + " not found or implemented.")

    self.sess = None
    self.train_writer = None

    self.config.log_flag()
    logger.Log.info("Training Algorithm: " + str(self.train_alg)
                    + ", Gamma: " + str(self.gamma))
    logger.Log.info("Created Agent.")
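# --- Illustration only: one plausible decoding of the flat action space built
# above (20 blocks x 4 directions + 1 stop = 81 actions). The real mapping is
# owned by MessageProtocolUtil; the block-major layout sketched here is an
# assumption for exposition, not the confirmed encoding, and `decode_action`
# is a hypothetical helper that does not exist in the codebase.
NUM_BLOCK, NUM_DIRECTION = 20, 4
STOP_ACTION = NUM_BLOCK * NUM_DIRECTION  # id 80 reserved for "stop"

def decode_action(action_id):
    """Map a flat action id to (block index, direction), or the stop action."""
    if action_id == STOP_ACTION:
        return ("stop", None)
    return action_id // NUM_DIRECTION, action_id % NUM_DIRECTION

assert decode_action(80) == ("stop", None)
assert decode_action(7) == (1, 3)  # block 1, direction 3 under this layout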