def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False): super(BDQNPolicy, self).__init__(domainString, is_training) tf.reset_default_graph() self.domainString = domainString self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) self.in_policy_file = in_policy_file self.out_policy_file = out_policy_file self.is_training = is_training self.accum_belief = [] self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) self.prev_state_check = None # parameter settings self.n_in = 260 if cfg.has_option('dqnpolicy', 'n_in'): self.n_in = cfg.getint('dqnpolicy', 'n_in') self.actor_lr = 0.0001 if cfg.has_option('dqnpolicy', 'actor_lr'): self.actor_lr = cfg.getfloat('dqnpolicy', 'actor_lr') self.critic_lr = 0.001 if cfg.has_option('dqnpolicy', 'critic_lr'): self.critic_lr = cfg.getfloat('dqnpolicy', 'critic_lr') self.tau = 0.001 if cfg.has_option('dqnpolicy', 'tau'): self.tau = cfg.getfloat('dqnpolicy', 'tau') self.randomseed = 1234 if cfg.has_option('GENERAL', 'seed'): self.randomseed = cfg.getint('GENERAL', 'seed') self.gamma = 1.0 if cfg.has_option('dqnpolicy', 'gamma'): self.gamma = cfg.getfloat('dqnpolicy', 'gamma') self.regularisation = 'l2' if cfg.has_option('dqnpolicy', 'regularisation'): self.regularisation = cfg.get('dqnpolicy', 'regulariser') self.learning_rate = 0.001 # ct506 #0.001 if cfg.has_option('dqnpolicy', 'learning_rate'): self.learning_rate = cfg.getfloat('dqnpolicy', 'learning_rate') self.exploration_type = 'e-greedy' # Boltzman if cfg.has_option('dqnpolicy', 'exploration_type'): self.exploration_type = cfg.get('dqnpolicy', 'exploration_type') self.episodeNum = 1000 if cfg.has_option('dqnpolicy', 'episodeNum'): self.episodeNum = cfg.getfloat('dqnpolicy', 'episodeNum') self.maxiter = 5000 if cfg.has_option('dqnpolicy', 'maxiter'): self.maxiter = cfg.getfloat('dqnpolicy', 'maxiter') self.epsilon = 1 if cfg.has_option('dqnpolicy', 'epsilon'): self.epsilon = cfg.getfloat('dqnpolicy', 'epsilon') self.epsilon_start = 1 if cfg.has_option('dqnpolicy', 'epsilon_start'): self.epsilon_start = cfg.getfloat('dqnpolicy', 'epsilon_start') self.epsilon_end = 1 if cfg.has_option('dqnpolicy', 'epsilon_end'): self.epsilon_end = cfg.getfloat('dqnpolicy', 'epsilon_end') self.priorProbStart = 1.0 if cfg.has_option('dqnpolicy', 'prior_sample_prob_start'): self.priorProbStart = cfg.getfloat('dqnpolicy', 'prior_sample_prob_start') self.priorProbEnd = 0.1 if cfg.has_option('dqnpolicy', 'prior_sample_prob_end'): self.priorProbEnd = cfg.getfloat('dqnpolicy', 'prior_sample_prob_end') self.policyfeatures = [] if cfg.has_option('dqnpolicy', 'features'): logger.info('Features: ' + str(cfg.get('dqnpolicy', 'features'))) self.policyfeatures = json.loads(cfg.get('dqnpolicy', 'features')) self.max_k = 5 if cfg.has_option('dqnpolicy', 'max_k'): self.max_k = cfg.getint('dqnpolicy', 'max_k') self.learning_algorithm = 'drl' if cfg.has_option('dqnpolicy', 'learning_algorithm'): self.learning_algorithm = cfg.get('dqnpolicy', 'learning_algorithm') logger.info('Learning algorithm: ' + self.learning_algorithm) self.minibatch_size = 32 if cfg.has_option('dqnpolicy', 'minibatch_size'): self.minibatch_size = cfg.getint('dqnpolicy', 'minibatch_size') self.capacity = 1000 # max(self.minibatch_size, 2000) if cfg.has_option('dqnpolicy', 'capacity'): self.capacity = max(cfg.getint('dqnpolicy', 'capacity'), 2000) self.replay_type = 'vanilla' if cfg.has_option('dqnpolicy', 'replay_type'): self.replay_type = cfg.get('dqnpolicy', 'replay_type') self.architecture = 'vanilla' 
if cfg.has_option('dqnpolicy', 'architecture'): self.architecture = cfg.get('dqnpolicy', 'architecture') self.q_update = 'single' if cfg.has_option('dqnpolicy', 'q_update'): self.q_update = cfg.get('dqnpolicy', 'q_update') self.h1_size = 130 if cfg.has_option('dqnpolicy', 'h1_size'): self.h1_size = cfg.getint('dqnpolicy', 'h1_size') self.h2_size = 130 if cfg.has_option('dqnpolicy', 'h2_size'): self.h2_size = cfg.getint('dqnpolicy', 'h2_size') self.save_step = 200 if cfg.has_option('policy', 'save_step'): self.save_step = cfg.getint('policy', 'save_step') # BDQN parameteres self.n_samples = 1 if cfg.has_option('dqnpolicy', 'n_samples'): self.n_samples = cfg.getint('dqnpolicy', 'n_samples') sigma_prior = 1.5 # np.array(-3.0, dtype=np.float32) if cfg.has_option('dqnpolicy', 'sigma_prior'): sigma_prior = cfg.getfloat('dqnpolicy', 'sigma_prior') self.sigma_prior = tf.exp(sigma_prior) # np.exp(np.array(sigma_prior, dtype=np.float32)) self.stddev_var_mu = 0.01 if cfg.has_option('dqnpolicy', 'stddev_var_mu'): self.stddev_var_mu = cfg.getfloat('dqnpolicy', 'stddev_var_mu') self.stddev_var_logsigma = 0.01 if cfg.has_option('dqnpolicy', 'stddev_var_logsigma'): self.stddev_var_logsigma = cfg.getfloat('dqnpolicy', 'stddev_var_logsigma') self.mean_log_sigma = 0.000001 if cfg.has_option('dqnpolicy', 'mean_log_sigma'): self.mean_log_sigma = cfg.getfloat('dqnpolicy', 'mean_log_sigma') self.n_batches = 1000.0 if cfg.has_option('dqnpolicy', 'n_batches'): self.n_batches = cfg.getfloat('dqnpolicy', 'n_batches') self.importance_sampling = False if cfg.has_option('dqnpolicy', 'importance_sampling'): self.importance_sampling = cfg.getboolean('dqnpolicy', 'importance_sampling') self.alpha = 0.85 if cfg.has_option('dqnpolicy', 'alpha'): self.alpha = cfg.getfloat('dqnpolicy', 'alpha') self.alpha_divergence = False if cfg.has_option('dqnpolicy', 'alpha_divergence'): self.alpha_divergence = cfg.getboolean('dqnpolicy', 'alpha_divergence') self.sigma_eps = 0.01 if cfg.has_option('dqnpolicy', 'sigma_eps'): self.sigma_eps = cfg.getfloat('dqnpolicy', 'sigma_eps') self.training_frequency = 2 if cfg.has_option('dqnpolicy', 'training_frequency'): self.training_frequency = cfg.getint('dqnpolicy', 'training_frequency') # domain specific parameter settings (overrides general policy parameter settings) if cfg.has_option('dqnpolicy_' + domainString, 'n_in'): self.n_in = cfg.getint('dqnpolicy_' + domainString, 'n_in') if cfg.has_option('dqnpolicy_' + domainString, 'actor_lr'): self.actor_lr = cfg.getfloat('dqnpolicy_' + domainString, 'actor_lr') if cfg.has_option('dqnpolicy_' + domainString, 'critic_lr'): self.critic_lr = cfg.getfloat('dqnpolicy_' + domainString, 'critic_lr') if cfg.has_option('dqnpolicy_' + domainString, 'tau'): self.tau = cfg.getfloat('dqnpolicy_' + domainString, 'tau') if cfg.has_option('dqnpolicy_' + domainString, 'gamma'): self.gamma = cfg.getfloat('dqnpolicy_' + domainString, 'gamma') if cfg.has_option('dqnpolicy_' + domainString, 'regularisation'): self.regularisation = cfg.get('dqnpolicy_' + domainString, 'regulariser') if cfg.has_option('dqnpolicy_' + domainString, 'learning_rate'): self.learning_rate = cfg.getfloat('dqnpolicy_' + domainString, 'learning_rate') if cfg.has_option('dqnpolicy_' + domainString, 'exploration_type'): self.exploration_type = cfg.get('dqnpolicy_' + domainString, 'exploration_type') if cfg.has_option('dqnpolicy_' + domainString, 'episodeNum'): self.episodeNum = cfg.getfloat('dqnpolicy_' + domainString, 'episodeNum') if cfg.has_option('dqnpolicy_' + domainString, 'maxiter'): 
self.maxiter = cfg.getfloat('dqnpolicy_' + domainString, 'maxiter') if cfg.has_option('dqnpolicy_' + domainString, 'epsilon'): self.epsilon = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon') if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_start'): self.epsilon_start = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_start') if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_end'): self.epsilon_end = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_end') if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_start'): self.priorProbStart = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_start') if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_end'): self.priorProbEnd = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_end') if cfg.has_option('dqnpolicy_' + domainString, 'features'): logger.info('Features: ' + str(cfg.get('dqnpolicy_' + domainString, 'features'))) self.policyfeatures = json.loads(cfg.get('dqnpolicy_' + domainString, 'features')) if cfg.has_option('dqnpolicy_' + domainString, 'max_k'): self.max_k = cfg.getint('dqnpolicy_' + domainString, 'max_k') if cfg.has_option('dqnpolicy_' + domainString, 'learning_algorithm'): self.learning_algorithm = cfg.get('dqnpolicy_' + domainString, 'learning_algorithm') logger.info('Learning algorithm: ' + self.learning_algorithm) if cfg.has_option('dqnpolicy_' + domainString, 'minibatch_size'): self.minibatch_size = cfg.getint('dqnpolicy_' + domainString, 'minibatch_size') if cfg.has_option('dqnpolicy_' + domainString, 'capacity'): self.capacity = cfg.getint('dqnpolicy_' + domainString, 'capacity') if cfg.has_option('dqnpolicy_' + domainString, 'replay_type'): self.replay_type = cfg.get('dqnpolicy_' + domainString, 'replay_type') if cfg.has_option('dqnpolicy_' + domainString, 'architecture'): self.architecture = cfg.get('dqnpolicy_' + domainString, 'architecture') if cfg.has_option('dqnpolicy_' + domainString, 'q_update'): self.q_update = cfg.get('dqnpolicy_' + domainString, 'q_update') if cfg.has_option('dqnpolicy_' + domainString, 'h1_size'): self.h1_size = cfg.getint('dqnpolicy_' + domainString, 'h1_size') if cfg.has_option('dqnpolicy_' + domainString, 'h2_size'): self.h2_size = cfg.getint('dqnpolicy_' + domainString, 'h2_size') if cfg.has_option('policy_' + domainString, 'save_step'): self.save_step = cfg.getint('policy_' + domainString, 'save_step') # BDQN parameteres if cfg.has_option('dqnpolicy_' + domainString, 'n_samples'): self.n_samples = cfg.getint('dqnpolicy_' + domainString, 'n_samples') if cfg.has_option('dqnpolicy_' + domainString, 'sigma_prior'): sigma_prior = cfg.getfloat('dqnpolicy_' + domainString, 'sigma_prior') self.sigma_prior = tf.exp(sigma_prior) # np.exp(np.array(sigma_prior, dtype=np.float32)) if cfg.has_option('dqnpolicy_' + domainString, 'stddev_var_mu'): self.stddev_var_mu = cfg.getfloat('dqnpolicy_' + domainString, 'stddev_var_mu') if cfg.has_option('dqnpolicy_' + domainString, 'stddev_var_logsigma'): self.stddev_var_logsigma = cfg.getfloat('dqnpolicy_' + domainString, 'stddev_var_logsigma') if cfg.has_option('dqnpolicy_' + domainString, 'mean_log_sigma'): self.mean_log_sigma = cfg.getfloat('dqnpolicy_' + domainString, 'mean_log_sigma') if cfg.has_option('dqnpolicy_' + domainString, 'n_batches'): self.n_batches = cfg.getfloat('dqnpolicy_' + domainString, 'n_batches') if cfg.has_option('dqnpolicy_' + domainString, 'importance_sampling'): self.importance_sampling = cfg.getboolean('dqnpolicy_' + domainString, 'importance_sampling') if 
cfg.has_option('dqnpolicy_' + domainString, 'alpha'):
        self.alpha = cfg.getfloat('dqnpolicy_' + domainString, 'alpha')
    if cfg.has_option('dqnpolicy_' + domainString, 'alpha_divergence'):
        self.alpha_divergence = cfg.getboolean('dqnpolicy_' + domainString, 'alpha_divergence')
    if cfg.has_option('dqnpolicy_' + domainString, 'sigma_eps'):
        self.sigma_eps = cfg.getfloat('dqnpolicy_' + domainString, 'sigma_eps')
    if cfg.has_option('dqnpolicy_' + domainString, 'training_frequency'):
        self.training_frequency = cfg.getint('dqnpolicy_' + domainString, 'training_frequency')

    print 'ct506', 'sigma_eps', self.sigma_eps, 'lr', self.learning_rate, 'm', self.n_batches

    self.episode_ave_max_q = []

    os.environ["CUDA_VISIBLE_DEVICES"] = ""

    # init session
    self.sess = tf.Session()
    with tf.device("/cpu:0"):
        np.random.seed(self.randomseed)
        tf.set_random_seed(self.randomseed)

        # initialise a replay buffer
        if self.replay_type == 'vanilla':
            self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size, self.randomseed)
        elif self.replay_type == 'prioritized':
            self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size, self.randomseed)
        # replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
        # self.episodes = []
        self.samplecount = 0
        self.episodecount = 0

        # construct the models
        self.state_dim = self.n_in
        self.summaryaction = SummaryAction.SummaryAction(domainString)
        self.action_dim = len(self.summaryaction.action_names)
        action_bound = len(self.summaryaction.action_names)
        self.stats = [0 for _ in range(self.action_dim)]
        self.stdVar = []
        self.meanVar = []
        self.stdMean = []
        self.meanMean = []
        self.td_error = []
        self.td_errorVar = []

        self.bbqn = bbqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, self.learning_rate, self.tau,
                                      action_bound, self.architecture, self.h1_size, self.h2_size, self.n_samples,
                                      self.minibatch_size, self.sigma_prior, self.n_batches, self.stddev_var_mu,
                                      self.stddev_var_logsigma, self.mean_log_sigma, self.importance_sampling,
                                      self.alpha_divergence, self.alpha, self.sigma_eps)

        # when all models are defined, init all variables
        init_op = tf.global_variables_initializer()
        self.sess.run(init_op)

        self.loadPolicy(self.in_policy_file)
        print 'loaded replay size: ', self.episodes[self.domainString].size()

        self.bbqn.update_target_network()
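# The constructors in this file all repeat the same lookup pattern: read a default from the
# general '[dqnpolicy]' config section, then let a domain-specific '[dqnpolicy_<domain>]'
# section override it. The sketch below only illustrates that pattern in one place; the
# helper name '_cfg_lookup' and the 'cast' argument are illustrative and not part of the
# original code, and the classes above are not refactored to use it.
def _cfg_lookup(cfg, option, default, domainString=None, cast='str'):
    """Return a config value, preferring 'dqnpolicy_<domain>' over 'dqnpolicy'."""
    getters = {'str': cfg.get, 'int': cfg.getint, 'float': cfg.getfloat, 'bool': cfg.getboolean}
    get = getters[cast]
    value = default
    if cfg.has_option('dqnpolicy', option):
        value = get('dqnpolicy', option)
    if domainString is not None and cfg.has_option('dqnpolicy_' + domainString, option):
        value = get('dqnpolicy_' + domainString, option)
    return value

# Hypothetical usage: self.gamma = _cfg_lookup(cfg, 'gamma', 1.0, domainString, cast='float')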
def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False): super(ENACPolicy, self).__init__(domainString, is_training) tf.reset_default_graph() self.in_policy_file = in_policy_file self.out_policy_file = out_policy_file self.is_training = is_training self.accum_belief = [] self.prev_state_check = None self.domainString = domainString self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) # parameter settings if 0:#cfg.has_option('dqnpolicy', 'n_in'): #ic304: this was giving me a weird error, disabled it until i can check it deeper self.n_in = cfg.getint('dqnpolicy', 'n_in') else: self.n_in = self.get_n_in(domainString) self.actor_lr = 0.0001 if cfg.has_option('dqnpolicy', 'actor_lr'): self.actor_lr = cfg.getfloat('dqnpolicy', 'actor_lr') self.critic_lr = 0.001 if cfg.has_option('dqnpolicy', 'critic_lr'): self.critic_lr = cfg.getfloat('dqnpolicy', 'critic_lr') self.tau = 0.001 if cfg.has_option('dqnpolicy', 'tau'): self.tau = cfg.getfloat('dqnpolicy', 'tau') self.randomseed = 1234 if cfg.has_option('GENERAL', 'seed'): self.randomseed = cfg.getint('GENERAL', 'seed') self.gamma = 1.0 if cfg.has_option('dqnpolicy', 'gamma'): self.gamma = cfg.getfloat('dqnpolicy', 'gamma') self.regularisation = 'l2' if cfg.has_option('dqnpolicy', 'regularisation'): self.regularisation = cfg.get('dqnpolicy', 'regulariser') self.learning_rate = 0.001 if cfg.has_option('dqnpolicy', 'learning_rate'): self.learning_rate = cfg.getfloat('dqnpolicy', 'learning_rate') self.exploration_type = 'e-greedy' # Boltzman if cfg.has_option('dqnpolicy', 'exploration_type'): self.exploration_type = cfg.get('dqnpolicy', 'exploration_type') self.episodeNum = 1000 if cfg.has_option('dqnpolicy', 'episodeNum'): self.episodeNum = cfg.getfloat('dqnpolicy', 'episodeNum') self.maxiter = 5000 if cfg.has_option('dqnpolicy', 'maxiter'): self.maxiter = cfg.getfloat('dqnpolicy', 'maxiter') self.epsilon = 1 if cfg.has_option('dqnpolicy', 'epsilon'): self.epsilon = cfg.getfloat('dqnpolicy', 'epsilon') self.epsilon_start = 1 if cfg.has_option('dqnpolicy', 'epsilon_start'): self.epsilon_start = cfg.getfloat('dqnpolicy', 'epsilon_start') self.epsilon_end = 1 if cfg.has_option('dqnpolicy', 'epsilon_end'): self.epsilon_end = cfg.getfloat('dqnpolicy', 'epsilon_end') self.priorProbStart = 1.0 if cfg.has_option('dqnpolicy', 'prior_sample_prob_start'): self.priorProbStart = cfg.getfloat('dqnpolicy', 'prior_sample_prob_start') self.priorProbEnd = 0.1 if cfg.has_option('dqnpolicy', 'prior_sample_prob_end'): self.priorProbEnd = cfg.getfloat('dqnpolicy', 'prior_sample_prob_end') self.policyfeatures = [] if cfg.has_option('dqnpolicy', 'features'): logger.info('Features: ' + str(cfg.get('dqnpolicy', 'features'))) self.policyfeatures = json.loads(cfg.get('dqnpolicy', 'features')) self.max_k = 5 if cfg.has_option('dqnpolicy', 'max_k'): self.max_k = cfg.getint('dqnpolicy', 'max_k') self.learning_algorithm = 'drl' if cfg.has_option('dqnpolicy', 'learning_algorithm'): self.learning_algorithm = cfg.get('dqnpolicy', 'learning_algorithm') logger.info('Learning algorithm: ' + self.learning_algorithm) self.minibatch_size = 32 if cfg.has_option('dqnpolicy', 'minibatch_size'): self.minibatch_size = cfg.getint('dqnpolicy', 'minibatch_size') self.capacity = 1000 # max(self.minibatch_size, 2000) if cfg.has_option('dqnpolicy', 'capacity'): self.capacity = cfg.getint('dqnpolicy', 'capacity') self.replay_type = 'vanilla' if cfg.has_option('dqnpolicy', 'replay_type'): self.replay_type = cfg.get('dqnpolicy', 'replay_type') 
self.architecture = 'vanilla' if cfg.has_option('dqnpolicy', 'architecture'): self.architecture = cfg.get('dqnpolicy', 'architecture') self.q_update = 'single' if cfg.has_option('dqnpolicy', 'q_update'): self.q_update = cfg.get('dqnpolicy', 'q_update') self.h1_size = 130 if cfg.has_option('dqnpolicy', 'h1_size'): self.h1_size = cfg.getint('dqnpolicy', 'h1_size') self.h2_size = 50 if cfg.has_option('dqnpolicy', 'h2_size'): self.h2_size = cfg.getint('dqnpolicy', 'h2_size') self.save_step = 200 if cfg.has_option('policy', 'save_step'): self.save_step = cfg.getint('policy', 'save_step') self.importance_sampling = 'soft' if cfg.has_option('dqnpolicy', 'importance_sampling'): self.importance_sampling = cfg.get('dqnpolicy', 'importance_sampling') self.training_frequency = 2 if cfg.has_option('dqnpolicy', 'training_frequency'): self.training_frequency = cfg.getint('dqnpolicy', 'training_frequency') # domain specific parameter settings (overrides general policy parameter settings) if cfg.has_option('dqnpolicy_' + domainString, 'n_in'): self.n_in = cfg.getint('dqnpolicy_' + domainString, 'n_in') if cfg.has_option('dqnpolicy_' + domainString, 'actor_lr'): self.actor_lr = cfg.getfloat('dqnpolicy_' + domainString, 'actor_lr') if cfg.has_option('dqnpolicy_' + domainString, 'critic_lr'): self.critic_lr = cfg.getfloat('dqnpolicy_' + domainString, 'critic_lr') if cfg.has_option('dqnpolicy_' + domainString, 'tau'): self.tau = cfg.getfloat('dqnpolicy_' + domainString, 'tau') if cfg.has_option('dqnpolicy_' + domainString, 'gamma'): self.gamma = cfg.getfloat('dqnpolicy_' + domainString, 'gamma') if cfg.has_option('dqnpolicy_' + domainString, 'regularisation'): self.regularisation = cfg.get('dqnpolicy_' + domainString, 'regulariser') if cfg.has_option('dqnpolicy_' + domainString, 'learning_rate'): self.learning_rate = cfg.getfloat('dqnpolicy_' + domainString, 'learning_rate') if cfg.has_option('dqnpolicy_' + domainString, 'exploration_type'): self.exploration_type = cfg.get('dqnpolicy_' + domainString, 'exploration_type') if cfg.has_option('dqnpolicy_' + domainString, 'episodeNum'): self.episodeNum = cfg.getfloat('dqnpolicy_' + domainString, 'episodeNum') if cfg.has_option('dqnpolicy_' + domainString, 'maxiter'): self.maxiter = cfg.getfloat('dqnpolicy_' + domainString, 'maxiter') if cfg.has_option('dqnpolicy_' + domainString, 'epsilon'): self.epsilon = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon') if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_start'): self.epsilon_start = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_start') if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_end'): self.epsilon_end = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_end') if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_start'): self.priorProbStart = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_start') if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_end'): self.priorProbEnd = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_end') if cfg.has_option('dqnpolicy_' + domainString, 'features'): logger.info('Features: ' + str(cfg.get('dqnpolicy_' + domainString, 'features'))) self.policyfeatures = json.loads(cfg.get('dqnpolicy_' + domainString, 'features')) if cfg.has_option('dqnpolicy_' + domainString, 'max_k'): self.max_k = cfg.getint('dqnpolicy_' + domainString, 'max_k') self.learning_algorithm = 'drl' if cfg.has_option('dqnpolicy_' + domainString, 'learning_algorithm'): self.learning_algorithm = cfg.get('dqnpolicy_' + domainString, 
'learning_algorithm') logger.info('Learning algorithm: ' + self.learning_algorithm) if cfg.has_option('dqnpolicy_' + domainString, 'minibatch_size'): self.minibatch_size = cfg.getint('dqnpolicy_' + domainString, 'minibatch_size') if cfg.has_option('dqnpolicy_' + domainString, 'capacity'): self.capacity = cfg.getint('dqnpolicy_' + domainString, 'capacity') if cfg.has_option('dqnpolicy_' + domainString, 'replay_type'): self.replay_type = cfg.get('dqnpolicy_' + domainString, 'replay_type') if cfg.has_option('dqnpolicy_' + domainString, 'architecture'): self.architecture = cfg.get('dqnpolicy_' + domainString, 'architecture') if cfg.has_option('dqnpolicy_' + domainString, 'q_update'): self.q_update = cfg.get('dqnpolicy_' + domainString, 'q_update') if cfg.has_option('dqnpolicy_' + domainString, 'h1_size'): self.h1_size = cfg.getint('dqnpolicy_' + domainString, 'h1_size') if cfg.has_option('dqnpolicy_' + domainString, 'h2_size'): self.h2_size = cfg.getint('dqnpolicy_' + domainString, 'h2_size') if cfg.has_option('policy_' + domainString, 'save_step'): self.save_step = cfg.getint('policy_' + domainString, 'save_step') if cfg.has_option('dqnpolicy_' + domainString, 'importance_sampling'): self.importance_sampling = cfg.get('dqnpolicy_' + domainString, 'importance_sampling') if cfg.has_option('dqnpolicy_' + domainString, 'training_frequency'): self.training_frequency = cfg.getint('dqnpolicy_' + domainString, 'training_frequency') self.natural_gradient_prev = 0. """ self.shuffle = False if cfg.has_option('dqnpolicy_'+domainString, 'experience_replay'): self.shuffle = cfg.getboolean('dqnpolicy_'+domainString, 'experience_replay') if not self.shuffle: # If we don't use experience replay, we don't need to maintain # sliding window of experiences with maximum capacity. # We only need to maintain the data of minibatch_size self.capacity = self.minibatch_size """ self.episode_ave_max_q = [] self.mu_prob = 0. # behavioral policy os.environ["CUDA_VISIBLE_DEVICES"]="" # init session self.sess = tf.Session() with tf.device("/cpu:0"): np.random.seed(self.randomseed) tf.set_random_seed(self.randomseed) # initialise an replay buffer if self.replay_type == 'vanilla': self.episodes[self.domainString] = ReplayBufferEpisode(self.capacity, self.minibatch_size, self.randomseed) elif self.replay_type == 'prioritized': self.episodes[self.domainString] = ReplayPrioritisedEpisode(self.capacity, self.minibatch_size, self.randomseed) #replay_buffer = ReplayBuffer(self.capacity, self.randomseed) #self.episodes = [] self.samplecount = 0 self.episodecount = 0 # construct the models self.state_dim = self.n_in self.summaryaction = SummaryAction.SummaryAction(domainString) self.action_dim = len(self.summaryaction.action_names) action_bound = len(self.summaryaction.action_names) self.stats = [0 for _ in range(self.action_dim)] self.enac = enac.ENACNetwork(self.sess, self.state_dim, self.action_dim, \ self.critic_lr, self.tau, action_bound, self.architecture, self.h1_size, self.h2_size, self.is_training) # when all models are defined, init all variables init_op = tf.global_variables_initializer() self.sess.run(init_op) self.loadPolicy(self.in_policy_file) print 'loaded replay size: ', self.episodes[self.domainString].size()
def __init__(self, in_policy_file, out_policy_file, ontology, cfg, logger, SetObj, domainString='CamRestaurants', is_training=False): super(RBDQNPolicy, self).__init__(domainString, ontology, cfg, logger, SetObj, is_training) # tf.reset_default_graph() self.domainString = domainString self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString, cfg, ontology.OntologyUtils, SetObj) self.in_policy_file = in_policy_file self.out_policy_file = out_policy_file self.is_training = is_training self.accum_belief = [] self.prev_state_check = None self.ontology = ontology self.logger = logger self.SetObj =SetObj self.atoms = 21 self.vmin = -1 self.vmax = 1 self.support = np.linspace(self.vmin, self.vmax, self.atoms) self.delta_z = float(self.vmax - self.vmin) / (self.atoms - 1) # parameter settings if 0:#cfg.has_option('dqnpolicy', 'n_in'): #ic304: this was giving me a weird error, disabled it until i can check it deeper self.n_in = cfg.getint('dqnpolicy', 'n_in') else: self.n_in = self.get_n_in(domainString) self.learning_rate = 0.001 if cfg.has_option('dqnpolicy', 'learning_rate'): self.learning_rate = cfg.getfloat('dqnpolicy', 'learning_rate') self.tau = 0.001 if cfg.has_option('dqnpolicy', 'tau'): self.tau = cfg.getfloat('dqnpolicy', 'tau') self.randomseed = 1234 if cfg.has_option('GENERAL', 'seed'): self.randomseed = cfg.getint('GENERAL', 'seed') self.gamma = 1.0 if cfg.has_option('dqnpolicy', 'gamma'): self.gamma = cfg.getfloat('dqnpolicy', 'gamma') self.regularisation = 'l2' if cfg.has_option('dqnpolicy', 'regularisation'): self.regularisation = cfg.get('dqnpolicy', 'regulariser') self.exploration_type = 'e-greedy' # Boltzman if cfg.has_option('dqnpolicy', 'exploration_type'): self.exploration_type = cfg.get('dqnpolicy', 'exploration_type') self.episodeNum = 1000 if cfg.has_option('dqnpolicy', 'episodeNum'): self.episodeNum = cfg.getfloat('dqnpolicy', 'episodeNum') self.maxiter = 5000 if cfg.has_option('dqnpolicy', 'maxiter'): self.maxiter = cfg.getfloat('dqnpolicy', 'maxiter') self.epsilon = 0.0 # if cfg.has_option('dqnpolicy', 'epsilon'): # self.epsilon = cfg.getfloat('dqnpolicy', 'epsilon') self.epsilon_start = 0.0 # if cfg.has_option('dqnpolicy', 'epsilon_start'): # self.epsilon_start = cfg.getfloat('dqnpolicy', 'epsilon_start') self.epsilon_end = 0.0 if cfg.has_option('dqnpolicy', 'epsilon_end'): self.epsilon_end = cfg.getfloat('dqnpolicy', 'epsilon_end') self.save_step = 100 if cfg.has_option('policy', 'save_step'): self.save_step = cfg.getint('policy', 'save_step') self.priorProbStart = 1.0 if cfg.has_option('dqnpolicy', 'prior_sample_prob_start'): self.priorProbStart = cfg.getfloat('dqnpolicy', 'prior_sample_prob_start') self.priorProbEnd = 0.1 if cfg.has_option('dqnpolicy', 'prior_sample_prob_end'): self.priorProbEnd = cfg.getfloat('dqnpolicy', 'prior_sample_prob_end') self.policyfeatures = [] if cfg.has_option('dqnpolicy', 'features'): self.logger.info('Features: ' + str(cfg.get('dqnpolicy', 'features'))) self.policyfeatures = json.loads(cfg.get('dqnpolicy', 'features')) self.max_k = 5 if cfg.has_option('dqnpolicy', 'max_k'): self.max_k = cfg.getint('dqnpolicy', 'max_k') self.learning_algorithm = 'drl' if cfg.has_option('dqnpolicy', 'learning_algorithm'): self.learning_algorithm = cfg.get('dqnpolicy', 'learning_algorithm') self.logger.info('Learning algorithm: ' + self.learning_algorithm) self.minibatch_size = 32 if cfg.has_option('dqnpolicy', 'minibatch_size'): self.minibatch_size = cfg.getint('dqnpolicy', 'minibatch_size') self.capacity = 1000 # max(self.minibatch_size, 2000) if 
cfg.has_option('dqnpolicy', 'capacity'): self.capacity = max(cfg.getint('dqnpolicy', 'capacity'), 2000) self.replay_type = 'prioritized' if cfg.has_option('dqnpolicy', 'replay_type'): self.replay_type = cfg.get('dqnpolicy', 'replay_type') self.architecture = 'vanilla' if cfg.has_option('dqnpolicy', 'architecture'): self.architecture = cfg.get('dqnpolicy', 'architecture') self.q_update = 'double' if cfg.has_option('dqnpolicy', 'q_update'): self.q_update = cfg.get('dqnpolicy', 'q_update') self.h1_size = 130 if cfg.has_option('dqnpolicy', 'h1_size'): self.h1_size = cfg.getint('dqnpolicy', 'h1_size') self.h1_drop = None if cfg.has_option('dqnpolicy', 'h1_drop'): self.h1_drop = cfg.getfloat('dqnpolicy', 'h1_drop') self.h2_size = 130 if cfg.has_option('dqnpolicy', 'h2_size'): self.h2_size = cfg.getint('dqnpolicy', 'h2_size') self.h2_drop = None if cfg.has_option('dqnpolicy', 'h2_drop'): self.h2_drop = cfg.getfloat('dqnpolicy', 'h2_drop') self.nature_mode = None if cfg.has_option('dqnpolicy', 'nature_mode'): self.nature_mode = cfg.getboolean('dqnpolicy', 'nature_mode') self.madqn_hidden_layers = None if cfg.has_option('dqnpolicy', 'madqn_hidden_layers'): self.madqn_hidden_layers = cfg.getint('dqnpolicy', 'madqn_hidden_layers') self.madqn_local_hidden_units = None if cfg.has_option('dqnpolicy', 'madqn_local_hidden_units'): self.madqn_local_hidden_units = cfg.get('dqnpolicy', 'madqn_local_hidden_units') self.madqn_local_hidden_units = eval(self.madqn_local_hidden_units) self.madqn_local_dropouts = None if cfg.has_option('dqnpolicy', 'madqn_local_dropouts'): self.madqn_local_dropouts = cfg.get('dqnpolicy', 'madqn_local_dropouts') self.madqn_local_dropouts = eval(self.madqn_local_dropouts) self.madqn_global_hidden_units = None if cfg.has_option('dqnpolicy', 'madqn_global_hidden_units'): self.madqn_global_hidden_units = cfg.get('dqnpolicy', 'madqn_global_hidden_units') self.madqn_global_hidden_units = eval(self.madqn_global_hidden_units) self.madqn_global_dropouts = None if cfg.has_option('dqnpolicy', 'madqn_global_dropouts'): self.madqn_global_dropouts = cfg.get('dqnpolicy', 'madqn_global_dropouts') self.madqn_global_dropouts = eval(self.madqn_global_dropouts) self.madqn_private_rate = None if cfg.has_option('dqnpolicy', 'madqn_private_rate'): self.madqn_private_rate = cfg.getfloat('dqnpolicy', 'madqn_private_rate') self.madqn_sort_input_vec = False if cfg.has_option('dqnpolicy', 'madqn_sort_input_vec'): self.madqn_sort_input_vec = cfg.getboolean('dqnpolicy', 'madqn_sort_input_vec') self.madqn_share_last_layer = False if cfg.has_option('dqnpolicy', 'madqn_share_last_layer'): self.madqn_share_last_layer = cfg.getboolean('dqnpolicy', 'madqn_share_last_layer') self.madqn_shared_last_layer_use_bias = True if cfg.has_option('dqnpolicy', 'madqn_shared_last_layer_use_bias'): self.madqn_shared_last_layer_use_bias = cfg.getboolean('dqnpolicy', 'madqn_shared_last_layer_use_bias') self.madqn_recurrent_mode = False if cfg.has_option('dqnpolicy', 'madqn_recurrent_mode'): self.madqn_recurrent_mode = cfg.getboolean('dqnpolicy', 'madqn_recurrent_mode') self.madqn_input_comm = True if cfg.has_option('dqnpolicy', 'madqn_input_comm'): self.madqn_input_comm = cfg.getboolean('dqnpolicy', 'madqn_input_comm') self.madqn_target_explore = False if cfg.has_option('dqnpolicy', 'madqn_target_explore'): self.madqn_target_explore = cfg.getboolean('dqnpolicy', 'madqn_target_explore') self.madqn_concrete_share_rate = False if cfg.has_option('dqnpolicy', 'madqn_concrete_share_rate'): self.madqn_concrete_share_rate = 
cfg.getboolean('dqnpolicy', 'madqn_concrete_share_rate') self.madqn_dropout_regularizer = 0. if cfg.has_option('dqnpolicy', 'madqn_dropout_regularizer'): self.madqn_dropout_regularizer = cfg.getfloat('dqnpolicy', 'madqn_dropout_regularizer') self.madqn_weight_regularizer = 0. if cfg.has_option('dqnpolicy', 'madqn_weight_regularizer'): self.madqn_weight_regularizer = cfg.getfloat('dqnpolicy', 'madqn_weight_regularizer') self.madqn_non_local_mode = False if cfg.has_option('dqnpolicy', 'madqn_non_local_mode'): self.madqn_non_local_mode = cfg.getboolean('dqnpolicy', 'madqn_non_local_mode') self.madqn_block_mode = False if cfg.has_option('dqnpolicy', 'madqn_block_mode'): self.madqn_block_mode = cfg.getboolean('dqnpolicy', 'madqn_block_mode') self.madqn_slots_comm = True if cfg.has_option('dqnpolicy', 'madqn_slots_comm'): self.madqn_slots_comm = cfg.getboolean('dqnpolicy', 'madqn_slots_comm') self.madqn_use_dueling = False if cfg.has_option('dqnpolicy', 'madqn_use_dueling'): self.madqn_use_dueling = cfg.getboolean('dqnpolicy', 'madqn_use_dueling') self.madqn_topo_learning_mode = False if cfg.has_option('dqnpolicy', 'madqn_topo_learning_mode'): self.madqn_topo_learning_mode = cfg.getboolean('dqnpolicy', 'madqn_topo_learning_mode') self.madqn_message_embedding = False if cfg.has_option('dqnpolicy', 'madqn_message_embedding'): self.madqn_message_embedding = cfg.getboolean('dqnpolicy', 'madqn_message_embedding') self.madqn_dueling_share_last = False if cfg.has_option('dqnpolicy', 'madqn_dueling_share_last'): self.madqn_dueling_share_last = cfg.getboolean('dqnpolicy', 'madqn_dueling_share_last') self.state_feature = 'vanilla' if cfg.has_option('dqnpolicy', 'state_feature'): self.state_feature = cfg.get('dqnpolicy', 'state_feature') self.init_policy = None if cfg.has_option('dqnpolicy', 'init_policy'): self.init_policy = cfg.get('dqnpolicy', 'init_policy') self.training_frequency = 2 if cfg.has_option('dqnpolicy', 'training_frequency'): self.training_frequency = cfg.getint('dqnpolicy', 'training_frequency') # domain specific parameter settings (overrides general policy parameter settings) if cfg.has_option('dqnpolicy_' + domainString, 'n_in'): self.n_in = cfg.getint('dqnpolicy_' + domainString, 'n_in') if cfg.has_option('dqnpolicy_' + domainString, 'learning_rate'): self.learning_rate = cfg.getfloat('dqnpolicy_' + domainString, 'learning_rate') if cfg.has_option('dqnpolicy_' + domainString, 'tau'): self.tau = cfg.getfloat('dqnpolicy_' + domainString, 'tau') if cfg.has_option('dqnpolicy_' + domainString, 'gamma'): self.gamma = cfg.getfloat('dqnpolicy_' + domainString, 'gamma') if cfg.has_option('dqnpolicy_' + domainString, 'regularisation'): self.regularisation = cfg.get('dqnpolicy_' + domainString, 'regulariser') if cfg.has_option('dqnpolicy_' + domainString, 'exploration_type'): self.exploration_type = cfg.get('dqnpolicy_' + domainString, 'exploration_type') if cfg.has_option('dqnpolicy_' + domainString, 'episodeNum'): self.episodeNum = cfg.getfloat('dqnpolicy_' + domainString, 'episodeNum') if cfg.has_option('dqnpolicy_' + domainString, 'maxiter'): self.maxiter = cfg.getfloat('dqnpolicy_' + domainString, 'maxiter') if cfg.has_option('dqnpolicy_' + domainString, 'epsilon'): self.epsilon = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon') if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_start'): self.epsilon_start = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_start') if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_end'): self.epsilon_end = cfg.getfloat('dqnpolicy_' + 
domainString, 'epsilon_end') if cfg.has_option('policy_' + domainString, 'save_step'): self.save_step = cfg.getint('policy_' + domainString, 'save_step') if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_start'): self.priorProbStart = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_start') if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_end'): self.priorProbEnd = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_end') if cfg.has_option('dqnpolicy_' + domainString, 'features'): self.logger.info('Features: ' + str(cfg.get('dqnpolicy_' + domainString, 'features'))) self.policyfeatures = json.loads(cfg.get('dqnpolicy_' + domainString, 'features')) if cfg.has_option('dqnpolicy_' + domainString, 'max_k'): self.max_k = cfg.getint('dqnpolicy_' + domainString, 'max_k') if cfg.has_option('dqnpolicy_' + domainString, 'learning_algorithm'): self.learning_algorithm = cfg.get('dqnpolicy_' + domainString, 'learning_algorithm') self.logger.info('Learning algorithm: ' + self.learning_algorithm) if cfg.has_option('dqnpolicy_' + domainString, 'minibatch_size'): self.minibatch_size = cfg.getint('dqnpolicy_' + domainString, 'minibatch_size') if cfg.has_option('dqnpolicy_' + domainString, 'capacity'): self.capacity = max(cfg.getint('dqnpolicy_' + domainString, 'capacity'), 2000) if cfg.has_option('dqnpolicy_' + domainString, 'replay_type'): self.replay_type = cfg.get('dqnpolicy_' + domainString, 'replay_type') if cfg.has_option('dqnpolicy_' + domainString, 'architecture'): self.architecture = cfg.get('dqnpolicy_' + domainString, 'architecture') if cfg.has_option('dqnpolicy_' + domainString, 'q_update'): self.q_update = cfg.get('dqnpolicy_' + domainString, 'q_update') if cfg.has_option('dqnpolicy_' + domainString, 'h1_size'): self.h1_size = cfg.getint('dqnpolicy_' + domainString, 'h1_size') if cfg.has_option('dqnpolicy_' + domainString, 'h1_drop'): self.h1_drop = cfg.getfloat('dqnpolicy_' + domainString, 'h1_drop') if cfg.has_option('dqnpolicy_' + domainString, 'h2_size'): self.h2_size = cfg.getint('dqnpolicy_' + domainString, 'h2_size') if cfg.has_option('dqnpolicy_' + domainString, 'h2_drop'): self.h2_drop = cfg.getfloat('dqnpolicy_' + domainString, 'h2_drop') if cfg.has_option('dqnpolicy_' + domainString, 'training_frequency'): self.training_frequency = cfg.getint('dqnpolicy_' + domainString, 'training_frequency') """ self.shuffle = False if cfg.has_option('dqnpolicy_'+domainString, 'experience_replay'): self.shuffle = cfg.getboolean('dqnpolicy_'+domainString, 'experience_replay') if not self.shuffle: # If we don't use experience replay, we don't need to maintain # sliding window of experiences with maximum capacity. 
# We only need to maintain the data of minibatch_size
        self.capacity = self.minibatch_size
    """

    self.episode_ave_max_q = []

    os.environ["CUDA_VISIBLE_DEVICES"] = ""

    # init session
    # self.sess = tf.Session()
    # with tf.device("/cpu:0"):
    np.random.seed(self.randomseed)
    # tf.set_random_seed(self.randomseed)

    # initialise a replay buffer
    if self.replay_type == 'vanilla':
        self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size, self.randomseed)
    elif self.replay_type == 'prioritized':
        self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size, self.randomseed)
    self.samplecount = 0
    self.episodecount = 0

    # construct the models
    self.state_dim = self.n_in
    self.summaryaction = SummaryAction.SummaryAction(domainString, self.ontology, self.SetObj)
    self.action_dim = len(self.summaryaction.action_names)
    action_bound = len(self.summaryaction.action_names)
    self.stats = [0 for _ in range(self.action_dim)]

    import tube
    self.dqn = dqn.DeepRBQNetwork(self.state_dim, self.action_dim, self.atoms,
                                  self.learning_rate, self.tau, action_bound, self.minibatch_size,
                                  self.architecture, self.h1_size, self.h1_drop, self.h2_size, self.h2_drop,
                                  self.domainString, self.madqn_hidden_layers, self.madqn_local_hidden_units,
                                  self.madqn_local_dropouts, self.madqn_global_hidden_units,
                                  self.madqn_global_dropouts, self.madqn_private_rate, self.madqn_sort_input_vec,
                                  self.madqn_share_last_layer, self.madqn_recurrent_mode, self.madqn_input_comm,
                                  self.madqn_target_explore,
                                  concrete_share_rate=self.madqn_concrete_share_rate,
                                  dropout_regularizer=self.madqn_dropout_regularizer,
                                  weight_regularizer=self.madqn_weight_regularizer,
                                  non_local_mode=self.madqn_non_local_mode,
                                  block_mode=self.madqn_block_mode,
                                  slots_comm=self.madqn_slots_comm,
                                  topo_learning_mode=self.madqn_topo_learning_mode,
                                  use_dueling=self.madqn_use_dueling,
                                  dueling_share_last=self.madqn_dueling_share_last,
                                  message_embedding=self.madqn_message_embedding,
                                  state_feature=self.state_feature,
                                  init_policy=self.init_policy,
                                  shared_last_layer_use_bias=self.madqn_shared_last_layer_use_bias,
                                  seed=tube.seed)

    # when all models are defined, init all variables
    # init_op = tf.global_variables_initializer()
    # self.sess.run(init_op)

    lock.acquire()
    self.loadPolicy(self.in_policy_file)
    lock.release()
    print('###################################################')
    print(self.domainString + ' loaded replay size: ' + str(self.episodes[self.domainString].size()))

    # globalEpisodeCount = copy.deepcopy(Settings.get_count())
    # globalEpisodeCount != 0:
    lock.acquire()
    # self.dqn.update_target_network()
    self._savePolicyInc()
    lock.release()
    Settings.load_policy(self.dqn, threading.currentThread().getName())
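# The atoms/vmin/vmax/support/delta_z fields set up in this constructor are the usual
# ingredients of a categorical (C51-style) value distribution: the network outputs a
# probability mass over a fixed support for every action, and a scalar Q-value is recovered
# as the expectation over that support. The sketch below only illustrates that read-out;
# 'expected_q', 'greedy_action' and 'probs' are hypothetical names, and the real projection
# and training logic lives in dqn.DeepRBQNetwork, not here.
import numpy as np

atoms, vmin, vmax = 21, -1, 1
support = np.linspace(vmin, vmax, atoms)   # the fixed return support z_1 ... z_N, shape (atoms,)

def expected_q(probs):
    """probs: (num_actions, atoms) distribution per action -> (num_actions,) Q-values."""
    return (probs * support).sum(axis=1)

def greedy_action(probs):
    """Pick the action whose expected return under the distribution is largest."""
    return int(np.argmax(expected_q(probs)))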
def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False): super(FeudalPolicy, self).__init__(domainString, is_training) self.domainString = domainString self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) self.in_policy_file = in_policy_file self.out_policy_file = out_policy_file self.is_training = is_training self.prev_state_check = None #feudalRL variables self.prev_sub_policy = None self.prev_master_act = None self.prev_master_belief = None self.prev_child_act = None self.prev_child_belief = None self.action_freq = np.zeros(len(self.actions.action_names)) self.master_dec_count = np.array([0., 0.]) self.gi_dec_inrow = 0 self.features = 'dip' if cfg.has_option('feudalpolicy', 'features'): self.features = cfg.get('feudalpolicy', 'features') self.si_policy_type = 'dqn' if cfg.has_option('feudalpolicy', 'si_policy_type'): self.si_policy_type = cfg.get('feudalpolicy', 'si_policy_type') self.sd_policy_type = 'dqn' if cfg.has_option('feudalpolicy', 'sd_policy_type'): self.sd_policy_type = cfg.get('feudalpolicy', 'sd_policy_type') self.master_policy_type = self.si_policy_type if cfg.has_option('feudalpolicy', 'master_policy_type'): self.master_policy_type = cfg.get('feudalpolicy', 'master_policy_type') self.sample_master = False if cfg.has_option('feudalpolicy', 'sample_master'): self.sample_master = cfg.getboolean('feudalpolicy', 'sample_master') self.correct_master = False if cfg.has_option('feudalpolicy', 'correct_master'): self.correct_master = cfg.getboolean('feudalpolicy', 'correct_master') self.use_bye = False if cfg.has_option('feudalpolicy', 'use_bye'): self.use_bye = cfg.getboolean('feudalpolicy', 'use_bye') self.reqmore_in_si = True if cfg.has_option('feudalpolicy', 'reqmore_in_si'): self.reqmore_in_si = cfg.getboolean('feudalpolicy', 'reqmore_in_si') self.correction_factor = 0 if cfg.has_option('feudalpolicy', 'correction_factor'): self.correction_factor = cfg.getfloat('feudalpolicy', 'correction_factor') self.actfreq_ds = False if cfg.has_option('feudalpolicy', 'actfreq_ds'): self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds') # parameter settings self.randomseed = 1234 if cfg.has_option('GENERAL', 'seed'): self.randomseed = cfg.getint('GENERAL', 'seed') # Create the feudal structure (including feudal masks) self.summaryaction = SummaryAction.SummaryAction(domainString) self.full_action_list = self.summaryaction.action_names self.master_actions = ['give_info', 'request_info', 'pass'] self.slot_independent_actions = [ "inform", "inform_byname", "inform_alternatives" ] if self.reqmore_in_si: self.slot_independent_actions.append("reqmore") if self.use_bye: self.slot_independent_actions.append('bye') self.slot_independent_actions.append('pass') self.slot_specific_actions = ["request", "confirm", "select"] #if self.reqmore_in_sd is True: # self.slot_specific_actions.append("reqmore") self.slot_specific_actions.append('pass') self.master_freq = np.zeros(len(self.master_actions)) self.si_freq = np.zeros(len(self.slot_independent_actions)) self.sd_freq = np.zeros(len(self.slot_specific_actions)) # master policy if self.master_policy_type == 'acer': self.master_policy = FeudalACERPolicy( self._modify_policyfile('master', in_policy_file), self._modify_policyfile('master', out_policy_file), domainString=self.domainString, is_training=self.is_training, action_names=['give_info', 'request_info', 'pass'], slot='si' ) # pass is always masked, but its needed for implementation elif self.master_policy_type == 'enac': 
self.master_policy = FeudalENACPolicy( self._modify_policyfile('master', in_policy_file), self._modify_policyfile('master', out_policy_file), domainString=self.domainString, is_training=self.is_training, action_names=['give_info', 'request_info', 'pass'], slot='si' ) # pass is always masked, but its needed for implementation elif self.master_policy_type == 'bbqn': self.master_policy = FeudalBBQNPolicy( self._modify_policyfile('master', in_policy_file), self._modify_policyfile('master', out_policy_file), domainString=self.domainString, is_training=self.is_training, action_names=['give_info', 'request_info', 'pass'], slot='si' ) # pass is always masked, but its needed for implementation else: self.master_policy = FeudalDQNPolicy( self._modify_policyfile('master', in_policy_file), self._modify_policyfile('master', out_policy_file), domainString=self.domainString, is_training=self.is_training, action_names=['give_info', 'request_info', 'pass'], slot='si' ) # pass is always masked, but its needed for implementation # si policy if self.si_policy_type == 'acer': self.give_info_policy = FeudalACERPolicy( self._modify_policyfile('gi', in_policy_file), self._modify_policyfile('gi', out_policy_file), domainString=self.domainString, is_training=self.is_training, action_names=self.slot_independent_actions, slot='si') elif self.si_policy_type == 'enac': self.give_info_policy = FeudalENACPolicy( self._modify_policyfile('gi', in_policy_file), self._modify_policyfile('gi', out_policy_file), domainString=self.domainString, is_training=self.is_training, action_names=self.slot_independent_actions, slot='si') elif self.si_policy_type == 'bbqn': self.give_info_policy = FeudalBBQNPolicy( self._modify_policyfile('gi', in_policy_file), self._modify_policyfile('gi', out_policy_file), domainString=self.domainString, is_training=self.is_training, action_names=self.slot_independent_actions, slot='si') else: self.give_info_policy = FeudalDQNPolicy( self._modify_policyfile('gi', in_policy_file), self._modify_policyfile('gi', out_policy_file), domainString=self.domainString, is_training=self.is_training, action_names=self.slot_independent_actions, slot='si') # sd policies if self.sd_policy_type == 'acer': self.request_info_policy = FeudalACERPolicy( self._modify_policyfile('ri', in_policy_file), self._modify_policyfile('ri', out_policy_file), domainString=self.domainString, is_training=self.is_training, action_names=self.slot_specific_actions, slot='sd') elif self.sd_policy_type == 'bbqn': self.request_info_policy = FeudalBBQNPolicy( self._modify_policyfile('ri', in_policy_file), self._modify_policyfile('ri', out_policy_file), domainString=self.domainString, is_training=self.is_training, action_names=self.slot_specific_actions, slot='sd') else: self.request_info_policy = FeudalDQNPolicy( self._modify_policyfile('ri', in_policy_file), self._modify_policyfile('ri', out_policy_file), domainString=self.domainString, is_training=self.is_training, action_names=self.slot_specific_actions, slot='sd')
def __init__(self, agent_id='Smith', hub_id='dialogueserver'):
    # Define all variables in __init__:
    self.prompt_str = None
    self.reward = None
    self.currentTurn = None
    self.maxTurns = None
    self.ENDING_DIALOG = None
    self.SUBJECTIVE_RETRIEVAL_ATTEMPS = None
    self.TASK_RETRIEVAL_ATTEMPTS = None
    self.constraints = None
    self.task = None
    self.taskId = None
    self.subjective = None
    self.session_id = None
    self.callValidator = CallValidator()
    self.prev_state = None
    self.prev_statexx = None
    self.predloss = None

    # hardcoded to include slots for specific actions (request, confirm, select)
    self.action_names = []
    self.action_names += ["request(food)", "request(area)", "request(pricerange)",
                          "confirm(food)", "confirm(area)", "confirm(pricerange)",
                          "select(food)", "select(area)", "select(pricerange)",
                          "inform", "inform_byname", "inform_alternatives",
                          "bye", "repeat", "reqmore", "restart"]

    # DEFAULTS:
    # meta params - note these define the 'state' of the dialogue, along with those defined in restart_agent()
    assert(hub_id in ['texthub', 'simulate', 'dialogueserver'])
    self.hub_id = hub_id  # defines certain behaviour of the agent. One of [texthub, simulate, dialogueserver]
    self.agent_id = agent_id
    self.NUM_DIALOGS = 0
    self.SYSTEM_CAN_HANGUP = False
    self.SAVE_FREQUENCY = 10  # save the policy after multiples of this many dialogues
    self.MAX_TURNS_PROMPT = "The dialogue has finished due to too many turns"
    self.NO_ASR_MSG = "I am afraid I did not understand. Could you please repeat that."
    self.maxTurns_per_domain = 30
    self.traceDialog = 2
    self.sim_level = 'dial_act'
    self.pre_trg = False

    # CONFIGS:
    if Settings.config.has_option('agent', 'savefrequency'):
        self.SAVE_FREQUENCY = Settings.config.getint('agent', 'savefrequency')
    if Settings.config.has_option("agent", "systemcanhangup"):
        self.SYSTEM_CAN_HANGUP = Settings.config.getboolean("agent", "systemcanhangup")
    if Settings.config.has_option("agent", "maxturns"):
        self.maxTurns_per_domain = Settings.config.getint("agent", "maxturns")
    if Settings.config.has_option("GENERAL", "tracedialog"):
        self.traceDialog = Settings.config.getint("GENERAL", "tracedialog")
    if Settings.config.has_option("usermodel", "simlevel"):
        self.sim_level = Settings.config.get("usermodel", "simlevel")

    # TOPIC TRACKING:
    #-----------------------------------------
    self.topic_tracker = TopicTracking.TopicTrackingManager()

    # SemI + Belief tracker
    self.semi_belief_manager = self._load_manger('semanticbelieftrackingmanager',
                                                 'semanticbelieftracking.SemanticBeliefTrackingManager.SemanticBeliefTrackingManager')

    # Policy.
    #-----------------------------------------
    self.policy_manager = self._load_manger('policymanager', 'policy.PolicyManager.PolicyManager')

    # SemO.
    #-----------------------------------------
    if self.hub_id == 'simulate':
        # may or may not have NLG in simulate (default is not to)
        generate_prompts = False
        if Settings.config.has_option('simulate', 'generateprompts'):
            generate_prompts = Settings.config.getboolean('simulate', 'generateprompts')
    else:
        generate_prompts = True  # default for Texthub and DialogueServer
    if generate_prompts:
        self.semo_manager = self._load_manger('semomanager', 'semo.SemOManager.SemOManager')
    else:
        self.semo_manager = None

    # Evaluation Manager.
#-----------------------------------------
    self.evaluation_manager = self._load_manger('evaluationmanager', 'evaluation.EvaluationManager.EvaluationManager')

    # Restart components - NB: inefficient - will be called again before 1st dialogue - but enables _logical_requirements()
    self.restart_agent(session_id=None)

    # Finally, enforce some cross module requirements:
    self._logical_requirements()

    self.domainUtil = FlatOnt.FlatDomainOntology(self.topic_tracker.operatingDomain)
import ontology.FlatOntologyManager as FlatOnt
# from theano_dialogue.util.tool import *
import tensorflow as tf
from DRL.replay_bufferVanilla import ReplayBuffer
from DRL.replay_prioritisedVanilla import ReplayPrioritised
import DRL.utils as drlutils
import DRL.concrete_dqn as dqn
import Policy
import SummaryAction
from Policy import TerminalAction, TerminalState

logger = utils.ContextLogger.getLogger('')

# --- for flattening the belief ---
# domainUtil = FlatOnt.FlatDomainOntology('CamRestaurants')


def flatten_belief(belief, domainUtil, merge=False):
    belief = belief.getDomainState(domainUtil.domainString)
    if isinstance(belief, TerminalState):
        if domainUtil.domainString == 'CamRestaurants':
            return [0] * 268
        elif domainUtil.domainString == 'CamHotels':
            return [0] * 111
        elif domainUtil.domainString == 'SFRestaurants':
            return [0] * 633
        elif domainUtil.domainString == 'SFHotels':
            return [0] * 438
        elif domainUtil.domainString == 'Laptops11':
            return [0] * 257
def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False, action_names=None): super(MORLPolicy, self).__init__(domainString, is_training) self.domainString = domainString self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) self.in_policy_file = in_policy_file self.out_policy_file = out_policy_file self.is_training = is_training self.accum_belief = [] self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) self.prev_state_check = None # parameter settings if 0: # cfg.has_option('morlpolicy', 'n_in'): #ic304: this was giving me a weird error, disabled it until i can check it deeper self.n_in = cfg.getint('morlpolicy', 'n_in') else: self.n_in = self.get_n_in(domainString) self.n_rew = 1 if cfg.has_option('morlpolicy', 'n_rew'): self.n_rew = cfg.getint('morlpolicy', 'n_rew') self.lr = 0.001 if cfg.has_option('morlpolicy', 'learning_rate'): self.lr = cfg.getfloat('morlpolicy', 'learning_rate') self.epsilon = 0.5 if cfg.has_option('morlpolicy', 'epsilon'): self.epsilon = cfg.getfloat('morlpolicy', 'epsilon') self.epsilon_decay = True if cfg.has_option('morlpolicy', 'epsilon_decay'): self.epsilon_decay = cfg.getboolean('morlpolicy', 'epsilon_decay') self.randomseed = 1234 if cfg.has_option('GENERAL', 'seed'): self.randomseed = cfg.getint('GENERAL', 'seed') self.gamma = 1.0 if cfg.has_option('morlpolicy', 'gamma'): self.gamma = cfg.getfloat('morlpolicy', 'gamma') self.weight_num = 32 if cfg.has_option('morlpolicy', 'weight_num'): self.weight_num = cfg.getint('morlpolicy', 'weight_num') self.episode_num = 1000 if cfg.has_option('morlpolicy', 'episode_num'): self.episode_num = cfg.getfloat('morlpolicy', 'episode_num') self.optimizer = "Adam" if cfg.has_option('morlpolicy', 'optimizer'): self.optimizer = cfg.get('morlpolicy', 'optimizer') self.save_step = 100 if cfg.has_option('policy', 'save_step'): self.save_step = cfg.getint('policy', 'save_step') self.update_freq = 50 if cfg.has_option('morlpolicy', 'update_freq'): self.update_freq = cfg.getint('morlpolicy', 'update_freq') self.policyfeatures = [] if cfg.has_option('morlpolicy', 'features'): logger.info('Features: ' + str(cfg.get('morlpolicy', 'features'))) self.policyfeatures = json.loads(cfg.get('morlpolicy', 'features')) self.algorithm = 'naive' if cfg.has_option('morlpolicy', 'algorithm'): self.algorithm = cfg.get('morlpolicy', 'algorithm') logger.info('Learning algorithm: ' + self.algorithm) self.batch_size = 32 if cfg.has_option('morlpolicy', 'batch_size'): self.batch_size = cfg.getint('morlpolicy', 'batch_size') self.mem_size = 1000 if cfg.has_option('morlpolicy', 'mem_size'): self.mem_size = cfg.getint('morlpolicy', 'mem_size') self.training_freq = 1 if cfg.has_option('morlpolicy', 'training_freq'): self.training_freq = cfg.getint('morlpolicy', 'training_freq') # set beta for envelope algorithm self.beta = 0.1 if cfg.has_option('morlpolicy', 'beta'): self.beta = cfg.getfloat('morlpolicy', 'beta') self.beta_init = self.beta self.beta_uplim = 1.00 self.tau = 1000. self.beta_expbase = float( np.power(self.tau * (self.beta_uplim - self.beta), 1. 
/ (self.episode_num + 1)))
    self.beta_delta = self.beta_expbase / self.tau
    self.beta -= self.beta_delta

    # using homotopy method for optimization
    self.homotopy = False
    if cfg.has_option('morlpolicy', 'homotopy'):
        self.homotopy = cfg.getboolean('morlpolicy', 'homotopy')

    self.epsilon_delta = (self.epsilon - 0.05) / self.episode_num
    self.episodecount = 0

    # construct the models
    self.state_dim = self.n_in
    self.summaryaction = SummaryAction.SummaryAction(domainString)
    if action_names is None:
        self.action_names = self.summaryaction.action_names
    else:
        self.action_names = action_names
    self.action_dim = len(self.action_names)
    self.stats = [0 for _ in range(self.action_dim)]
    self.reward_dim = self.n_rew

    model = None
    if self.algorithm == 'naive':
        model = naive.NaiveLinearCQN(self.state_dim, self.action_dim, self.reward_dim)
    elif self.algorithm == 'envelope':
        model = envelope.EnvelopeLinearCQN(self.state_dim, self.action_dim, self.reward_dim)
    self.model_ = model
    self.model = copy.deepcopy(model)

    # initialize memory
    self.trans_mem = deque()
    self.trans = namedtuple('trans', ['s', 'a', 's_', 'r', 'd', 'ms', 'ms_'])
    self.priority_mem = deque()

    self.mem_last_state = None
    self.mem_last_action = None
    self.mem_last_mask = None
    self.mem_cur_state = None
    self.mem_cur_action = None
    self.mem_cur_mask = None

    if self.optimizer == 'Adam':
        self.optimizer = optim.Adam(self.model_.parameters(), lr=self.lr)
    elif self.optimizer == 'RMSprop':
        self.optimizer = optim.RMSprop(self.model_.parameters(), lr=self.lr)

    try:
        self.loadPolicy(self.in_policy_file)
    except:
        logger.info("No previous model found...")

    self.w_kept = None
    self.update_count = 0

    if self.is_training:
        self.model_.train()

    if use_cuda:
        self.model.cuda()
        self.model_.cuda()

    self.monitor = None
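# The replay memory initialised above stores transitions as a namedtuple with fields
# ('s', 'a', 's_', 'r', 'd', 'ms', 'ms_'). A minimal sketch of how one experience could be
# pushed is given below, assuming 'r' is the multi-objective reward vector and 'ms'/'ms_'
# are the action masks for the current/next state (field meanings inferred, not confirmed
# by this file); 'push_transition' is an illustrative name, not part of the original class.
from collections import namedtuple, deque

trans = namedtuple('trans', ['s', 'a', 's_', 'r', 'd', 'ms', 'ms_'])
trans_mem = deque()

def push_transition(s, a, s_, r, d, ms, ms_, mem_size=1000):
    trans_mem.append(trans(s, a, s_, r, d, ms, ms_))
    while len(trans_mem) > mem_size:
        trans_mem.popleft()   # drop the oldest experience once capacity is exceeded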
def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False, action_names=None): super(DQNPolicy, self).__init__(domainString, is_training) tf.reset_default_graph() self.domainString = domainString self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) self.in_policy_file = in_policy_file self.out_policy_file = out_policy_file self.is_training = is_training self.accum_belief = [] self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) self.prev_state_check = None # pw: Use turn info for predictions # action vector creation action_names = [ ] # hardcoded to include slots for specific actions (request, confirm, select) action_names += [ "request(food)", "request(area)", "request(pricerange)", "confirm(food)", "confirm(area)", "confirm(pricerange)", "select(food)", "select(area)", "select(pricerange)", "inform", "inform_byname", "inform_alternatives", "bye", "repeat", "reqmore", "restart" ] num_actions = len(action_names) self.prev_state = None # parameter settings if 0: #cfg.has_option('dqnpolicy', 'n_in'): #ic304: this was giving me a weird error, disabled it until i can check it deeper self.n_in = cfg.getint('dqnpolicy', 'n_in') else: self.n_in = self.get_n_in(domainString) self.learning_rate = 0.001 if cfg.has_option('dqnpolicy', 'learning_rate'): self.learning_rate = cfg.getfloat('dqnpolicy', 'learning_rate') self.tau = 0.001 if cfg.has_option('dqnpolicy', 'tau'): self.tau = cfg.getfloat('dqnpolicy', 'tau') self.randomseed = 1234 if cfg.has_option('GENERAL', 'seed'): self.randomseed = cfg.getint('GENERAL', 'seed') self.gamma = 1.0 if cfg.has_option('dqnpolicy', 'gamma'): self.gamma = cfg.getfloat('dqnpolicy', 'gamma') self.regularisation = 'l2' if cfg.has_option('dqnpolicy', 'regularisation'): self.regularisation = cfg.get('dqnpolicy', 'regulariser') self.exploration_type = 'e-greedy' # Boltzman if cfg.has_option('dqnpolicy', 'exploration_type'): self.exploration_type = cfg.get('dqnpolicy', 'exploration_type') self.episodeNum = 1000 if cfg.has_option('dqnpolicy', 'episodeNum'): self.episodeNum = cfg.getfloat('dqnpolicy', 'episodeNum') self.maxiter = 5000 if cfg.has_option('dqnpolicy', 'maxiter'): self.maxiter = cfg.getfloat('dqnpolicy', 'maxiter') self.epsilon = 1 if cfg.has_option('dqnpolicy', 'epsilon'): self.epsilon = cfg.getfloat('dqnpolicy', 'epsilon') self.epsilon_start = 1 if cfg.has_option('dqnpolicy', 'epsilon_start'): self.epsilon_start = cfg.getfloat('dqnpolicy', 'epsilon_start') self.epsilon_end = 1 if cfg.has_option('dqnpolicy', 'epsilon_end'): self.epsilon_end = cfg.getfloat('dqnpolicy', 'epsilon_end') self.save_step = 100 if cfg.has_option('policy', 'save_step'): self.save_step = cfg.getint('policy', 'save_step') self.priorProbStart = 1.0 if cfg.has_option('dqnpolicy', 'prior_sample_prob_start'): self.priorProbStart = cfg.getfloat('dqnpolicy', 'prior_sample_prob_start') self.priorProbEnd = 0.1 if cfg.has_option('dqnpolicy', 'prior_sample_prob_end'): self.priorProbEnd = cfg.getfloat('dqnpolicy', 'prior_sample_prob_end') self.policyfeatures = [] if cfg.has_option('dqnpolicy', 'features'): logger.info('Features: ' + str(cfg.get('dqnpolicy', 'features'))) self.policyfeatures = json.loads(cfg.get('dqnpolicy', 'features')) self.max_k = 5 if cfg.has_option('dqnpolicy', 'max_k'): self.max_k = cfg.getint('dqnpolicy', 'max_k') self.learning_algorithm = 'drl' if cfg.has_option('dqnpolicy', 'learning_algorithm'): self.learning_algorithm = cfg.get('dqnpolicy', 'learning_algorithm') logger.info('Learning algorithm: ' + 
self.learning_algorithm) self.minibatch_size = 32 if cfg.has_option('dqnpolicy', 'minibatch_size'): self.minibatch_size = cfg.getint('dqnpolicy', 'minibatch_size') self.capacity = 1000 if cfg.has_option('dqnpolicy', 'capacity'): self.capacity = cfg.getint('dqnpolicy', 'capacity') self.replay_type = 'vanilla' if cfg.has_option('dqnpolicy', 'replay_type'): self.replay_type = cfg.get('dqnpolicy', 'replay_type') self.architecture = 'vanilla' if cfg.has_option('dqnpolicy', 'architecture'): self.architecture = cfg.get('dqnpolicy', 'architecture') if self.architecture == 'dip': self.architecture = 'dip2' self.q_update = 'single' if cfg.has_option('dqnpolicy', 'q_update'): self.q_update = cfg.get('dqnpolicy', 'q_update') self.h1_size = 130 if cfg.has_option('dqnpolicy', 'h1_size'): self.h1_size = cfg.getint('dqnpolicy', 'h1_size') self.h2_size = 130 if cfg.has_option('dqnpolicy', 'h2_size'): self.h2_size = cfg.getint('dqnpolicy', 'h2_size') self.training_frequency = 2 if cfg.has_option('dqnpolicy', 'training_frequency'): self.training_frequency = cfg.getint('dqnpolicy', 'training_frequency') # domain specific parameter settings (overrides general policy parameter settings) if cfg.has_option('dqnpolicy_' + domainString, 'n_in'): self.n_in = cfg.getint('dqnpolicy_' + domainString, 'n_in') if cfg.has_option('dqnpolicy_' + domainString, 'learning_rate'): self.learning_rate = cfg.getfloat('dqnpolicy_' + domainString, 'learning_rate') if cfg.has_option('dqnpolicy_' + domainString, 'tau'): self.tau = cfg.getfloat('dqnpolicy_' + domainString, 'tau') if cfg.has_option('dqnpolicy_' + domainString, 'gamma'): self.gamma = cfg.getfloat('dqnpolicy_' + domainString, 'gamma') if cfg.has_option('dqnpolicy_' + domainString, 'regularisation'): self.regularisation = cfg.get('dqnpolicy_' + domainString, 'regulariser') if cfg.has_option('dqnpolicy_' + domainString, 'exploration_type'): self.exploration_type = cfg.get('dqnpolicy_' + domainString, 'exploration_type') if cfg.has_option('dqnpolicy_' + domainString, 'episodeNum'): self.episodeNum = cfg.getfloat('dqnpolicy_' + domainString, 'episodeNum') if cfg.has_option('dqnpolicy_' + domainString, 'maxiter'): self.maxiter = cfg.getfloat('dqnpolicy_' + domainString, 'maxiter') if cfg.has_option('dqnpolicy_' + domainString, 'epsilon'): self.epsilon = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon') if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_start'): self.epsilon_start = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_start') if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_end'): self.epsilon_end = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_end') if cfg.has_option('policy_' + domainString, 'save_step'): self.save_step = cfg.getint('policy_' + domainString, 'save_step') if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_start'): self.priorProbStart = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_start') if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_end'): self.priorProbEnd = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_end') if cfg.has_option('dqnpolicy_' + domainString, 'features'): logger.info('Features: ' + str(cfg.get('dqnpolicy_' + domainString, 'features'))) self.policyfeatures = json.loads( cfg.get('dqnpolicy_' + domainString, 'features')) if cfg.has_option('dqnpolicy_' + domainString, 'max_k'): self.max_k = cfg.getint('dqnpolicy_' + domainString, 'max_k') if cfg.has_option('dqnpolicy_' + domainString, 'learning_algorithm'): self.learning_algorithm = 
cfg.get('dqnpolicy_' + domainString, 'learning_algorithm') logger.info('Learning algorithm: ' + self.learning_algorithm) if cfg.has_option('dqnpolicy_' + domainString, 'minibatch_size'): self.minibatch_size = cfg.getint('dqnpolicy_' + domainString, 'minibatch_size') if cfg.has_option('dqnpolicy_' + domainString, 'capacity'): self.capacity = cfg.getint('dqnpolicy_' + domainString, 'capacity') if cfg.has_option('dqnpolicy_' + domainString, 'replay_type'): self.replay_type = cfg.get('dqnpolicy_' + domainString, 'replay_type') if cfg.has_option('dqnpolicy_' + domainString, 'architecture'): self.architecture = cfg.get('dqnpolicy_' + domainString, 'architecture') if cfg.has_option('dqnpolicy_' + domainString, 'q_update'): self.q_update = cfg.get('dqnpolicy_' + domainString, 'q_update') if cfg.has_option('dqnpolicy_' + domainString, 'h1_size'): self.h1_size = cfg.getint('dqnpolicy_' + domainString, 'h1_size') if cfg.has_option('dqnpolicy_' + domainString, 'h2_size'): self.h2_size = cfg.getint('dqnpolicy_' + domainString, 'h2_size') if cfg.has_option('dqnpolicy_' + domainString, 'training_frequency'): self.training_frequency = cfg.getint('dqnpolicy_' + domainString, 'training_frequency') """ self.shuffle = False if cfg.has_option('dqnpolicy_'+domainString, 'experience_replay'): self.shuffle = cfg.getboolean('dqnpolicy_'+domainString, 'experience_replay') if not self.shuffle: # If we don't use experience replay, we don't need to maintain # sliding window of experiences with maximum capacity. # We only need to maintain the data of minibatch_size self.capacity = self.minibatch_size """ self.episode_ave_max_q = [] os.environ["CUDA_VISIBLE_DEVICES"] = "" policytype = 'dqn' self.dropout_rate = 0. if cfg.has_option('dqnpolicy', 'dropout_rate'): self.dropout_rate = cfg.getfloat('dqnpolicy', 'dropout_rate') if cfg.has_option('policy', 'policytype'): policytype = cfg.get('policy', 'policytype') if policytype != 'feudal': # init session self.sess = tf.Session() with tf.device("/cpu:0"): np.random.seed(self.randomseed) tf.set_random_seed(self.randomseed) # initialise an replay buffer if self.replay_type == 'vanilla': self.episodes[self.domainString] = ReplayBuffer( self.capacity, self.minibatch_size, self.randomseed) elif self.replay_type == 'prioritized': self.episodes[self.domainString] = ReplayPrioritised( self.capacity, self.minibatch_size, self.randomseed) self.samplecount = 0 self.episodecount = 0 # construct the models self.state_dim = self.n_in if self.architecture == 'dip2': self.state_dim = 89 self.summaryaction = SummaryAction.SummaryAction(domainString) if action_names is None: self.action_names = self.summaryaction.action_names else: self.action_names = action_names self.action_dim = len(self.action_names) action_bound = len(self.action_names) self.stats = [0 for _ in range(self.action_dim)] self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, \ self.learning_rate, self.tau, action_bound, self.minibatch_size, self.architecture, self.h1_size, self.h2_size, dropout_rate=self.dropout_rate) # when all models are defined, init all variables init_op = tf.global_variables_initializer() self.sess.run(init_op) self.loadPolicy(self.in_policy_file) print 'loaded replay size: ', self.episodes[ self.domainString].size() self.dqn.update_target_network()
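# --- illustrative sketch (not part of the original policy code) ---
# The constructor above repeats the same pattern dozens of times: set a default,
# override it if the option exists in the general 'dqnpolicy' section, then
# override it again from the domain-specific 'dqnpolicy_<domain>' section. A
# minimal helper capturing that pattern, assuming a ConfigParser-style `cfg`
# object (the name `cfg_get` is hypothetical and not used by the original code):
def cfg_get(cfg, sections, option, default, getter='get'):
    """Return `option` from the last section that defines it, else `default`."""
    value = default
    for section in sections:
        if cfg.has_option(section, option):
            value = getattr(cfg, getter)(section, option)
    return value
# Example:
# lr = cfg_get(cfg, ['dqnpolicy', 'dqnpolicy_' + domainString],
#              'learning_rate', 0.001, getter='getfloat')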
    def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False,
                 action_names=None, slot=None):
        super(FeudalACERPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training)
        tf.reset_default_graph()

        self.in_policy_file = in_policy_file
        self.out_policy_file = out_policy_file
        self.is_training = is_training
        self.accum_belief = []
        self.prev_state_check = None

        self.domainString = domainString
        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)

        self.features = 'dip'
        self.sd_enc_size = 80
        self.si_enc_size = 40
        self.dropout_rate = 0.
        if cfg.has_option('feudalpolicy', 'features'):
            self.features = cfg.get('feudalpolicy', 'features')
        if cfg.has_option('feudalpolicy', 'sd_enc_size'):
            self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size')
        if cfg.has_option('feudalpolicy', 'si_enc_size'):
            self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size')
        # dropout is only enabled during training; note the option is looked up in
        # 'dqnpolicy' but its value is read from 'feudalpolicy'
        if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training:
            self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate')

        self.actfreq_ds = False
        if cfg.has_option('feudalpolicy', 'actfreq_ds'):
            self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds')

        # init session
        self.sess = tf.Session()
        with tf.device("/cpu:0"):
            np.random.seed(self.randomseed)
            tf.set_random_seed(self.randomseed)

            # initialise a replay buffer
            if self.replay_type == 'vanilla':
                self.episodes[self.domainString] = ReplayBufferEpisode(self.capacity, self.minibatch_size, self.randomseed)
            elif self.replay_type == 'prioritized':
                self.episodes[self.domainString] = ReplayPrioritisedEpisode(self.capacity, self.minibatch_size, self.randomseed)
            #replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
            #self.episodes = []
            self.samplecount = 0
            self.episodecount = 0

            # construct the models
            self.state_dim = 89  # current DIP state dim
            self.summaryaction = policy.SummaryAction.SummaryAction(domainString)
            self.action_names = action_names
            self.action_dim = len(self.action_names)
            action_bound = len(self.action_names)
            self.stats = [0 for _ in range(self.action_dim)]
            self.global_mu = [0. for _ in range(self.action_dim)]

            if self.features == 'dip':
                if self.actfreq_ds:
                    if self.domainString == 'CamRestaurants':
                        self.state_dim += 9  # 16
                    elif self.domainString == 'SFRestaurants':
                        self.state_dim += 9  # 25
                    elif self.domainString == 'Laptops11':
                        self.state_dim += 9  # 40
                self.acer = acer.ACERNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr,
                                             self.delta, self.c, self.alpha, self.h1_size, self.h2_size,
                                             self.is_training)
            elif self.features == 'learned' or self.features == 'rnn':
                si_state_dim = 72
                if self.actfreq_ds:
                    if self.domainString == 'CamRestaurants':
                        si_state_dim += 9  # 16
                    elif self.domainString == 'SFRestaurants':
                        si_state_dim += 9  # 25
                    elif self.domainString == 'Laptops11':
                        si_state_dim += 9  # 40
                if self.domainString == 'CamRestaurants':
                    sd_state_dim = 158  # 94
                elif self.domainString == 'SFRestaurants':
                    sd_state_dim = 158
                elif self.domainString == 'Laptops11':
                    sd_state_dim = 158  # 13
                else:
                    # just find out the size of sd_state_dim for the new domain
                    logger.error('Domain {} not implemented in feudal-DQN yet'.format(self.domainString))

                if 0:  # self.features == 'rnn':
                    self.acer = acer.RNNACERNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim,
                                                    self.critic_lr, self.delta, self.c, self.alpha, self.h1_size,
                                                    self.h2_size, self.is_training, sd_enc_size=25, si_enc_size=25,
                                                    dropout_rate=0., tn='normal', slot='si')
                else:
                    self.state_dim = si_state_dim + sd_state_dim
                    self.acer = acer.ACERNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr,
                                                 self.delta, self.c, self.alpha, self.h1_size, self.h2_size,
                                                 self.is_training)
            else:
                logger.error('features "{}" not implemented'.format(self.features))

            # when all models are defined, init all variables
            init_op = tf.global_variables_initializer()
            self.sess.run(init_op)

            self.loadPolicy(self.in_policy_file)
            print 'loaded replay size: ', self.episodes[self.domainString].size()
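# --- illustrative sketch (not part of the original policy code) ---
# When `actfreq_ds` is enabled, the constructor widens the state vector by a fixed
# number of action-frequency features; the same offset (9) is used for every listed
# domain. A compact, dictionary-based version of that adjustment, where the helper
# name `actfreq_offset` and the table name are hypothetical:
ACTFREQ_EXTRA_DIMS = {'CamRestaurants': 9, 'SFRestaurants': 9, 'Laptops11': 9}

def actfreq_offset(domain_string, extras=ACTFREQ_EXTRA_DIMS):
    """Extra state dimensions added when action-frequency features are on."""
    return extras.get(domain_string, 0)
# Example: state_dim = 89 + actfreq_offset('CamRestaurants')  # -> 98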
def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False, action_names=None, slot=None): super(FeudalDQNPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training) tf.reset_default_graph() self.domainString = domainString self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) self.in_policy_file = in_policy_file self.out_policy_file = out_policy_file self.is_training = is_training self.accum_belief = [] self.slot = slot self.features = 'dip' if cfg.has_option('feudalpolicy', 'features'): self.features = cfg.get('feudalpolicy', 'features') self.actfreq_ds = False if cfg.has_option('feudalpolicy', 'actfreq_ds'): self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds') self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) self.prev_state_check = None self.max_k = 5 if cfg.has_option('dqnpolicy', 'max_k'): self.max_k = cfg.getint('dqnpolicy', 'max_k') self.capacity *= 4 # capacity for episode methods, multiply it to adjust to turn based methods # init session self.sess = tf.Session() with tf.device("/cpu:0"): np.random.seed(self.randomseed) tf.set_random_seed(self.randomseed) # initialise a replay buffer if self.replay_type == 'vanilla': self.episodes[self.domainString] = ReplayBuffer( self.capacity, self.minibatch_size, self.randomseed) elif self.replay_type == 'prioritized': self.episodes[self.domainString] = ReplayPrioritised( self.capacity, self.minibatch_size, self.randomseed) self.samplecount = 0 self.episodecount = 0 # construct the models self.state_dim = 89 # current DIP state dim self.summaryaction = policy.SummaryAction.SummaryAction( domainString) self.action_names = action_names self.action_dim = len(self.action_names) action_bound = len(self.action_names) self.stats = [0 for _ in range(self.action_dim)] if self.features == 'learned' or self.features == 'rnn': si_state_dim = 72 if self.actfreq_ds: if self.domainString == 'CamRestaurants': si_state_dim += 9 #16 elif self.domainString == 'SFRestaurants': si_state_dim += 9 #25 elif self.domainString == 'Laptops11': si_state_dim += 9 #40 if self.domainString == 'CamRestaurants': sd_state_dim = 158 #94 elif self.domainString == 'SFRestaurants': sd_state_dim = 158 elif self.domainString == 'Laptops11': sd_state_dim = 158 #13 else: logger.error( 'Domain {} not implemented in feudal-DQN yet' ) # just find out the size of sd_state_dim for the new domain self.sd_enc_size = 50 self.si_enc_size = 25 self.dropout_rate = 0. 
                if cfg.has_option('feudalpolicy', 'sd_enc_size'):
                    self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size')
                if cfg.has_option('feudalpolicy', 'si_enc_size'):
                    self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size')
                # dropout is only enabled during training; note the option is looked up in
                # 'dqnpolicy' but its value is read from 'feudalpolicy'
                if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training:
                    self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate')

                self.state_dim = si_state_dim + sd_state_dim
                if self.features == 'learned':
                    self.dqn = dqn.NNFDeepQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim,
                                                   self.learning_rate, self.tau, action_bound, self.minibatch_size,
                                                   self.architecture, self.h1_size, self.h2_size,
                                                   sd_enc_size=self.sd_enc_size, si_enc_size=self.si_enc_size,
                                                   dropout_rate=self.dropout_rate)
                elif self.features == 'rnn':
                    self.dqn = dqn.RNNFDeepQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim,
                                                    self.learning_rate, self.tau, action_bound, self.minibatch_size,
                                                    self.architecture, self.h1_size, self.h2_size,
                                                    sd_enc_size=self.sd_enc_size, si_enc_size=self.si_enc_size,
                                                    dropout_rate=self.dropout_rate, slot=self.slot)
            else:  # self.features = 'dip'
                if self.actfreq_ds:
                    if self.domainString == 'CamRestaurants':
                        self.state_dim += 9  # 16
                    elif self.domainString == 'SFRestaurants':
                        self.state_dim += 9  # 25
                    elif self.domainString == 'Laptops11':
                        self.state_dim += 9  # 40
                self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim,
                                            self.learning_rate, self.tau, action_bound, self.minibatch_size,
                                            self.architecture, self.h1_size, self.h2_size,
                                            dropout_rate=self.dropout_rate)

            # when all models are defined, init all variables (this might need to be sent to the main policy too)
            init_op = tf.global_variables_initializer()
            self.sess.run(init_op)

            self.loadPolicy(self.in_policy_file)
            print 'loaded replay size: ', self.episodes[self.domainString].size()
            self.dqn.update_target_network()
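# --- illustrative sketch (not part of the original policy code) ---
# `update_target_network()` together with the `tau` parameter suggests a soft
# (Polyak-averaged) target update; the actual implementation lives inside the
# `dqn` module and is not shown here. A generic numpy sketch of such an update,
# assuming network weights are exposed as lists of arrays (the function name
# `soft_update` and that weight-list interface are assumptions):
import numpy as np

def soft_update(target_weights, online_weights, tau=0.001):
    """Move each target weight a fraction `tau` towards the online weight."""
    return [tau * w + (1.0 - tau) * tw
            for w, tw in zip(online_weights, target_weights)]
# With a small tau (e.g. 0.001) the target network tracks the online network
# slowly, which stabilises the bootstrapped Q-targets.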
def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False): super(BootstrappedDQNPolicy, self).__init__(domainString, is_training) self.domainString = domainString self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) self.in_policy_file = in_policy_file self.out_policy_file = out_policy_file self.is_training = is_training self.accum_belief = [] self.prev_state_check = None # parameter settings self.n_in = 260 if cfg.has_option('dqnpolicy_' + domainString, 'n_in'): self.n_in = cfg.getint('dqnpolicy_' + domainString, 'n_in') self.actor_lr = 0.0001 if cfg.has_option('dqnpolicy_' + domainString, 'actor_lr'): self.actor_lr = cfg.getfloat('dqnpolicy_' + domainString, 'actor_lr') self.critic_lr = 0.001 if cfg.has_option('dqnpolicy_' + domainString, 'critic_lr'): self.critic_lr = cfg.getfloat('dqnpolicy_' + domainString, 'critic_lr') self.tau = 0.001 if cfg.has_option('dqnpolicy_' + domainString, 'tau'): self.tau = cfg.getfloat('dqnpolicy_' + domainString, 'tau') self.randomseed = 1234 if cfg.has_option('GENERAL', 'seed'): self.randomseed = cfg.getint('GENERAL', 'seed') self.gamma = 1.0 if cfg.has_option('dqnpolicy_' + domainString, 'gamma'): self.gamma = cfg.getfloat('dqnpolicy_' + domainString, 'gamma') self.regularisation = 'l2' if cfg.has_option('dqnpolicy_' + domainString, 'regularisation'): self.regularisation = cfg.get('dqnpolicy_' + domainString, 'regulariser') self.learning_rate = 0.001 if cfg.has_option('dqnpolicy_' + domainString, 'learning_rate'): self.learning_rate = cfg.getfloat('dqnpolicy_' + domainString, 'learning_rate') self.exploration_type = 'e-greedy' # Boltzman if cfg.has_option('dqnpolicy_' + domainString, 'exploration_type'): self.exploration_type = cfg.get('dqnpolicy_' + domainString, 'exploration_type') self.episodeNum = 1000 if cfg.has_option('dqnpolicy_' + domainString, 'episodeNum'): self.episodeNum = cfg.getfloat('dqnpolicy_' + domainString, 'episodeNum') self.maxiter = 5000 if cfg.has_option('dqnpolicy_' + domainString, 'maxiter'): self.maxiter = cfg.getfloat('dqnpolicy_' + domainString, 'maxiter') self.epsilon = 1 if cfg.has_option('dqnpolicy_' + domainString, 'epsilon'): self.epsilon = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon') self.epsilon_start = 1 if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_start'): self.epsilon_start = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_start') self.epsilon_end = 1 if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_end'): self.epsilon_end = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_end') self.priorProbStart = 1.0 if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_start'): self.priorProbStart = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_start') self.save_step = 100 if cfg.has_option('policy_' + domainString, 'save_step'): self.save_step = cfg.getint('policy_' + domainString, 'save_step') self.priorProbEnd = 0.1 if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_end'): self.priorProbEnd = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_end') self.policyfeatures = [] if cfg.has_option('dqnpolicy_' + domainString, 'features'): logger.info('Features: ' + str(cfg.get('dqnpolicy_' + domainString, 'features'))) self.policyfeatures = json.loads( cfg.get('dqnpolicy_' + domainString, 'features')) self.max_k = 5 if cfg.has_option('dqnpolicy_' + domainString, 'max_k'): self.max_k = cfg.getint('dqnpolicy_' + domainString, 'max_k') self.learning_algorithm = 'drl' if cfg.has_option('dqnpolicy_' + 
domainString, 'learning_algorithm'): self.learning_algorithm = cfg.get('dqnpolicy_' + domainString, 'learning_algorithm') logger.info('Learning algorithm: ' + self.learning_algorithm) self.minibatch_size = 32 if cfg.has_option('dqnpolicy_' + domainString, 'minibatch_size'): self.minibatch_size = cfg.getint('dqnpolicy_' + domainString, 'minibatch_size') self.capacity = 1000 # max(self.minibatch_size, 2000) if cfg.has_option('dqnpolicy_' + domainString, 'capacity'): self.capacity = max( cfg.getint('dqnpolicy_' + domainString, 'capacity'), 2000) self.replay_type = 'vanilla' if cfg.has_option('dqnpolicy_' + domainString, 'replay_type'): self.replay_type = cfg.get('dqnpolicy_' + domainString, 'replay_type') self.architecture = 'vanilla' if cfg.has_option('dqnpolicy_' + domainString, 'architecture'): self.architecture = cfg.get('dqnpolicy_' + domainString, 'architecture') self.q_update = 'single' if cfg.has_option('dqnpolicy_' + domainString, 'q_update'): self.q_update = cfg.get('dqnpolicy_' + domainString, 'q_update') self.h1_size = 130 if cfg.has_option('dqnpolicy_' + domainString, 'h1_size'): self.h1_size = cfg.getint('dqnpolicy_' + domainString, 'h1_size') self.h2_size = 130 if cfg.has_option('dqnpolicy_' + domainString, 'h2_size'): self.h2_size = cfg.getint('dqnpolicy_' + domainString, 'h2_size') self.no_heads = 3 if cfg.has_option('dqnpolicy_' + domainString, 'no_head'): self.no_heads = cfg.getint('dqnpolicy_' + domainString, 'no_head') self.episode_ave_max_q = [] os.environ["CUDA_VISIBLE_DEVICES"] = "" # initialize head self.head = None # init session self.sess = tf.Session() with tf.device("/cpu:0"): np.random.seed(self.randomseed) tf.set_random_seed(self.randomseed) # initialise an replay buffer if self.replay_type == 'vanilla': self.episodes[self.domainString] = ReplayBuffer( self.capacity, self.minibatch_size, self.randomseed) elif self.replay_type == 'prioritized': self.episodes[self.domainString] = ReplayPrioritised( self.capacity, self.minibatch_size, self.randomseed) # replay_buffer = ReplayBuffer(self.capacity, self.randomseed) # self.episodes = [] self.samplecount = 0 self.episodecount = 0 # construct the models self.state_dim = self.n_in self.summaryaction = SummaryAction.SummaryAction(domainString) self.action_dim = len(self.summaryaction.action_names) action_bound = len(self.summaryaction.action_names) self.stats = [0 for _ in range(self.action_dim)] self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, \ self.critic_lr, self.tau, action_bound, self.architecture, self.h1_size, self.h2_size, self.no_heads, self.minibatch_size) # when all models are defined, init all variables init_op = tf.global_variables_initializer() self.sess.run(init_op) self.loadPolicy(self.in_policy_file) print 'loaded replay size: ', self.episodes[ self.domainString].size() for head in range(self.no_heads): self.dqn.update_target_network(head)
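# --- illustrative sketch (not part of the original policy code) ---
# Bootstrapped DQN keeps `no_heads` Q-value heads (`self.head` is initialised to
# None above and the target network is synchronised per head). A standard
# exploration scheme for this architecture is to commit to one randomly chosen
# head per episode; a minimal sketch of that selection step, where the function
# name `sample_head` is hypothetical:
import random

def sample_head(no_heads, rng=random):
    """Pick the index of the Q-head to follow for the next episode."""
    return rng.randint(0, no_heads - 1)
# Example (hypothetical usage): self.head = sample_head(self.no_heads)
# at the start of each dialogue.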
def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False): super(ACERPolicy, self).__init__(domainString, is_training) tf.reset_default_graph() self.in_policy_file = in_policy_file self.out_policy_file = out_policy_file self.is_training = is_training self.accum_belief = [] self.prev_state_check = None self.domainString = domainString self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) #improvement================================== self.intrinsic_reward_method = None if cfg.has_option('scme', 'method'): self.intrinsic_reward_method = cfg.get('scme', 'method') #improvement================================== # parameter settings if 0: # cfg.has_option('dqnpolicy', 'n_in'): #ic304: this was giving me a weird error, disabled it until i can check it deeper self.n_in = cfg.getint('dqnpolicy', 'n_in') else: self.n_in = self.get_n_in(domainString) self.actor_lr = 0.0001 if cfg.has_option('dqnpolicy', 'actor_lr'): self.actor_lr = cfg.getfloat('dqnpolicy', 'actor_lr') self.critic_lr = 0.001 if cfg.has_option('dqnpolicy', 'critic_lr'): self.critic_lr = cfg.getfloat('dqnpolicy', 'critic_lr') self.delta = 1. if cfg.has_option('dqnpolicy', 'delta'): self.delta = cfg.getfloat('dqnpolicy', 'delta') self.alpha = 0.99 if cfg.has_option('dqnpolicy', 'beta'): self.alpha = cfg.getfloat('dqnpolicy', 'beta') self.c = 10. if cfg.has_option('dqnpolicy', 'is_threshold'): self.c = cfg.getfloat('dqnpolicy', 'is_threshold') self.randomseed = 1234 if cfg.has_option('GENERAL', 'seed'): self.randomseed = cfg.getint('GENERAL', 'seed') self.gamma = 0.99 if cfg.has_option('dqnpolicy', 'gamma'): self.gamma = cfg.getfloat('dqnpolicy', 'gamma') self.regularisation = 'l2' if cfg.has_option('dqnpolicy', 'regularisation'): self.regularisation = cfg.get('dqnpolicy', 'regularisation') self.learning_rate = 0.001 if cfg.has_option('dqnpolicy', 'learning_rate'): self.learning_rate = cfg.getfloat('dqnpolicy', 'learning_rate') self.exploration_type = 'e-greedy' # Boltzman if cfg.has_option('dqnpolicy', 'exploration_type'): self.exploration_type = cfg.get('dqnpolicy', 'exploration_type') self.episodeNum = 1000 if cfg.has_option('dqnpolicy', 'episodeNum'): self.episodeNum = cfg.getfloat('dqnpolicy', 'episodeNum') self.maxiter = 5000 if cfg.has_option('dqnpolicy', 'maxiter'): self.maxiter = cfg.getfloat('dqnpolicy', 'maxiter') self.curiosityreward = False if cfg.has_option('eval', 'curiosityreward'): self.curiosityreward = cfg.getboolean('eval', 'curiosityreward') self.epsilon = 1 if cfg.has_option('dqnpolicy', 'epsilon'): self.epsilon = cfg.getfloat('dqnpolicy', 'epsilon') if not self.curiosityreward: # no eps-greedy exploration when curious expl. 
is used self.epsilon_start = 1 if cfg.has_option('dqnpolicy', 'epsilon_start'): self.epsilon_start = cfg.getfloat('dqnpolicy', 'epsilon_start') else: self.epsilon_start = 1 if cfg.has_option('dqnpolicy', 'epsilon_start'): self.epsilon_start = cfg.getfloat('dqnpolicy', 'epsilon_start') self.epsilon_end = 1 if cfg.has_option('dqnpolicy', 'epsilon_end'): self.epsilon_end = cfg.getfloat('dqnpolicy', 'epsilon_end') self.priorProbStart = 1.0 if cfg.has_option('dqnpolicy', 'prior_sample_prob_start'): self.priorProbStart = cfg.getfloat('dqnpolicy', 'prior_sample_prob_start') self.priorProbEnd = 0.1 if cfg.has_option('dqnpolicy', 'prior_sample_prob_end'): self.priorProbEnd = cfg.getfloat('dqnpolicy', 'prior_sample_prob_end') self.policyfeatures = [] if cfg.has_option('dqnpolicy', 'features'): logger.info('Features: ' + str(cfg.get('dqnpolicy', 'features'))) self.policyfeatures = json.loads(cfg.get('dqnpolicy', 'features')) self.max_k = 5 if cfg.has_option('dqnpolicy', 'max_k'): self.max_k = cfg.getint('dqnpolicy', 'max_k') self.learning_algorithm = 'drl' if cfg.has_option('dqnpolicy', 'learning_algorithm'): self.learning_algorithm = cfg.get('dqnpolicy', 'learning_algorithm') logger.info('Learning algorithm: ' + self.learning_algorithm) self.minibatch_size = 32 if cfg.has_option('dqnpolicy', 'minibatch_size'): self.minibatch_size = cfg.getint('dqnpolicy', 'minibatch_size') self.capacity = 1000 if cfg.has_option('dqnpolicy', 'capacity'): self.capacity = cfg.getint('dqnpolicy', 'capacity') self.replay_type = 'vanilla' if cfg.has_option('dqnpolicy', 'replay_type'): self.replay_type = cfg.get('dqnpolicy', 'replay_type') self.architecture = 'vanilla' if cfg.has_option('dqnpolicy', 'architecture'): self.architecture = cfg.get('dqnpolicy', 'architecture') self.q_update = 'single' if cfg.has_option('dqnpolicy', 'q_update'): self.q_update = cfg.get('dqnpolicy', 'q_update') self.h1_size = 130 if cfg.has_option('dqnpolicy', 'h1_size'): self.h1_size = cfg.getint('dqnpolicy', 'h1_size') self.h2_size = 50 if cfg.has_option('dqnpolicy', 'h2_size'): self.h2_size = cfg.getint('dqnpolicy', 'h2_size') self.save_step = 200 if cfg.has_option('policy', 'save_step'): self.save_step = cfg.getint('policy', 'save_step') self.importance_sampling = 'soft' if cfg.has_option('dqnpolicy', 'importance_sampling'): self.importance_sampling = cfg.get('dqnpolicy', 'importance_sampling') self.train_iters_per_episode = 1 if cfg.has_option('dqnpolicy', 'train_iters_per_episode'): self.train_iters_per_episode = cfg.getint( 'dqnpolicy', 'train_iters_per_episode') self.training_frequency = 2 if cfg.has_option('dqnpolicy', 'training_frequency'): self.training_frequency = cfg.getint('dqnpolicy', 'training_frequency') # domain specific parameter settings (overrides general policy parameter settings) if cfg.has_option('dqnpolicy_' + domainString, 'n_in'): self.n_in = cfg.getint('dqnpolicy_' + domainString, 'n_in') if cfg.has_option('dqnpolicy_' + domainString, 'actor_lr'): self.actor_lr = cfg.getfloat('dqnpolicy_' + domainString, 'actor_lr') if cfg.has_option('dqnpolicy_' + domainString, 'critic_lr'): self.critic_lr = cfg.getfloat('dqnpolicy_' + domainString, 'critic_lr') if cfg.has_option('dqnpolicy_' + domainString, 'delta'): self.delta = cfg.getfloat('dqnpolicy_' + domainString, 'delta') if cfg.has_option('dqnpolicy_' + domainString, 'beta'): self.alpha = cfg.getfloat('dqnpolicy_' + domainString, 'beta') if cfg.has_option('dqnpolicy_' + domainString, 'is_threshold'): self.c = cfg.getfloat('dqnpolicy_' + domainString, 'is_threshold') if 
cfg.has_option('dqnpolicy_' + domainString, 'gamma'): self.gamma = cfg.getfloat('dqnpolicy_' + domainString, 'gamma') if cfg.has_option('dqnpolicy_' + domainString, 'regularisation'): self.regularisation = cfg.get('dqnpolicy_' + domainString, 'regulariser') if cfg.has_option('dqnpolicy_' + domainString, 'learning_rate'): self.learning_rate = cfg.getfloat('dqnpolicy_' + domainString, 'learning_rate') if cfg.has_option('dqnpolicy_' + domainString, 'exploration_type'): self.exploration_type = cfg.get('dqnpolicy_' + domainString, 'exploration_type') if cfg.has_option('dqnpolicy_' + domainString, 'episodeNum'): self.episodeNum = cfg.getfloat('dqnpolicy_' + domainString, 'episodeNum') if cfg.has_option('dqnpolicy_' + domainString, 'maxiter'): self.maxiter = cfg.getfloat('dqnpolicy_' + domainString, 'maxiter') if cfg.has_option('dqnpolicy_' + domainString, 'epsilon'): self.epsilon = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon') if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_start'): self.epsilon_start = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_start') if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_end'): self.epsilon_end = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_end') if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_start'): self.priorProbStart = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_start') if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_end'): self.priorProbEnd = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_end') if cfg.has_option('dqnpolicy_' + domainString, 'features'): logger.info('Features: ' + str(cfg.get('dqnpolicy_' + domainString, 'features'))) self.policyfeatures = json.loads( cfg.get('dqnpolicy_' + domainString, 'features')) if cfg.has_option('dqnpolicy_' + domainString, 'max_k'): self.max_k = cfg.getint('dqnpolicy_' + domainString, 'max_k') if cfg.has_option('dqnpolicy_' + domainString, 'learning_algorithm'): self.learning_algorithm = cfg.get('dqnpolicy_' + domainString, 'learning_algorithm') logger.info('Learning algorithm: ' + self.learning_algorithm) if cfg.has_option('dqnpolicy_' + domainString, 'minibatch_size'): self.minibatch_size = cfg.getint('dqnpolicy_' + domainString, 'minibatch_size') if cfg.has_option('dqnpolicy_' + domainString, 'capacity'): self.capacity = cfg.getint('dqnpolicy_' + domainString, 'capacity') if cfg.has_option('dqnpolicy_' + domainString, 'replay_type'): self.replay_type = cfg.get('dqnpolicy_' + domainString, 'replay_type') if cfg.has_option('dqnpolicy_' + domainString, 'architecture'): self.architecture = cfg.get('dqnpolicy_' + domainString, 'architecture') if cfg.has_option('dqnpolicy_' + domainString, 'q_update'): self.q_update = cfg.get('dqnpolicy_' + domainString, 'q_update') if cfg.has_option('dqnpolicy_' + domainString, 'h1_size'): self.h1_size = cfg.getint('dqnpolicy_' + domainString, 'h1_size') if cfg.has_option('dqnpolicy_' + domainString, 'h2_size'): self.h2_size = cfg.getint('dqnpolicy_' + domainString, 'h2_size') if cfg.has_option('policy_' + domainString, 'save_step'): self.save_step = cfg.getint('policy_' + domainString, 'save_step') if cfg.has_option('dqnpolicy_' + domainString, 'importance_sampling'): self.importance_sampling = cfg.get('dqnpolicy_' + domainString, 'importance_sampling') if cfg.has_option('dqnpolicy_' + domainString, 'train_iters_per_episode'): self.train_iters_per_episode = cfg.getint( 'dqnpolicy_' + domainString, 'train_iters_per_episode') if cfg.has_option('dqnpolicy_' + domainString, 
                              'training_frequency'):
            self.training_frequency = cfg.getint('dqnpolicy_' + domainString, 'training_frequency')

        self.episode_ct = 0

        self.episode_ave_max_q = []
        self.mu_prob = 0.  # behavioral policy

        os.environ["CUDA_VISIBLE_DEVICES"] = ""

        # init session
        self.sess = tf.Session()
        with tf.device("/cpu:0"):
            np.random.seed(self.randomseed)
            tf.set_random_seed(self.randomseed)

            # initialise a replay buffer
            if self.replay_type == 'vanilla':
                self.episodes[self.domainString] = ReplayBufferEpisode(self.capacity, self.minibatch_size, self.randomseed)
            elif self.replay_type == 'prioritized':
                self.episodes[self.domainString] = ReplayPrioritisedEpisode(self.capacity, self.minibatch_size, self.randomseed)
            #replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
            #self.episodes = []
            self.samplecount = 0
            self.episodecount = 0

            # construct the models
            self.state_dim = self.n_in
            self.summaryaction = SummaryAction.SummaryAction(domainString)
            self.action_dim = len(self.summaryaction.action_names)
            action_bound = len(self.summaryaction.action_names)
            self.stats = [0 for _ in range(self.action_dim)]
            self.global_mu = [0. for _ in range(self.action_dim)]

            self.acer = acer.ACERNetwork(self.sess, self.state_dim, self.action_dim,
                                         self.critic_lr, self.delta, self.c, self.alpha,
                                         self.h1_size, self.h2_size, self.is_training, self.randomseed)

            # when all models are defined, init all variables
            init_op = tf.global_variables_initializer()
            self.sess.run(init_op)

            self.loadPolicy(self.in_policy_file)
            print 'loaded replay size: ', self.episodes[self.domainString].size()

            #improvement==================================
            #initial
            if self.intrinsic_reward_method == 'vime':
                self.vime_model = vime(self.state_dim, self.action_dim)
                self.vime_model.load_model('model/vime_model/' + self.in_policy_file)
            elif self.intrinsic_reward_method == 'cme':
                self.cme_model = cme(self.state_dim, self.action_dim)
                self.cme_model.load_model('model/cme_model/' + self.in_policy_file)
            elif self.intrinsic_reward_method == 'scme':
                self.scme_model = scme(self.state_dim, self.action_dim)
                self.scme_model.load_model('model/scme_model/' + self.in_policy_file)
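# --- illustrative sketch (not part of the original policy code) ---
# The vime/cme/scme models loaded above provide an intrinsic (curiosity-style)
# reward on top of the task reward. How the two are mixed is decided elsewhere in
# the policy; a common scheme, shown here only as an assumption, is a simple
# weighted sum (the function name `shaped_reward` and the weight `eta` are
# hypothetical):
def shaped_reward(extrinsic, intrinsic, eta=0.1):
    """Task reward plus a scaled exploration bonus."""
    return extrinsic + eta * intrinsic
# Example (hypothetical usage): r = shaped_reward(task_reward, scme_bonus, eta=0.1)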
def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False, action_names=None): super(DQNPolicy, self).__init__(domainString, is_training) tf.reset_default_graph() self.domainString = domainString self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) self.in_policy_file = in_policy_file self.out_policy_file = out_policy_file self.is_training = is_training self.accum_belief = [] self.prev_state_check = None #improvement================================== self.intrinsic_reward_method = None self.conf = ConfigParser.ConfigParser() if utils.Settings.config.has_option('scme', 'method'): self.intrinsic_reward_method = utils.Settings.config.get( 'scme', 'method') #improvement================================== # parameter settings if 0: #cfg.has_option('dqnpolicy', 'n_in'): #ic304: this was giving me a weird error, disabled it until i can check it deeper self.n_in = cfg.getint('dqnpolicy', 'n_in') else: self.n_in = self.get_n_in(domainString) self.learning_rate = 0.001 if utils.Settings.config.has_option('dqnpolicy', 'learning_rate'): self.learning_rate = utils.Settings.config.getfloat( 'dqnpolicy', 'learning_rate') self.tau = 0.001 if utils.Settings.config.has_option('dqnpolicy', 'tau'): self.tau = utils.Settings.config.getfloat('dqnpolicy', 'tau') # self.randomseed = 1234 #TODO cfg import doesn't work anymore therfore i changed all the cfg to u.S.config. # if cfg.has_option('GENERAL', 'seed'): # self.randomseed = cfg.getint('GENERAL', 'seed') #see same below, this is just kept as example to try self.randomseed = 1234 if utils.Settings.config.has_option('GENERAL', 'seed'): self.randomseed = utils.Settings.config.getint('GENERAL', 'seed') self.gamma = 1.0 if utils.Settings.config.has_option('dqnpolicy', 'gamma'): self.gamma = utils.Settings.config.getfloat('dqnpolicy', 'gamma') self.regularisation = 'l2' if utils.Settings.config.has_option('dqnpolicy', 'regularisation'): self.regularisation = utils.Settings.config.get( 'dqnpolicy', 'regulariser') self.exploration_type = 'e-greedy' # Boltzman if utils.Settings.config.has_option('dqnpolicy', 'exploration_type'): self.exploration_type = utils.Settings.config.get( 'dqnpolicy', 'exploration_type') self.episodeNum = 1000 if utils.Settings.config.has_option('dqnpolicy', 'episodeNum'): self.episodeNum = utils.Settings.config.getfloat( 'dqnpolicy', 'episodeNum') self.maxiter = 5000 if utils.Settings.config.has_option('dqnpolicy', 'maxiter'): self.maxiter = utils.Settings.config.getfloat( 'dqnpolicy', 'maxiter') self.epsilon = 1 if utils.Settings.config.has_option('dqnpolicy', 'epsilon'): self.epsilon = utils.Settings.config.getfloat( 'dqnpolicy', 'epsilon') self.epsilon_start = 1 if utils.Settings.config.has_option('dqnpolicy', 'epsilon_start'): self.epsilon_start = utils.Settings.config.getfloat( 'dqnpolicy', 'epsilon_start') self.epsilon_end = 1 if utils.Settings.config.has_option('dqnpolicy', 'epsilon_end'): self.epsilon_end = utils.Settings.config.getfloat( 'dqnpolicy', 'epsilon_end') self.save_step = 100 if utils.Settings.config.has_option('policy', 'save_step'): self.save_step = utils.Settings.config.getint( 'policy', 'save_step') self.priorProbStart = 1.0 if utils.Settings.config.has_option('dqnpolicy', 'prior_sample_prob_start'): self.priorProbStart = utils.Settings.config.getfloat( 'dqnpolicy', 'prior_sample_prob_start') self.priorProbEnd = 0.1 if utils.Settings.config.has_option('dqnpolicy', 'prior_sample_prob_end'): self.priorProbEnd = utils.Settings.config.getfloat( 'dqnpolicy', 
'prior_sample_prob_end') self.policyfeatures = [] if utils.Settings.config.has_option('dqnpolicy', 'features'): logger.info( 'Features: ' + str(utils.Settings.config.get('dqnpolicy', 'features'))) self.policyfeatures = json.loads( utils.Settings.config.get('dqnpolicy', 'features')) self.max_k = 5 if utils.Settings.config.has_option('dqnpolicy', 'max_k'): self.max_k = utils.Settings.config.getint('dqnpolicy', 'max_k') self.learning_algorithm = 'drl' if utils.Settings.config.has_option('dqnpolicy', 'learning_algorithm'): self.learning_algorithm = utils.Settings.config.get( 'dqnpolicy', 'learning_algorithm') logger.info('Learning algorithm: ' + self.learning_algorithm) self.minibatch_size = 32 if utils.Settings.config.has_option('dqnpolicy', 'minibatch_size'): self.minibatch_size = utils.Settings.config.getint( 'dqnpolicy', 'minibatch_size') self.capacity = 1000 if utils.Settings.config.has_option('dqnpolicy', 'capacity'): self.capacity = utils.Settings.config.getint( 'dqnpolicy', 'capacity') self.replay_type = 'vanilla' if utils.Settings.config.has_option('dqnpolicy', 'replay_type'): self.replay_type = utils.Settings.config.get( 'dqnpolicy', 'replay_type') self.architecture = 'vanilla' if utils.Settings.config.has_option('dqnpolicy', 'architecture'): self.architecture = utils.Settings.config.get( 'dqnpolicy', 'architecture') if self.architecture == 'dip': self.architecture = 'dip2' self.q_update = 'single' if utils.Settings.config.has_option('dqnpolicy', 'q_update'): self.q_update = utils.Settings.config.get('dqnpolicy', 'q_update') self.h1_size = 130 if utils.Settings.config.has_option('dqnpolicy', 'h1_size'): self.h1_size = utils.Settings.config.getint('dqnpolicy', 'h1_size') self.h2_size = 130 if utils.Settings.config.has_option('dqnpolicy', 'h2_size'): self.h2_size = utils.Settings.config.getint('dqnpolicy', 'h2_size') self.training_frequency = 2 if utils.Settings.config.has_option('dqnpolicy', 'training_frequency'): self.training_frequency = utils.Settings.config.getint( 'dqnpolicy', 'training_frequency') # domain specific parameter settings (overrides general policy parameter settings) if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'n_in'): self.n_in = utils.Settings.config.getint( 'dqnpolicy_' + domainString, 'n_in') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'learning_rate'): self.learning_rate = utils.Settings.config.getfloat( 'dqnpolicy_' + domainString, 'learning_rate') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'tau'): self.tau = utils.Settings.config.getfloat( 'dqnpolicy_' + domainString, 'tau') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'gamma'): self.gamma = utils.Settings.config.getfloat( 'dqnpolicy_' + domainString, 'gamma') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'regularisation'): self.regularisation = utils.Settings.config.get( 'dqnpolicy_' + domainString, 'regulariser') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'exploration_type'): self.exploration_type = utils.Settings.config.get( 'dqnpolicy_' + domainString, 'exploration_type') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'episodeNum'): self.episodeNum = utils.Settings.config.getfloat( 'dqnpolicy_' + domainString, 'episodeNum') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'maxiter'): self.maxiter = utils.Settings.config.getfloat( 'dqnpolicy_' + domainString, 'maxiter') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'epsilon'): 
self.epsilon = utils.Settings.config.getfloat( 'dqnpolicy_' + domainString, 'epsilon') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'epsilon_start'): self.epsilon_start = utils.Settings.config.getfloat( 'dqnpolicy_' + domainString, 'epsilon_start') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'epsilon_end'): self.epsilon_end = utils.Settings.config.getfloat( 'dqnpolicy_' + domainString, 'epsilon_end') if utils.Settings.config.has_option('policy_' + domainString, 'save_step'): self.save_step = utils.Settings.config.getint( 'policy_' + domainString, 'save_step') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_start'): self.priorProbStart = utils.Settings.config.getfloat( 'dqnpolicy_' + domainString, 'prior_sample_prob_start') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_end'): self.priorProbEnd = utils.Settings.config.getfloat( 'dqnpolicy_' + domainString, 'prior_sample_prob_end') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'features'): logger.info('Features: ' + str( utils.Settings.config.get('dqnpolicy_' + domainString, 'features'))) self.policyfeatures = json.loads( utils.Settings.config.get('dqnpolicy_' + domainString, 'features')) if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'max_k'): self.max_k = utils.Settings.config.getint( 'dqnpolicy_' + domainString, 'max_k') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'learning_algorithm'): self.learning_algorithm = utils.Settings.config.get( 'dqnpolicy_' + domainString, 'learning_algorithm') logger.info('Learning algorithm: ' + self.learning_algorithm) if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'minibatch_size'): self.minibatch_size = utils.Settings.config.getint( 'dqnpolicy_' + domainString, 'minibatch_size') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'capacity'): self.capacity = utils.Settings.config.getint( 'dqnpolicy_' + domainString, 'capacity') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'replay_type'): self.replay_type = utils.Settings.config.get( 'dqnpolicy_' + domainString, 'replay_type') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'architecture'): self.architecture = utils.Settings.config.get( 'dqnpolicy_' + domainString, 'architecture') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'q_update'): self.q_update = utils.Settings.config.get( 'dqnpolicy_' + domainString, 'q_update') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'h1_size'): self.h1_size = utils.Settings.config.getint( 'dqnpolicy_' + domainString, 'h1_size') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'h2_size'): self.h2_size = utils.Settings.config.getint( 'dqnpolicy_' + domainString, 'h2_size') if utils.Settings.config.has_option('dqnpolicy_' + domainString, 'training_frequency'): self.training_frequency = utils.Settings.config.getint( 'dqnpolicy_' + domainString, 'training_frequency') """ self.shuffle = False if cfg.has_option('dqnpolicy_'+domainString, 'experience_replay'): self.shuffle = cfg.getboolean('dqnpolicy_'+domainString, 'experience_replay') if not self.shuffle: # If we don't use experience replay, we don't need to maintain # sliding window of experiences with maximum capacity. 
# We only need to maintain the data of minibatch_size self.capacity = self.minibatch_size """ self.episode_ave_max_q = [] self.curiositypred_loss = [] #os.environ["CUDA_VISIBLE_DEVICES"] = "" policytype = 'dqn' self.dropout_rate = 0. if utils.Settings.config.has_option('dqnpolicy', 'dropout_rate'): self.dropout_rate = utils.Settings.config.getfloat( 'dqnpolicy', 'dropout_rate') if utils.Settings.config.has_option('policy', 'policytype'): policytype = utils.Settings.config.get('policy', 'policytype') if policytype != 'feudal': self.sess = tf.Session() with tf.device("/cpu:0"): np.random.seed(self.randomseed) tf.set_random_seed(self.randomseed) # initialise an replay buffer if self.replay_type == 'vanilla': self.episodes[self.domainString] = ReplayBuffer( self.capacity, self.minibatch_size, self.randomseed) elif self.replay_type == 'prioritized': self.episodes[self.domainString] = ReplayPrioritised( self.capacity, self.minibatch_size, self.randomseed) self.samplecount = 0 self.episodecount = 0 # construct the models self.state_dim = self.n_in if self.architecture == 'dip2': self.state_dim = 89 self.summaryaction = SummaryAction.SummaryAction(domainString) if action_names is None: self.action_names = self.summaryaction.action_names else: self.action_names = action_names self.action_dim = len(self.action_names) action_bound = len(self.action_names) self.stats = [0 for _ in range(self.action_dim)] self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, \ self.learning_rate, self.tau, action_bound, self.minibatch_size, self.architecture, self.h1_size, self.h2_size, dropout_rate=self.dropout_rate) #self.curiosityFunctions = scme(self.sess, self.state_dim, self.action_dim, self.randomseed) # when all models are defined, init all variables init_op = tf.global_variables_initializer() self.sess.run(init_op) self.loadPolicy(self.in_policy_file) print 'loaded replay size: ', self.episodes[ self.domainString].size() #improvement================================== #initial if self.intrinsic_reward_method == 'vime': self.vime_model = vime(self.state_dim, self.action_dim) self.vime_model.load_model('model/vime_model/' + self.in_policy_file) elif self.intrinsic_reward_method == 'cme': self.cme_model = cme(self.state_dim, self.action_dim) self.cme_model.load_model('model/cme_model/' + self.in_policy_file) elif self.intrinsic_reward_method == 'scme': self.scme_model = scme(self.state_dim, self.action_dim) self.scme_model.load_model('model/scme_model/' + self.in_policy_file) #improvement================================== self.dqn.update_target_network()
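# --- illustrative sketch (not part of the original policy code) ---
# With exploration_type 'e-greedy', action selection typically draws a random
# summary action with probability epsilon and otherwise takes the greedy argmax
# over the Q-values produced by self.dqn. The selection logic itself lives in the
# policy's action-selection code, not in this constructor; the sketch below is a
# generic standalone version (all names are hypothetical):
import random

import numpy as np

def epsilon_greedy(q_values, epsilon, rng=random):
    """Return a random action index with probability epsilon, else the greedy one."""
    if rng.random() < epsilon:
        return rng.randint(0, len(q_values) - 1)
    return int(np.argmax(q_values))
# Example (hypothetical usage): a = epsilon_greedy(q, self.epsilon)
# with q a vector of length self.action_dim.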