Example #1
    def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False):
        super(BDQNPolicy, self).__init__(domainString, is_training)

        tf.reset_default_graph()

        self.domainString = domainString
        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
        self.in_policy_file = in_policy_file
        self.out_policy_file = out_policy_file
        self.is_training = is_training
        self.accum_belief = []

        self.prev_state_check = None

        # parameter settings
        self.n_in = 260
        if cfg.has_option('dqnpolicy', 'n_in'):
            self.n_in = cfg.getint('dqnpolicy', 'n_in')

        self.actor_lr = 0.0001
        if cfg.has_option('dqnpolicy', 'actor_lr'):
            self.actor_lr = cfg.getfloat('dqnpolicy', 'actor_lr')

        self.critic_lr = 0.001
        if cfg.has_option('dqnpolicy', 'critic_lr'):
            self.critic_lr = cfg.getfloat('dqnpolicy', 'critic_lr')

        self.tau = 0.001
        if cfg.has_option('dqnpolicy', 'tau'):
            self.tau = cfg.getfloat('dqnpolicy', 'tau')

        self.randomseed = 1234
        if cfg.has_option('GENERAL', 'seed'):
            self.randomseed = cfg.getint('GENERAL', 'seed')

        self.gamma = 1.0
        if cfg.has_option('dqnpolicy', 'gamma'):
            self.gamma = cfg.getfloat('dqnpolicy', 'gamma')

        self.regularisation = 'l2'
        if cfg.has_option('dqnpolicy', 'regularisation'):
            self.regularisation = cfg.get('dqnpolicy', 'regularisation')

        self.learning_rate = 0.001  # ct506 #0.001
        if cfg.has_option('dqnpolicy', 'learning_rate'):
            self.learning_rate = cfg.getfloat('dqnpolicy', 'learning_rate')

        self.exploration_type = 'e-greedy'  # alternative: 'Boltzmann'
        if cfg.has_option('dqnpolicy', 'exploration_type'):
            self.exploration_type = cfg.get('dqnpolicy', 'exploration_type')

        self.episodeNum = 1000
        if cfg.has_option('dqnpolicy', 'episodeNum'):
            self.episodeNum = cfg.getfloat('dqnpolicy', 'episodeNum')

        self.maxiter = 5000
        if cfg.has_option('dqnpolicy', 'maxiter'):
            self.maxiter = cfg.getfloat('dqnpolicy', 'maxiter')

        self.epsilon = 1
        if cfg.has_option('dqnpolicy', 'epsilon'):
            self.epsilon = cfg.getfloat('dqnpolicy', 'epsilon')

        self.epsilon_start = 1
        if cfg.has_option('dqnpolicy', 'epsilon_start'):
            self.epsilon_start = cfg.getfloat('dqnpolicy', 'epsilon_start')

        self.epsilon_end = 1
        if cfg.has_option('dqnpolicy', 'epsilon_end'):
            self.epsilon_end = cfg.getfloat('dqnpolicy', 'epsilon_end')

        self.priorProbStart = 1.0
        if cfg.has_option('dqnpolicy', 'prior_sample_prob_start'):
            self.priorProbStart = cfg.getfloat('dqnpolicy', 'prior_sample_prob_start')

        self.priorProbEnd = 0.1
        if cfg.has_option('dqnpolicy', 'prior_sample_prob_end'):
            self.priorProbEnd = cfg.getfloat('dqnpolicy', 'prior_sample_prob_end')

        self.policyfeatures = []
        if cfg.has_option('dqnpolicy', 'features'):
            logger.info('Features: ' + str(cfg.get('dqnpolicy', 'features')))
            self.policyfeatures = json.loads(cfg.get('dqnpolicy', 'features'))

        self.max_k = 5
        if cfg.has_option('dqnpolicy', 'max_k'):
            self.max_k = cfg.getint('dqnpolicy', 'max_k')

        self.learning_algorithm = 'drl'
        if cfg.has_option('dqnpolicy', 'learning_algorithm'):
            self.learning_algorithm = cfg.get('dqnpolicy', 'learning_algorithm')
            logger.info('Learning algorithm: ' + self.learning_algorithm)

        self.minibatch_size = 32
        if cfg.has_option('dqnpolicy', 'minibatch_size'):
            self.minibatch_size = cfg.getint('dqnpolicy', 'minibatch_size')

        self.capacity = 1000  # max(self.minibatch_size, 2000)
        if cfg.has_option('dqnpolicy', 'capacity'):
            self.capacity = max(cfg.getint('dqnpolicy', 'capacity'), 2000)

        self.replay_type = 'vanilla'
        if cfg.has_option('dqnpolicy', 'replay_type'):
            self.replay_type = cfg.get('dqnpolicy', 'replay_type')

        self.architecture = 'vanilla'
        if cfg.has_option('dqnpolicy', 'architecture'):
            self.architecture = cfg.get('dqnpolicy', 'architecture')

        self.q_update = 'single'
        if cfg.has_option('dqnpolicy', 'q_update'):
            self.q_update = cfg.get('dqnpolicy', 'q_update')

        self.h1_size = 130
        if cfg.has_option('dqnpolicy', 'h1_size'):
            self.h1_size = cfg.getint('dqnpolicy', 'h1_size')

        self.h2_size = 130
        if cfg.has_option('dqnpolicy', 'h2_size'):
            self.h2_size = cfg.getint('dqnpolicy', 'h2_size')

        self.save_step = 200
        if cfg.has_option('policy', 'save_step'):
            self.save_step = cfg.getint('policy', 'save_step')

        # BDQN parameters
        self.n_samples = 1
        if cfg.has_option('dqnpolicy', 'n_samples'):
            self.n_samples = cfg.getint('dqnpolicy', 'n_samples')

        sigma_prior = 1.5  # np.array(-3.0, dtype=np.float32)
        if cfg.has_option('dqnpolicy', 'sigma_prior'):
            sigma_prior = cfg.getfloat('dqnpolicy', 'sigma_prior')
        self.sigma_prior = tf.exp(sigma_prior)  # np.exp(np.array(sigma_prior, dtype=np.float32))

        self.stddev_var_mu = 0.01
        if cfg.has_option('dqnpolicy', 'stddev_var_mu'):
            self.stddev_var_mu = cfg.getfloat('dqnpolicy', 'stddev_var_mu')

        self.stddev_var_logsigma = 0.01
        if cfg.has_option('dqnpolicy', 'stddev_var_logsigma'):
            self.stddev_var_logsigma = cfg.getfloat('dqnpolicy', 'stddev_var_logsigma')

        self.mean_log_sigma = 0.000001
        if cfg.has_option('dqnpolicy', 'mean_log_sigma'):
            self.mean_log_sigma = cfg.getfloat('dqnpolicy', 'mean_log_sigma')

        self.n_batches = 1000.0
        if cfg.has_option('dqnpolicy', 'n_batches'):
            self.n_batches = cfg.getfloat('dqnpolicy', 'n_batches')

        self.importance_sampling = False
        if cfg.has_option('dqnpolicy', 'importance_sampling'):
            self.importance_sampling = cfg.getboolean('dqnpolicy', 'importance_sampling')

        self.alpha = 0.85
        if cfg.has_option('dqnpolicy', 'alpha'):
            self.alpha = cfg.getfloat('dqnpolicy', 'alpha')

        self.alpha_divergence = False
        if cfg.has_option('dqnpolicy', 'alpha_divergence'):
            self.alpha_divergence = cfg.getboolean('dqnpolicy', 'alpha_divergence')

        self.sigma_eps = 0.01
        if cfg.has_option('dqnpolicy', 'sigma_eps'):
            self.sigma_eps = cfg.getfloat('dqnpolicy', 'sigma_eps')

        self.training_frequency = 2
        if cfg.has_option('dqnpolicy', 'training_frequency'):
            self.training_frequency = cfg.getint('dqnpolicy', 'training_frequency')

        # domain specific parameter settings (overrides general policy parameter settings)
        if cfg.has_option('dqnpolicy_' + domainString, 'n_in'):
            self.n_in = cfg.getint('dqnpolicy_' + domainString, 'n_in')

        if cfg.has_option('dqnpolicy_' + domainString, 'actor_lr'):
            self.actor_lr = cfg.getfloat('dqnpolicy_' + domainString, 'actor_lr')

        if cfg.has_option('dqnpolicy_' + domainString, 'critic_lr'):
            self.critic_lr = cfg.getfloat('dqnpolicy_' + domainString, 'critic_lr')

        if cfg.has_option('dqnpolicy_' + domainString, 'tau'):
            self.tau = cfg.getfloat('dqnpolicy_' + domainString, 'tau')

        if cfg.has_option('dqnpolicy_' + domainString, 'gamma'):
            self.gamma = cfg.getfloat('dqnpolicy_' + domainString, 'gamma')

        if cfg.has_option('dqnpolicy_' + domainString, 'regularisation'):
            self.regularisation = cfg.get('dqnpolicy_' + domainString, 'regularisation')

        if cfg.has_option('dqnpolicy_' + domainString, 'learning_rate'):
            self.learning_rate = cfg.getfloat('dqnpolicy_' + domainString, 'learning_rate')

        if cfg.has_option('dqnpolicy_' + domainString, 'exploration_type'):
            self.exploration_type = cfg.get('dqnpolicy_' + domainString, 'exploration_type')

        if cfg.has_option('dqnpolicy_' + domainString, 'episodeNum'):
            self.episodeNum = cfg.getfloat('dqnpolicy_' + domainString, 'episodeNum')

        if cfg.has_option('dqnpolicy_' + domainString, 'maxiter'):
            self.maxiter = cfg.getfloat('dqnpolicy_' + domainString, 'maxiter')

        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon'):
            self.epsilon = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon')

        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_start'):
            self.epsilon_start = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_start')

        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_end'):
            self.epsilon_end = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_end')

        if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_start'):
            self.priorProbStart = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_start')

        if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_end'):
            self.priorProbEnd = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_end')

        if cfg.has_option('dqnpolicy_' + domainString, 'features'):
            logger.info('Features: ' + str(cfg.get('dqnpolicy_' + domainString, 'features')))
            self.policyfeatures = json.loads(cfg.get('dqnpolicy_' + domainString, 'features'))

        if cfg.has_option('dqnpolicy_' + domainString, 'max_k'):
            self.max_k = cfg.getint('dqnpolicy_' + domainString, 'max_k')

        if cfg.has_option('dqnpolicy_' + domainString, 'learning_algorithm'):
            self.learning_algorithm = cfg.get('dqnpolicy_' + domainString, 'learning_algorithm')
            logger.info('Learning algorithm: ' + self.learning_algorithm)

        if cfg.has_option('dqnpolicy_' + domainString, 'minibatch_size'):
            self.minibatch_size = cfg.getint('dqnpolicy_' + domainString, 'minibatch_size')

        if cfg.has_option('dqnpolicy_' + domainString, 'capacity'):
            self.capacity = cfg.getint('dqnpolicy_' + domainString, 'capacity')

        if cfg.has_option('dqnpolicy_' + domainString, 'replay_type'):
            self.replay_type = cfg.get('dqnpolicy_' + domainString, 'replay_type')

        if cfg.has_option('dqnpolicy_' + domainString, 'architecture'):
            self.architecture = cfg.get('dqnpolicy_' + domainString, 'architecture')

        if cfg.has_option('dqnpolicy_' + domainString, 'q_update'):
            self.q_update = cfg.get('dqnpolicy_' + domainString, 'q_update')

        if cfg.has_option('dqnpolicy_' + domainString, 'h1_size'):
            self.h1_size = cfg.getint('dqnpolicy_' + domainString, 'h1_size')

        if cfg.has_option('dqnpolicy_' + domainString, 'h2_size'):
            self.h2_size = cfg.getint('dqnpolicy_' + domainString, 'h2_size')

        if cfg.has_option('policy_' + domainString, 'save_step'):
            self.save_step = cfg.getint('policy_' + domainString, 'save_step')

        # BDQN parameters
        if cfg.has_option('dqnpolicy_' + domainString, 'n_samples'):
            self.n_samples = cfg.getint('dqnpolicy_' + domainString, 'n_samples')

        if cfg.has_option('dqnpolicy_' + domainString, 'sigma_prior'):
            sigma_prior = cfg.getfloat('dqnpolicy_' + domainString, 'sigma_prior')
        self.sigma_prior = tf.exp(sigma_prior)  # np.exp(np.array(sigma_prior, dtype=np.float32))

        if cfg.has_option('dqnpolicy_' + domainString, 'stddev_var_mu'):
            self.stddev_var_mu = cfg.getfloat('dqnpolicy_' + domainString, 'stddev_var_mu')

        if cfg.has_option('dqnpolicy_' + domainString, 'stddev_var_logsigma'):
            self.stddev_var_logsigma = cfg.getfloat('dqnpolicy_' + domainString, 'stddev_var_logsigma')

        if cfg.has_option('dqnpolicy_' + domainString, 'mean_log_sigma'):
            self.mean_log_sigma = cfg.getfloat('dqnpolicy_' + domainString, 'mean_log_sigma')

        if cfg.has_option('dqnpolicy_' + domainString, 'n_batches'):
            self.n_batches = cfg.getfloat('dqnpolicy_' + domainString, 'n_batches')

        if cfg.has_option('dqnpolicy_' + domainString, 'importance_sampling'):
            self.importance_sampling = cfg.getboolean('dqnpolicy_' + domainString, 'importance_sampling')

        if cfg.has_option('dqnpolicy_' + domainString, 'alpha'):
            self.alpha = cfg.getfloat('dqnpolicy_' + domainString, 'alpha')

        if cfg.has_option('dqnpolicy_' + domainString, 'alpha_divergence'):
            self.alpha_divergence = cfg.getboolean('dqnpolicy_' + domainString, 'alpha_divergence')

        if cfg.has_option('dqnpolicy_' + domainString, 'sigma_eps'):
            self.sigma_eps = cfg.getfloat('dqnpolicy_' + domainString, 'sigma_eps')

        if cfg.has_option('dqnpolicy_' + domainString, 'training_frequency'):
            self.training_frequency = cfg.getint('dqnpolicy_' + domainString, 'training_frequency')

        print 'ct506', 'sigma_eps', self.sigma_eps, 'lr', self.learning_rate, 'm', self.n_batches
        self.episode_ave_max_q = []

        os.environ["CUDA_VISIBLE_DEVICES"] = ""

        # init session
        self.sess = tf.Session()
        with tf.device("/cpu:0"):

            np.random.seed(self.randomseed)
            tf.set_random_seed(self.randomseed)

            # initialise a replay buffer
            if self.replay_type == 'vanilla':
                self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size, self.randomseed)
            elif self.replay_type == 'prioritized':
                self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size,
                                                                     self.randomseed)
            # replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
            # self.episodes = []
            self.samplecount = 0
            self.episodecount = 0

            # construct the models
            self.state_dim = self.n_in
            self.summaryaction = SummaryAction.SummaryAction(domainString)
            self.action_dim = len(self.summaryaction.action_names)
            action_bound = len(self.summaryaction.action_names)
            self.stats = [0 for _ in range(self.action_dim)]
            self.stdVar = []
            self.meanVar = []
            self.stdMean = []
            self.meanMean = []
            self.td_error = []
            self.td_errorVar = []

            self.bbqn = bbqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, self.learning_rate, self.tau, action_bound, self.architecture, self.h1_size, self.h2_size, self.n_samples, self.minibatch_size, self.sigma_prior, self.n_batches, self.stddev_var_mu,  self.stddev_var_logsigma, self.mean_log_sigma, self.importance_sampling, self.alpha_divergence, self.alpha, self.sigma_eps)

            # when all models are defined, init all variables
            init_op = tf.global_variables_initializer()
            self.sess.run(init_op)

            self.loadPolicy(self.in_policy_file)
            print 'loaded replay size: ', self.episodes[self.domainString].size()

            self.bbqn.update_target_network()
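
Every hyperparameter above is read with the same cfg.has_option(...) / cfg.get*(...) pair, once for the general 'dqnpolicy' section and once more for the domain-specific 'dqnpolicy_<domain>' section. A minimal helper along the following lines could collapse that boilerplate; it only assumes that cfg behaves like Python's standard ConfigParser, and the helper name cfg_get is illustrative rather than part of the original code:

def cfg_get(cfg, section, option, default, cast='str'):
    """Return the option from `section`, or `default` if it is absent."""
    if not cfg.has_option(section, option):
        return default
    getter = {'int': cfg.getint, 'float': cfg.getfloat,
              'bool': cfg.getboolean, 'str': cfg.get}[cast]
    return getter(section, option)

# Usage mirroring the pattern above, with the domain section overriding the general one:
# self.gamma = cfg_get(cfg, 'dqnpolicy', 'gamma', 1.0, cast='float')
# self.gamma = cfg_get(cfg, 'dqnpolicy_' + domainString, 'gamma', self.gamma, cast='float')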
Example #2
    def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False):
        super(ENACPolicy, self).__init__(domainString, is_training)

        tf.reset_default_graph()

        self.in_policy_file = in_policy_file
        self.out_policy_file = out_policy_file
        self.is_training = is_training
        self.accum_belief = []
        self.prev_state_check = None

        self.domainString = domainString
        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)

        # parameter settings

        if 0:  # cfg.has_option('dqnpolicy', 'n_in')  # ic304: this was giving a weird error; disabled until it can be checked more deeply
            self.n_in = cfg.getint('dqnpolicy', 'n_in')
        else:
            self.n_in = self.get_n_in(domainString)

        self.actor_lr = 0.0001
        if cfg.has_option('dqnpolicy', 'actor_lr'):
            self.actor_lr = cfg.getfloat('dqnpolicy', 'actor_lr')

        self.critic_lr = 0.001
        if cfg.has_option('dqnpolicy', 'critic_lr'):
            self.critic_lr = cfg.getfloat('dqnpolicy', 'critic_lr')

        self.tau = 0.001
        if cfg.has_option('dqnpolicy', 'tau'):
            self.tau = cfg.getfloat('dqnpolicy', 'tau')

        self.randomseed = 1234
        if cfg.has_option('GENERAL', 'seed'):
            self.randomseed = cfg.getint('GENERAL', 'seed')

        self.gamma = 1.0
        if cfg.has_option('dqnpolicy', 'gamma'):
            self.gamma = cfg.getfloat('dqnpolicy', 'gamma')

        self.regularisation = 'l2'
        if cfg.has_option('dqnpolicy', 'regularisation'):
            self.regularisation = cfg.get('dqnpolicy', 'regularisation')

        self.learning_rate = 0.001
        if cfg.has_option('dqnpolicy', 'learning_rate'):
            self.learning_rate = cfg.getfloat('dqnpolicy', 'learning_rate')

        self.exploration_type = 'e-greedy'  # alternative: 'Boltzmann'
        if cfg.has_option('dqnpolicy', 'exploration_type'):
            self.exploration_type = cfg.get('dqnpolicy', 'exploration_type')

        self.episodeNum = 1000
        if cfg.has_option('dqnpolicy', 'episodeNum'):
            self.episodeNum = cfg.getfloat('dqnpolicy', 'episodeNum')

        self.maxiter = 5000
        if cfg.has_option('dqnpolicy', 'maxiter'):
            self.maxiter = cfg.getfloat('dqnpolicy', 'maxiter')

        self.epsilon = 1
        if cfg.has_option('dqnpolicy', 'epsilon'):
            self.epsilon = cfg.getfloat('dqnpolicy', 'epsilon')

        self.epsilon_start = 1
        if cfg.has_option('dqnpolicy', 'epsilon_start'):
            self.epsilon_start = cfg.getfloat('dqnpolicy', 'epsilon_start')

        self.epsilon_end = 1
        if cfg.has_option('dqnpolicy', 'epsilon_end'):
            self.epsilon_end = cfg.getfloat('dqnpolicy', 'epsilon_end')

        self.priorProbStart = 1.0
        if cfg.has_option('dqnpolicy', 'prior_sample_prob_start'):
            self.priorProbStart = cfg.getfloat('dqnpolicy', 'prior_sample_prob_start')

        self.priorProbEnd = 0.1
        if cfg.has_option('dqnpolicy', 'prior_sample_prob_end'):
            self.priorProbEnd = cfg.getfloat('dqnpolicy', 'prior_sample_prob_end')

        self.policyfeatures = []
        if cfg.has_option('dqnpolicy', 'features'):
            logger.info('Features: ' + str(cfg.get('dqnpolicy', 'features')))
            self.policyfeatures = json.loads(cfg.get('dqnpolicy', 'features'))

        self.max_k = 5
        if cfg.has_option('dqnpolicy', 'max_k'):
            self.max_k = cfg.getint('dqnpolicy', 'max_k')

        self.learning_algorithm = 'drl'
        if cfg.has_option('dqnpolicy', 'learning_algorithm'):
            self.learning_algorithm = cfg.get('dqnpolicy', 'learning_algorithm')
            logger.info('Learning algorithm: ' + self.learning_algorithm)

        self.minibatch_size = 32
        if cfg.has_option('dqnpolicy', 'minibatch_size'):
            self.minibatch_size = cfg.getint('dqnpolicy', 'minibatch_size')

        self.capacity = 1000  # max(self.minibatch_size, 2000)
        if cfg.has_option('dqnpolicy', 'capacity'):
            self.capacity = cfg.getint('dqnpolicy', 'capacity')

        self.replay_type = 'vanilla'
        if cfg.has_option('dqnpolicy', 'replay_type'):
            self.replay_type = cfg.get('dqnpolicy', 'replay_type')

        self.architecture = 'vanilla'
        if cfg.has_option('dqnpolicy', 'architecture'):
            self.architecture = cfg.get('dqnpolicy', 'architecture')

        self.q_update = 'single'
        if cfg.has_option('dqnpolicy', 'q_update'):
            self.q_update = cfg.get('dqnpolicy', 'q_update')

        self.h1_size = 130
        if cfg.has_option('dqnpolicy', 'h1_size'):
            self.h1_size = cfg.getint('dqnpolicy', 'h1_size')

        self.h2_size = 50
        if cfg.has_option('dqnpolicy', 'h2_size'):
            self.h2_size = cfg.getint('dqnpolicy', 'h2_size')

        self.save_step = 200
        if cfg.has_option('policy', 'save_step'):
            self.save_step = cfg.getint('policy', 'save_step')

        self.importance_sampling = 'soft'
        if cfg.has_option('dqnpolicy', 'importance_sampling'):
            self.importance_sampling = cfg.get('dqnpolicy', 'importance_sampling')

        self.training_frequency = 2
        if cfg.has_option('dqnpolicy', 'training_frequency'):
            self.training_frequency = cfg.getint('dqnpolicy', 'training_frequency')

        # domain specific parameter settings (overrides general policy parameter settings)
        if cfg.has_option('dqnpolicy_' + domainString, 'n_in'):
            self.n_in = cfg.getint('dqnpolicy_' + domainString, 'n_in')

        if cfg.has_option('dqnpolicy_' + domainString, 'actor_lr'):
            self.actor_lr = cfg.getfloat('dqnpolicy_' + domainString, 'actor_lr')

        if cfg.has_option('dqnpolicy_' + domainString, 'critic_lr'):
            self.critic_lr = cfg.getfloat('dqnpolicy_' + domainString, 'critic_lr')

        if cfg.has_option('dqnpolicy_' + domainString, 'tau'):
            self.tau = cfg.getfloat('dqnpolicy_' + domainString, 'tau')

        if cfg.has_option('dqnpolicy_' + domainString, 'gamma'):
            self.gamma = cfg.getfloat('dqnpolicy_' + domainString, 'gamma')

        if cfg.has_option('dqnpolicy_' + domainString, 'regularisation'):
            self.regularisation = cfg.get('dqnpolicy_' + domainString, 'regularisation')

        if cfg.has_option('dqnpolicy_' + domainString, 'learning_rate'):
            self.learning_rate = cfg.getfloat('dqnpolicy_' + domainString, 'learning_rate')

        if cfg.has_option('dqnpolicy_' + domainString, 'exploration_type'):
            self.exploration_type = cfg.get('dqnpolicy_' + domainString, 'exploration_type')

        if cfg.has_option('dqnpolicy_' + domainString, 'episodeNum'):
            self.episodeNum = cfg.getfloat('dqnpolicy_' + domainString, 'episodeNum')

        if cfg.has_option('dqnpolicy_' + domainString, 'maxiter'):
            self.maxiter = cfg.getfloat('dqnpolicy_' + domainString, 'maxiter')

        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon'):
            self.epsilon = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon')

        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_start'):
            self.epsilon_start = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_start')

        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_end'):
            self.epsilon_end = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_end')

        if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_start'):
            self.priorProbStart = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_start')

        if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_end'):
            self.priorProbEnd = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_end')

        if cfg.has_option('dqnpolicy_' + domainString, 'features'):
            logger.info('Features: ' + str(cfg.get('dqnpolicy_' + domainString, 'features')))
            self.policyfeatures = json.loads(cfg.get('dqnpolicy_' + domainString, 'features'))

        if cfg.has_option('dqnpolicy_' + domainString, 'max_k'):
            self.max_k = cfg.getint('dqnpolicy_' + domainString, 'max_k')

        if cfg.has_option('dqnpolicy_' + domainString, 'learning_algorithm'):
            self.learning_algorithm = cfg.get('dqnpolicy_' + domainString, 'learning_algorithm')
            logger.info('Learning algorithm: ' + self.learning_algorithm)

        if cfg.has_option('dqnpolicy_' + domainString, 'minibatch_size'):
            self.minibatch_size = cfg.getint('dqnpolicy_' + domainString, 'minibatch_size')

        if cfg.has_option('dqnpolicy_' + domainString, 'capacity'):
            self.capacity = cfg.getint('dqnpolicy_' + domainString, 'capacity')

        if cfg.has_option('dqnpolicy_' + domainString, 'replay_type'):
            self.replay_type = cfg.get('dqnpolicy_' + domainString, 'replay_type')

        if cfg.has_option('dqnpolicy_' + domainString, 'architecture'):
            self.architecture = cfg.get('dqnpolicy_' + domainString, 'architecture')

        if cfg.has_option('dqnpolicy_' + domainString, 'q_update'):
            self.q_update = cfg.get('dqnpolicy_' + domainString, 'q_update')

        if cfg.has_option('dqnpolicy_' + domainString, 'h1_size'):
            self.h1_size = cfg.getint('dqnpolicy_' + domainString, 'h1_size')

        if cfg.has_option('dqnpolicy_' + domainString, 'h2_size'):
            self.h2_size = cfg.getint('dqnpolicy_' + domainString, 'h2_size')

        if cfg.has_option('policy_' + domainString, 'save_step'):
            self.save_step = cfg.getint('policy_' + domainString, 'save_step')

        if cfg.has_option('dqnpolicy_' + domainString, 'importance_sampling'):
            self.importance_sampling = cfg.get('dqnpolicy_' + domainString, 'importance_sampling')

        if cfg.has_option('dqnpolicy_' + domainString, 'training_frequency'):
            self.training_frequency = cfg.getint('dqnpolicy_' + domainString, 'training_frequency')

        self.natural_gradient_prev = 0.

        """
        self.shuffle = False
        if cfg.has_option('dqnpolicy_'+domainString, 'experience_replay'):
            self.shuffle = cfg.getboolean('dqnpolicy_'+domainString, 'experience_replay')
        if not self.shuffle:
            # If we don't use experience replay, we don't need to maintain
            # sliding window of experiences with maximum capacity.
            # We only need to maintain the data of minibatch_size
            self.capacity = self.minibatch_size
        """

        self.episode_ave_max_q = []
        self.mu_prob = 0.  # behavioral policy

        os.environ["CUDA_VISIBLE_DEVICES"]=""

        # init session
        self.sess = tf.Session()
        with tf.device("/cpu:0"):

            np.random.seed(self.randomseed)
            tf.set_random_seed(self.randomseed)

            # initialise a replay buffer
            if self.replay_type == 'vanilla':
                self.episodes[self.domainString] = ReplayBufferEpisode(self.capacity, self.minibatch_size, self.randomseed)
            elif self.replay_type == 'prioritized':
                self.episodes[self.domainString] = ReplayPrioritisedEpisode(self.capacity, self.minibatch_size, self.randomseed)
            #replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
            #self.episodes = []
            self.samplecount = 0
            self.episodecount = 0

            # construct the models
            self.state_dim = self.n_in
            self.summaryaction = SummaryAction.SummaryAction(domainString)
            self.action_dim = len(self.summaryaction.action_names)
            action_bound = len(self.summaryaction.action_names)
            self.stats = [0 for _ in range(self.action_dim)]

            self.enac = enac.ENACNetwork(self.sess, self.state_dim, self.action_dim, \
                self.critic_lr, self.tau, action_bound, self.architecture, self.h1_size, self.h2_size, self.is_training)
            
            # when all models are defined, init all variables
            init_op = tf.global_variables_initializer()
            self.sess.run(init_op)

            self.loadPolicy(self.in_policy_file)
            print 'loaded replay size: ', self.episodes[self.domainString].size()
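
The constructor above only reads the e-greedy exploration settings (epsilon_start, epsilon_end, maxiter); how they are consumed is not shown here. A common scheme, sketched below as an assumption rather than the actual ENACPolicy logic, is to anneal epsilon linearly over training and to pick a random action with probability epsilon:

import numpy as np

def annealed_epsilon(episodecount, epsilon_start, epsilon_end, maxiter):
    # linear interpolation from epsilon_start to epsilon_end, clipped after maxiter episodes
    frac = min(float(episodecount) / float(maxiter), 1.0)
    return epsilon_start + frac * (epsilon_end - epsilon_start)

def egreedy_action(q_values, epsilon, rng=np.random):
    if rng.rand() < epsilon:
        return rng.randint(len(q_values))   # explore: uniform random action
    return int(np.argmax(q_values))         # exploit: greedy action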
Example #3
File: HackRBPolicy.py  Project: WowCZ/strac
    def __init__(self, in_policy_file, out_policy_file, ontology, cfg, logger, SetObj, domainString='CamRestaurants', is_training=False):
        super(RBDQNPolicy, self).__init__(domainString, ontology, cfg, logger, SetObj, is_training)

        # tf.reset_default_graph()

        self.domainString = domainString
        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString, cfg, ontology.OntologyUtils, SetObj)
        self.in_policy_file = in_policy_file
        self.out_policy_file = out_policy_file
        self.is_training = is_training
        self.accum_belief = []

        self.prev_state_check = None
        self.ontology = ontology
        self.logger = logger
        self.SetObj = SetObj
        self.atoms = 21
        self.vmin = -1
        self.vmax = 1
        self.support = np.linspace(self.vmin, self.vmax, self.atoms)
        self.delta_z = float(self.vmax - self.vmin) / (self.atoms - 1)

        # parameter settings
        if 0:  # cfg.has_option('dqnpolicy', 'n_in')  # ic304: this was giving a weird error; disabled until it can be checked more deeply
            self.n_in = cfg.getint('dqnpolicy', 'n_in')
        else:
            self.n_in = self.get_n_in(domainString)

        self.learning_rate = 0.001
        if cfg.has_option('dqnpolicy', 'learning_rate'):
            self.learning_rate = cfg.getfloat('dqnpolicy', 'learning_rate')

        self.tau = 0.001
        if cfg.has_option('dqnpolicy', 'tau'):
            self.tau = cfg.getfloat('dqnpolicy', 'tau')

        self.randomseed = 1234
        if cfg.has_option('GENERAL', 'seed'):
            self.randomseed = cfg.getint('GENERAL', 'seed')

        self.gamma = 1.0
        if cfg.has_option('dqnpolicy', 'gamma'):
            self.gamma = cfg.getfloat('dqnpolicy', 'gamma')

        self.regularisation = 'l2'
        if cfg.has_option('dqnpolicy', 'regularisation'):
            self.regularisation = cfg.get('dqnpolicy', 'regularisation')

        self.exploration_type = 'e-greedy'  # alternative: 'Boltzmann'
        if cfg.has_option('dqnpolicy', 'exploration_type'):
            self.exploration_type = cfg.get('dqnpolicy', 'exploration_type')

        self.episodeNum = 1000
        if cfg.has_option('dqnpolicy', 'episodeNum'):
            self.episodeNum = cfg.getfloat('dqnpolicy', 'episodeNum')

        self.maxiter = 5000
        if cfg.has_option('dqnpolicy', 'maxiter'):
            self.maxiter = cfg.getfloat('dqnpolicy', 'maxiter')

        self.epsilon = 0.0
        # if cfg.has_option('dqnpolicy', 'epsilon'):
        #     self.epsilon = cfg.getfloat('dqnpolicy', 'epsilon')

        self.epsilon_start = 0.0
        # if cfg.has_option('dqnpolicy', 'epsilon_start'):
        #     self.epsilon_start = cfg.getfloat('dqnpolicy', 'epsilon_start')

        self.epsilon_end = 0.0
        if cfg.has_option('dqnpolicy', 'epsilon_end'):
            self.epsilon_end = cfg.getfloat('dqnpolicy', 'epsilon_end')

        self.save_step = 100
        if cfg.has_option('policy', 'save_step'):
            self.save_step = cfg.getint('policy', 'save_step')

        self.priorProbStart = 1.0
        if cfg.has_option('dqnpolicy', 'prior_sample_prob_start'):
            self.priorProbStart = cfg.getfloat('dqnpolicy', 'prior_sample_prob_start')

        self.priorProbEnd = 0.1
        if cfg.has_option('dqnpolicy', 'prior_sample_prob_end'):
            self.priorProbEnd = cfg.getfloat('dqnpolicy', 'prior_sample_prob_end')

        self.policyfeatures = []
        if cfg.has_option('dqnpolicy', 'features'):
            self.logger.info('Features: ' + str(cfg.get('dqnpolicy', 'features')))
            self.policyfeatures = json.loads(cfg.get('dqnpolicy', 'features'))

        self.max_k = 5
        if cfg.has_option('dqnpolicy', 'max_k'):
            self.max_k = cfg.getint('dqnpolicy', 'max_k')

        self.learning_algorithm = 'drl'
        if cfg.has_option('dqnpolicy', 'learning_algorithm'):
            self.learning_algorithm = cfg.get('dqnpolicy', 'learning_algorithm')
            self.logger.info('Learning algorithm: ' + self.learning_algorithm)

        self.minibatch_size = 32
        if cfg.has_option('dqnpolicy', 'minibatch_size'):
            self.minibatch_size = cfg.getint('dqnpolicy', 'minibatch_size')

        self.capacity = 1000  # max(self.minibatch_size, 2000)
        if cfg.has_option('dqnpolicy', 'capacity'):
            self.capacity = max(cfg.getint('dqnpolicy', 'capacity'), 2000)

        self.replay_type = 'prioritized'
        if cfg.has_option('dqnpolicy', 'replay_type'):
            self.replay_type = cfg.get('dqnpolicy', 'replay_type')

        self.architecture = 'vanilla'
        if cfg.has_option('dqnpolicy', 'architecture'):
            self.architecture = cfg.get('dqnpolicy', 'architecture')

        self.q_update = 'double'
        if cfg.has_option('dqnpolicy', 'q_update'):
            self.q_update = cfg.get('dqnpolicy', 'q_update')

        self.h1_size = 130
        if cfg.has_option('dqnpolicy', 'h1_size'):
            self.h1_size = cfg.getint('dqnpolicy', 'h1_size')

        self.h1_drop = None
        if cfg.has_option('dqnpolicy', 'h1_drop'):
            self.h1_drop = cfg.getfloat('dqnpolicy', 'h1_drop')

        self.h2_size = 130
        if cfg.has_option('dqnpolicy', 'h2_size'):
            self.h2_size = cfg.getint('dqnpolicy', 'h2_size')

        self.h2_drop = None
        if cfg.has_option('dqnpolicy', 'h2_drop'):
            self.h2_drop = cfg.getfloat('dqnpolicy', 'h2_drop')

        self.nature_mode = None
        if cfg.has_option('dqnpolicy', 'nature_mode'):
            self.nature_mode = cfg.getboolean('dqnpolicy', 'nature_mode')

        self.madqn_hidden_layers = None
        if cfg.has_option('dqnpolicy', 'madqn_hidden_layers'):
            self.madqn_hidden_layers = cfg.getint('dqnpolicy', 'madqn_hidden_layers')

        self.madqn_local_hidden_units = None
        if cfg.has_option('dqnpolicy', 'madqn_local_hidden_units'):
            self.madqn_local_hidden_units = cfg.get('dqnpolicy', 'madqn_local_hidden_units')
            self.madqn_local_hidden_units = eval(self.madqn_local_hidden_units)

        self.madqn_local_dropouts = None
        if cfg.has_option('dqnpolicy', 'madqn_local_dropouts'):
            self.madqn_local_dropouts = cfg.get('dqnpolicy', 'madqn_local_dropouts')
            self.madqn_local_dropouts = eval(self.madqn_local_dropouts)

        self.madqn_global_hidden_units = None
        if cfg.has_option('dqnpolicy', 'madqn_global_hidden_units'):
            self.madqn_global_hidden_units = cfg.get('dqnpolicy', 'madqn_global_hidden_units')
            self.madqn_global_hidden_units = eval(self.madqn_global_hidden_units)

        self.madqn_global_dropouts = None
        if cfg.has_option('dqnpolicy', 'madqn_global_dropouts'):
            self.madqn_global_dropouts = cfg.get('dqnpolicy', 'madqn_global_dropouts')
            self.madqn_global_dropouts = eval(self.madqn_global_dropouts)

        self.madqn_private_rate = None
        if cfg.has_option('dqnpolicy', 'madqn_private_rate'):
            self.madqn_private_rate = cfg.getfloat('dqnpolicy', 'madqn_private_rate')

        self.madqn_sort_input_vec = False
        if cfg.has_option('dqnpolicy', 'madqn_sort_input_vec'):
            self.madqn_sort_input_vec = cfg.getboolean('dqnpolicy', 'madqn_sort_input_vec')

        self.madqn_share_last_layer = False
        if cfg.has_option('dqnpolicy', 'madqn_share_last_layer'):
            self.madqn_share_last_layer = cfg.getboolean('dqnpolicy', 'madqn_share_last_layer')

        self.madqn_shared_last_layer_use_bias = True
        if cfg.has_option('dqnpolicy', 'madqn_shared_last_layer_use_bias'):
            self.madqn_shared_last_layer_use_bias = cfg.getboolean('dqnpolicy', 'madqn_shared_last_layer_use_bias')

        self.madqn_recurrent_mode = False
        if cfg.has_option('dqnpolicy', 'madqn_recurrent_mode'):
            self.madqn_recurrent_mode = cfg.getboolean('dqnpolicy', 'madqn_recurrent_mode')

        self.madqn_input_comm = True
        if cfg.has_option('dqnpolicy', 'madqn_input_comm'):
            self.madqn_input_comm = cfg.getboolean('dqnpolicy', 'madqn_input_comm')

        self.madqn_target_explore = False
        if cfg.has_option('dqnpolicy', 'madqn_target_explore'):
            self.madqn_target_explore = cfg.getboolean('dqnpolicy', 'madqn_target_explore')

        self.madqn_concrete_share_rate = False
        if cfg.has_option('dqnpolicy', 'madqn_concrete_share_rate'):
            self.madqn_concrete_share_rate = cfg.getboolean('dqnpolicy', 'madqn_concrete_share_rate')

        self.madqn_dropout_regularizer = 0.
        if cfg.has_option('dqnpolicy', 'madqn_dropout_regularizer'):
            self.madqn_dropout_regularizer = cfg.getfloat('dqnpolicy', 'madqn_dropout_regularizer')

        self.madqn_weight_regularizer = 0.
        if cfg.has_option('dqnpolicy', 'madqn_weight_regularizer'):
            self.madqn_weight_regularizer = cfg.getfloat('dqnpolicy', 'madqn_weight_regularizer')

        self.madqn_non_local_mode = False
        if cfg.has_option('dqnpolicy', 'madqn_non_local_mode'):
            self.madqn_non_local_mode = cfg.getboolean('dqnpolicy', 'madqn_non_local_mode')

        self.madqn_block_mode = False
        if cfg.has_option('dqnpolicy', 'madqn_block_mode'):
            self.madqn_block_mode = cfg.getboolean('dqnpolicy', 'madqn_block_mode')

        self.madqn_slots_comm = True
        if cfg.has_option('dqnpolicy', 'madqn_slots_comm'):
            self.madqn_slots_comm = cfg.getboolean('dqnpolicy', 'madqn_slots_comm')

        self.madqn_use_dueling = False
        if cfg.has_option('dqnpolicy', 'madqn_use_dueling'):
            self.madqn_use_dueling = cfg.getboolean('dqnpolicy', 'madqn_use_dueling')

        self.madqn_topo_learning_mode = False
        if cfg.has_option('dqnpolicy', 'madqn_topo_learning_mode'):
            self.madqn_topo_learning_mode = cfg.getboolean('dqnpolicy', 'madqn_topo_learning_mode')

        self.madqn_message_embedding = False
        if cfg.has_option('dqnpolicy', 'madqn_message_embedding'):
            self.madqn_message_embedding = cfg.getboolean('dqnpolicy', 'madqn_message_embedding')

        self.madqn_dueling_share_last = False
        if cfg.has_option('dqnpolicy', 'madqn_dueling_share_last'):
            self.madqn_dueling_share_last = cfg.getboolean('dqnpolicy', 'madqn_dueling_share_last')

        self.state_feature = 'vanilla'
        if cfg.has_option('dqnpolicy', 'state_feature'):
            self.state_feature = cfg.get('dqnpolicy', 'state_feature')

        self.init_policy = None
        if cfg.has_option('dqnpolicy', 'init_policy'):
            self.init_policy = cfg.get('dqnpolicy', 'init_policy')

        self.training_frequency = 2
        if cfg.has_option('dqnpolicy', 'training_frequency'):
            self.training_frequency = cfg.getint('dqnpolicy', 'training_frequency')

        # domain specific parameter settings (overrides general policy parameter settings)
        if cfg.has_option('dqnpolicy_' + domainString, 'n_in'):
            self.n_in = cfg.getint('dqnpolicy_' + domainString, 'n_in')

        if cfg.has_option('dqnpolicy_' + domainString, 'learning_rate'):
            self.learning_rate = cfg.getfloat('dqnpolicy_' + domainString, 'learning_rate')

        if cfg.has_option('dqnpolicy_' + domainString, 'tau'):
            self.tau = cfg.getfloat('dqnpolicy_' + domainString, 'tau')

        if cfg.has_option('dqnpolicy_' + domainString, 'gamma'):
            self.gamma = cfg.getfloat('dqnpolicy_' + domainString, 'gamma')

        if cfg.has_option('dqnpolicy_' + domainString, 'regularisation'):
            self.regularisation = cfg.get('dqnpolicy_' + domainString, 'regularisation')

        if cfg.has_option('dqnpolicy_' + domainString, 'exploration_type'):
            self.exploration_type = cfg.get('dqnpolicy_' + domainString, 'exploration_type')

        if cfg.has_option('dqnpolicy_' + domainString, 'episodeNum'):
            self.episodeNum = cfg.getfloat('dqnpolicy_' + domainString, 'episodeNum')

        if cfg.has_option('dqnpolicy_' + domainString, 'maxiter'):
            self.maxiter = cfg.getfloat('dqnpolicy_' + domainString, 'maxiter')

        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon'):
            self.epsilon = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon')

        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_start'):
            self.epsilon_start = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_start')

        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_end'):
            self.epsilon_end = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_end')

        if cfg.has_option('policy_' + domainString, 'save_step'):
            self.save_step = cfg.getint('policy_' + domainString, 'save_step')

        if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_start'):
            self.priorProbStart = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_start')

        if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_end'):
            self.priorProbEnd = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_end')

        if cfg.has_option('dqnpolicy_' + domainString, 'features'):
            self.logger.info('Features: ' + str(cfg.get('dqnpolicy_' + domainString, 'features')))
            self.policyfeatures = json.loads(cfg.get('dqnpolicy_' + domainString, 'features'))

        if cfg.has_option('dqnpolicy_' + domainString, 'max_k'):
            self.max_k = cfg.getint('dqnpolicy_' + domainString, 'max_k')

        if cfg.has_option('dqnpolicy_' + domainString, 'learning_algorithm'):
            self.learning_algorithm = cfg.get('dqnpolicy_' + domainString, 'learning_algorithm')
            self.logger.info('Learning algorithm: ' + self.learning_algorithm)

        if cfg.has_option('dqnpolicy_' + domainString, 'minibatch_size'):
            self.minibatch_size = cfg.getint('dqnpolicy_' + domainString, 'minibatch_size')

        if cfg.has_option('dqnpolicy_' + domainString, 'capacity'):
            self.capacity = max(cfg.getint('dqnpolicy_' + domainString, 'capacity'), 2000)

        if cfg.has_option('dqnpolicy_' + domainString, 'replay_type'):
            self.replay_type = cfg.get('dqnpolicy_' + domainString, 'replay_type')

        if cfg.has_option('dqnpolicy_' + domainString, 'architecture'):
            self.architecture = cfg.get('dqnpolicy_' + domainString, 'architecture')

        if cfg.has_option('dqnpolicy_' + domainString, 'q_update'):
            self.q_update = cfg.get('dqnpolicy_' + domainString, 'q_update')

        if cfg.has_option('dqnpolicy_' + domainString, 'h1_size'):
            self.h1_size = cfg.getint('dqnpolicy_' + domainString, 'h1_size')

        if cfg.has_option('dqnpolicy_' + domainString, 'h1_drop'):
            self.h1_drop = cfg.getfloat('dqnpolicy_' + domainString, 'h1_drop')

        if cfg.has_option('dqnpolicy_' + domainString, 'h2_size'):
            self.h2_size = cfg.getint('dqnpolicy_' + domainString, 'h2_size')

        if cfg.has_option('dqnpolicy_' + domainString, 'h2_drop'):
            self.h2_drop = cfg.getfloat('dqnpolicy_' + domainString, 'h2_drop')


        if cfg.has_option('dqnpolicy_' + domainString, 'training_frequency'):
            self.training_frequency = cfg.getint('dqnpolicy_' + domainString, 'training_frequency')

        """
        self.shuffle = False
        if cfg.has_option('dqnpolicy_'+domainString, 'experience_replay'):
            self.shuffle = cfg.getboolean('dqnpolicy_'+domainString, 'experience_replay')
        if not self.shuffle:
            # If we don't use experience replay, we don't need to maintain
            # sliding window of experiences with maximum capacity.
            # We only need to maintain the data of minibatch_size
            self.capacity = self.minibatch_size
        """

        self.episode_ave_max_q = []

        os.environ["CUDA_VISIBLE_DEVICES"] = ""

        # init session
        # self.sess = tf.Session()
        # with tf.device("/cpu:0"):

        np.random.seed(self.randomseed)
        # tf.set_random_seed(self.randomseed)

        # initialise a replay buffer
        if self.replay_type == 'vanilla':
            self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size, self.randomseed)
        elif self.replay_type == 'prioritized':
            self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size,
                                                                 self.randomseed)
        self.samplecount = 0
        self.episodecount = 0

        # construct the models
        self.state_dim = self.n_in
        self.summaryaction = SummaryAction.SummaryAction(domainString, self.ontology, self.SetObj)
        self.action_dim = len(self.summaryaction.action_names)
        action_bound = len(self.summaryaction.action_names)
        self.stats = [0 for _ in range(self.action_dim)]

        import tube
        self.dqn = dqn.DeepRBQNetwork(self.state_dim, self.action_dim, self.atoms, \
                                    self.learning_rate, self.tau, action_bound, self.minibatch_size,
                                    self.architecture, self.h1_size, self.h1_drop,
                                    self.h2_size, self.h2_drop, self.domainString,
                                    self.madqn_hidden_layers,
                                    self.madqn_local_hidden_units, self.madqn_local_dropouts,
                                    self.madqn_global_hidden_units, self.madqn_global_dropouts,
                                    self.madqn_private_rate, self.madqn_sort_input_vec,
                                    self.madqn_share_last_layer, self.madqn_recurrent_mode,
                                    self.madqn_input_comm, self.madqn_target_explore,
                                    concrete_share_rate=self.madqn_concrete_share_rate,
                                    dropout_regularizer=self.madqn_dropout_regularizer,
                                    weight_regularizer=self.madqn_weight_regularizer,
                                    non_local_mode=self.madqn_non_local_mode,
                                    block_mode=self.madqn_block_mode,
                                    slots_comm=self.madqn_slots_comm,
                                    topo_learning_mode=self.madqn_topo_learning_mode,
                                    use_dueling=self.madqn_use_dueling,
                                    dueling_share_last=self.madqn_dueling_share_last,
                                    message_embedding=self.madqn_message_embedding,
                                    state_feature=self.state_feature,
                                    init_policy=self.init_policy,
                                    shared_last_layer_use_bias=self.madqn_shared_last_layer_use_bias,
                                    seed=tube.seed)

        # when all models are defined, init all variables
        # init_op = tf.global_variables_initializer()
        # self.sess.run(init_op)

        lock.acquire()
        self.loadPolicy(self.in_policy_file)
        lock.release()
        print('###################################################')
        print(self.domainString + ' loaded replay size: ' + str(self.episodes[self.domainString].size()))

        # globalEpisodeCount = copy.deepcopy(Settings.get_count())
        # if globalEpisodeCount != 0:
        lock.acquire()
        # self.dqn.update_target_network()
        self._savePolicyInc()
        lock.release()

        Settings.load_policy(self.dqn, threading.currentThread().getName())
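
RBDQNPolicy above sets up a fixed categorical support for a distributional Q-network (atoms, vmin, vmax, support, delta_z). As a rough sketch of how such a support is normally used: the per-action network output is a probability distribution over the support, and the scalar Q-value used for action selection is its expectation. The shapes below are illustrative assumptions, not the DeepRBQNetwork API:

import numpy as np

atoms, vmin, vmax = 21, -1, 1
support = np.linspace(vmin, vmax, atoms)      # fixed return values, shape (atoms,)

def expected_q(action_probs):
    # action_probs: shape (num_actions, atoms), each row a distribution summing to 1
    return action_probs.dot(support)          # scalar Q-value per action

# greedy action under the distributional estimate:
# best_action = int(np.argmax(expected_q(action_probs)))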
Example #4
    def __init__(self,
                 in_policy_file,
                 out_policy_file,
                 domainString='CamRestaurants',
                 is_training=False):
        super(FeudalPolicy, self).__init__(domainString, is_training)

        self.domainString = domainString
        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
        self.in_policy_file = in_policy_file
        self.out_policy_file = out_policy_file
        self.is_training = is_training
        self.prev_state_check = None

        #feudalRL variables
        self.prev_sub_policy = None
        self.prev_master_act = None
        self.prev_master_belief = None
        self.prev_child_act = None
        self.prev_child_belief = None

        self.action_freq = np.zeros(len(self.actions.action_names))

        self.master_dec_count = np.array([0., 0.])
        self.gi_dec_inrow = 0

        self.features = 'dip'
        if cfg.has_option('feudalpolicy', 'features'):
            self.features = cfg.get('feudalpolicy', 'features')
        self.si_policy_type = 'dqn'
        if cfg.has_option('feudalpolicy', 'si_policy_type'):
            self.si_policy_type = cfg.get('feudalpolicy', 'si_policy_type')
        self.sd_policy_type = 'dqn'
        if cfg.has_option('feudalpolicy', 'sd_policy_type'):
            self.sd_policy_type = cfg.get('feudalpolicy', 'sd_policy_type')
        self.master_policy_type = self.si_policy_type
        if cfg.has_option('feudalpolicy', 'master_policy_type'):
            self.master_policy_type = cfg.get('feudalpolicy',
                                              'master_policy_type')
        self.sample_master = False
        if cfg.has_option('feudalpolicy', 'sample_master'):
            self.sample_master = cfg.getboolean('feudalpolicy',
                                                'sample_master')
        self.correct_master = False
        if cfg.has_option('feudalpolicy', 'correct_master'):
            self.correct_master = cfg.getboolean('feudalpolicy',
                                                 'correct_master')
        self.use_bye = False
        if cfg.has_option('feudalpolicy', 'use_bye'):
            self.use_bye = cfg.getboolean('feudalpolicy', 'use_bye')
        self.reqmore_in_si = True
        if cfg.has_option('feudalpolicy', 'reqmore_in_si'):
            self.reqmore_in_si = cfg.getboolean('feudalpolicy',
                                                'reqmore_in_si')
        self.correction_factor = 0
        if cfg.has_option('feudalpolicy', 'correction_factor'):
            self.correction_factor = cfg.getfloat('feudalpolicy',
                                                  'correction_factor')
        self.actfreq_ds = False
        if cfg.has_option('feudalpolicy', 'actfreq_ds'):
            self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds')

        # parameter settings

        self.randomseed = 1234
        if cfg.has_option('GENERAL', 'seed'):
            self.randomseed = cfg.getint('GENERAL', 'seed')

        # Create the feudal structure (including feudal masks)

        self.summaryaction = SummaryAction.SummaryAction(domainString)
        self.full_action_list = self.summaryaction.action_names

        self.master_actions = ['give_info', 'request_info', 'pass']

        self.slot_independent_actions = [
            "inform", "inform_byname", "inform_alternatives"
        ]
        if self.reqmore_in_si:
            self.slot_independent_actions.append("reqmore")
        if self.use_bye:
            self.slot_independent_actions.append('bye')
        self.slot_independent_actions.append('pass')

        self.slot_specific_actions = ["request", "confirm", "select"]
        #if self.reqmore_in_sd is True:
        #    self.slot_specific_actions.append("reqmore")
        self.slot_specific_actions.append('pass')

        self.master_freq = np.zeros(len(self.master_actions))
        self.si_freq = np.zeros(len(self.slot_independent_actions))
        self.sd_freq = np.zeros(len(self.slot_specific_actions))

        # master policy
        if self.master_policy_type == 'acer':
            self.master_policy = FeudalACERPolicy(
                self._modify_policyfile('master', in_policy_file),
                self._modify_policyfile('master', out_policy_file),
                domainString=self.domainString,
                is_training=self.is_training,
                action_names=['give_info', 'request_info', 'pass'],
                slot='si'
            )  # pass is always masked, but it's needed for implementation
        elif self.master_policy_type == 'enac':
            self.master_policy = FeudalENACPolicy(
                self._modify_policyfile('master', in_policy_file),
                self._modify_policyfile('master', out_policy_file),
                domainString=self.domainString,
                is_training=self.is_training,
                action_names=['give_info', 'request_info', 'pass'],
                slot='si'
            )  # pass is always masked, but it's needed for implementation
        elif self.master_policy_type == 'bbqn':
            self.master_policy = FeudalBBQNPolicy(
                self._modify_policyfile('master', in_policy_file),
                self._modify_policyfile('master', out_policy_file),
                domainString=self.domainString,
                is_training=self.is_training,
                action_names=['give_info', 'request_info', 'pass'],
                slot='si'
            )  # pass is always masked, but it's needed for implementation
        else:
            self.master_policy = FeudalDQNPolicy(
                self._modify_policyfile('master', in_policy_file),
                self._modify_policyfile('master', out_policy_file),
                domainString=self.domainString,
                is_training=self.is_training,
                action_names=['give_info', 'request_info', 'pass'],
                slot='si'
            )  # pass is always masked, but it's needed for implementation
        # si policy
        if self.si_policy_type == 'acer':
            self.give_info_policy = FeudalACERPolicy(
                self._modify_policyfile('gi', in_policy_file),
                self._modify_policyfile('gi', out_policy_file),
                domainString=self.domainString,
                is_training=self.is_training,
                action_names=self.slot_independent_actions,
                slot='si')
        elif self.si_policy_type == 'enac':
            self.give_info_policy = FeudalENACPolicy(
                self._modify_policyfile('gi', in_policy_file),
                self._modify_policyfile('gi', out_policy_file),
                domainString=self.domainString,
                is_training=self.is_training,
                action_names=self.slot_independent_actions,
                slot='si')
        elif self.si_policy_type == 'bbqn':
            self.give_info_policy = FeudalBBQNPolicy(
                self._modify_policyfile('gi', in_policy_file),
                self._modify_policyfile('gi', out_policy_file),
                domainString=self.domainString,
                is_training=self.is_training,
                action_names=self.slot_independent_actions,
                slot='si')
        else:
            self.give_info_policy = FeudalDQNPolicy(
                self._modify_policyfile('gi', in_policy_file),
                self._modify_policyfile('gi', out_policy_file),
                domainString=self.domainString,
                is_training=self.is_training,
                action_names=self.slot_independent_actions,
                slot='si')

        # sd policies
        if self.sd_policy_type == 'acer':
            self.request_info_policy = FeudalACERPolicy(
                self._modify_policyfile('ri', in_policy_file),
                self._modify_policyfile('ri', out_policy_file),
                domainString=self.domainString,
                is_training=self.is_training,
                action_names=self.slot_specific_actions,
                slot='sd')
        elif self.sd_policy_type == 'bbqn':
            self.request_info_policy = FeudalBBQNPolicy(
                self._modify_policyfile('ri', in_policy_file),
                self._modify_policyfile('ri', out_policy_file),
                domainString=self.domainString,
                is_training=self.is_training,
                action_names=self.slot_specific_actions,
                slot='sd')
        else:
            self.request_info_policy = FeudalDQNPolicy(
                self._modify_policyfile('ri', in_policy_file),
                self._modify_policyfile('ri', out_policy_file),
                domainString=self.domainString,
                is_training=self.is_training,
                action_names=self.slot_specific_actions,
                slot='sd')
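
The constructor above wires up a two-level (feudal) decision structure: a master policy first chooses between 'give_info' and 'request_info', and the matching sub-policy then chooses the concrete summary action ('pass' is always masked). A rough sketch of that dispatch, using an assumed nextAction(belief) interface rather than the real FeudalPolicy internals:

def feudal_act(master_policy, give_info_policy, request_info_policy, belief):
    master_act = master_policy.nextAction(belief)      # 'give_info' or 'request_info'
    if master_act == 'give_info':
        return give_info_policy.nextAction(belief)     # slot-independent action
    return request_info_policy.nextAction(belief)      # slot-specific action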
Example #5
    def __init__(self, agent_id='Smith', hub_id='dialogueserver'):

        # Define all variables in __init__:
        self.prompt_str = None
        self.reward = None
        self.currentTurn = None
        self.maxTurns = None
        self.ENDING_DIALOG = None
        self.SUBJECTIVE_RETRIEVAL_ATTEMPS = None
        self.TASK_RETRIEVAL_ATTEMPTS = None
        self.constraints = None
        self.task = None
        self.taskId = None
        self.subjective = None
        self.session_id = None
        self.callValidator = CallValidator()
        self.prev_state = None
        self.prev_statexx = None
        self.predloss = None

        self.action_names = []  # hardcoded to include slots for specific actions (request, confirm, select)
        self.action_names += ["request(food)", "request(area)", "request(pricerange)",
                              "confirm(food)", "confirm(area)", "confirm(pricerange)",
                              "select(food)", "select(area)", "select(pricerange)",
                              "inform",
                              "inform_byname",
                              "inform_alternatives",
                              "bye",
                              "repeat",
                              "reqmore",
                              "restart"]

        # DEFAULTS:
        # meta params - note these define the 'state' of the dialogue, along with those defined in restart_agent()
        assert(hub_id in ['texthub', 'simulate', 'dialogueserver'])
        self.hub_id = hub_id  # defines certain behaviour of the agent. One of [texthub, simulate, dialogueserver]
        self.agent_id = agent_id
        self.NUM_DIALOGS = 0
        self.SYSTEM_CAN_HANGUP = False
        self.SAVE_FREQUENCY = 10   # save the policy after multiples of this many dialogues
        self.MAX_TURNS_PROMPT = "The dialogue has finished due to too many turns"
        self.NO_ASR_MSG = "I am afraid I did not understand. Could you please repeat that."
        self.maxTurns_per_domain = 30
        self.traceDialog = 2
        self.sim_level = 'dial_act'
        self.pre_trg = False

        # CONFIGS:
        if Settings.config.has_option('agent', 'savefrequency'):
            self.SAVE_FREQUENCY = Settings.config.getint('agent', 'savefrequency')
        if Settings.config.has_option("agent","systemcanhangup"):
            self.SYSTEM_CAN_HANGUP = Settings.config.getboolean("agent", "systemcanhangup")
        if Settings.config.has_option("agent", "maxturns"):
            self.maxTurns_per_domain = Settings.config.getint("agent", "maxturns")
        if Settings.config.has_option("GENERAL", "tracedialog"):
            self.traceDialog = Settings.config.getint("GENERAL", "tracedialog")
        if Settings.config.has_option("usermodel", "simlevel"):
            self.sim_level = Settings.config.get("usermodel", "simlevel")

        # TOPIC TRACKING:
        #-----------------------------------------
        self.topic_tracker = TopicTracking.TopicTrackingManager()


        # SemI + Belief tracker
        self.semi_belief_manager = self._load_manger('semanticbelieftrackingmanager', 'semanticbelieftracking.SemanticBeliefTrackingManager.SemanticBeliefTrackingManager')

        # Policy.
        #-----------------------------------------
        self.policy_manager = self._load_manger('policymanager', 'policy.PolicyManager.PolicyManager')

        # SemO.
        #-----------------------------------------
        if self.hub_id == 'simulate':      # may or may not have NLG in simulate (default is not to)
            generate_prompts = False
            if Settings.config.has_option('simulate', 'generateprompts'):
                generate_prompts = Settings.config.getboolean('simulate', 'generateprompts')
        else:
            generate_prompts = True  # default for Texthub and DialogueServer
        if generate_prompts:
            self.semo_manager = self._load_manger('semomanager', 'semo.SemOManager.SemOManager')
        else:
            self.semo_manager = None

        # Evaluation Manager.
        #-----------------------------------------
        self.evaluation_manager = self._load_manger('evaluationmanager', 'evaluation.EvaluationManager.EvaluationManager')

        # Restart components - NB: inefficient - will be called again before 1st dialogue - but enables _logical_requirements()
        self.restart_agent(session_id=None)

        # Finally, enforce some cross module requirements:
        self._logical_requirements()

        self.domainUtil = FlatOnt.FlatDomainOntology(self.topic_tracker.operatingDomain)
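
The constructors in these examples all repeat the same configuration pattern: keep a hard-coded default unless the config file provides an override (has_option followed by the typed getter). A hedged sketch of that pattern as a standalone helper, written against the standard ConfigParser module rather than the project's Settings/cfg objects, looks like this; the config path in the usage line is hypothetical.

try:
    import configparser                  # Python 3
except ImportError:
    import ConfigParser as configparser  # Python 2

def get_with_default(cfg, section, option, default, getter='get'):
    # Mirror the has_option/getter fallback used throughout these policies:
    # return the configured value if present, otherwise the hard-coded default.
    if cfg.has_option(section, option):
        return getattr(cfg, getter)(section, option)
    return default

cfg = configparser.ConfigParser()
cfg.read('config/pydial.cfg')  # hypothetical path; read() ignores missing files
learning_rate = get_with_default(cfg, 'dqnpolicy', 'learning_rate', 0.001, getter='getfloat')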
Example #6
import ontology.FlatOntologyManager as FlatOnt
# from theano_dialogue.util.tool import *

import tensorflow as tf
from DRL.replay_bufferVanilla import ReplayBuffer
from DRL.replay_prioritisedVanilla import ReplayPrioritised
import DRL.utils as drlutils
import DRL.concrete_dqn as dqn
import Policy
import SummaryAction
from Policy import TerminalAction, TerminalState

logger = utils.ContextLogger.getLogger('')

# --- for flattening the belief --- # 
domainUtil = FlatOnt.FlatDomainOntology('CamRestaurants')


def flatten_belief(belief, domainUtil, merge=False):
    belief = belief.getDomainState(domainUtil.domainString)
    if isinstance(belief, TerminalState):
        if domainUtil.domainString == 'CamRestaurants':
            return [0] * 268
        elif domainUtil.domainString == 'CamHotels':
            return [0] * 111
        elif domainUtil.domainString == 'SFRestaurants':
            return [0] * 633
        elif domainUtil.domainString == 'SFHotels':
            return [0] * 438
        elif domainUtil.domainString == 'Laptops11':
            return [0] * 257
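
The TerminalState branch above hard-codes one flattened-belief length per domain in an if/elif chain. A table-driven sketch of the same mapping is shown below; the lengths are copied from the snippet, and extending it to a new domain is left as an assumption.

# Flattened-belief lengths per domain, as used in the TerminalState branch above.
TERMINAL_BELIEF_LEN = {
    'CamRestaurants': 268,
    'CamHotels': 111,
    'SFRestaurants': 633,
    'SFHotels': 438,
    'Laptops11': 257,
}

def terminal_belief_vector(domain_string):
    # Return the all-zero flattened belief used for a terminal dialogue state.
    # Raises KeyError for domains not listed above.
    return [0] * TERMINAL_BELIEF_LEN[domain_string]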
Example #7
    def __init__(self,
                 in_policy_file,
                 out_policy_file,
                 domainString='CamRestaurants',
                 is_training=False,
                 action_names=None):
        super(MORLPolicy, self).__init__(domainString, is_training)

        self.domainString = domainString
        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
        self.in_policy_file = in_policy_file
        self.out_policy_file = out_policy_file
        self.is_training = is_training
        self.accum_belief = []

        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
        self.prev_state_check = None

        # parameter settings
        if 0:  # cfg.has_option('morlpolicy', 'n_in'): #ic304: this was giving me a weird error, disabled it until i can check it deeper
            self.n_in = cfg.getint('morlpolicy', 'n_in')
        else:
            self.n_in = self.get_n_in(domainString)

        self.n_rew = 1
        if cfg.has_option('morlpolicy', 'n_rew'):
            self.n_rew = cfg.getint('morlpolicy', 'n_rew')

        self.lr = 0.001
        if cfg.has_option('morlpolicy', 'learning_rate'):
            self.lr = cfg.getfloat('morlpolicy', 'learning_rate')

        self.epsilon = 0.5
        if cfg.has_option('morlpolicy', 'epsilon'):
            self.epsilon = cfg.getfloat('morlpolicy', 'epsilon')

        self.epsilon_decay = True
        if cfg.has_option('morlpolicy', 'epsilon_decay'):
            self.epsilon_decay = cfg.getboolean('morlpolicy', 'epsilon_decay')

        self.randomseed = 1234
        if cfg.has_option('GENERAL', 'seed'):
            self.randomseed = cfg.getint('GENERAL', 'seed')

        self.gamma = 1.0
        if cfg.has_option('morlpolicy', 'gamma'):
            self.gamma = cfg.getfloat('morlpolicy', 'gamma')

        self.weight_num = 32
        if cfg.has_option('morlpolicy', 'weight_num'):
            self.weight_num = cfg.getint('morlpolicy', 'weight_num')

        self.episode_num = 1000
        if cfg.has_option('morlpolicy', 'episode_num'):
            self.episode_num = cfg.getfloat('morlpolicy', 'episode_num')

        self.optimizer = "Adam"
        if cfg.has_option('morlpolicy', 'optimizer'):
            self.optimizer = cfg.get('morlpolicy', 'optimizer')

        self.save_step = 100
        if cfg.has_option('policy', 'save_step'):
            self.save_step = cfg.getint('policy', 'save_step')

        self.update_freq = 50
        if cfg.has_option('morlpolicy', 'update_freq'):
            self.update_freq = cfg.getint('morlpolicy', 'update_freq')

        self.policyfeatures = []
        if cfg.has_option('morlpolicy', 'features'):
            logger.info('Features: ' + str(cfg.get('morlpolicy', 'features')))
            self.policyfeatures = json.loads(cfg.get('morlpolicy', 'features'))

        self.algorithm = 'naive'
        if cfg.has_option('morlpolicy', 'algorithm'):
            self.algorithm = cfg.get('morlpolicy', 'algorithm')
            logger.info('Learning algorithm: ' + self.algorithm)

        self.batch_size = 32
        if cfg.has_option('morlpolicy', 'batch_size'):
            self.batch_size = cfg.getint('morlpolicy', 'batch_size')

        self.mem_size = 1000
        if cfg.has_option('morlpolicy', 'mem_size'):
            self.mem_size = cfg.getint('morlpolicy', 'mem_size')

        self.training_freq = 1
        if cfg.has_option('morlpolicy', 'training_freq'):
            self.training_freq = cfg.getint('morlpolicy', 'training_freq')

        # set beta for envelope algorithm
        self.beta = 0.1
        if cfg.has_option('morlpolicy', 'beta'):
            self.beta = cfg.getfloat('morlpolicy', 'beta')
        self.beta_init = self.beta
        self.beta_uplim = 1.00
        self.tau = 1000.
        self.beta_expbase = float(
            np.power(self.tau * (self.beta_uplim - self.beta),
                     1. / (self.episode_num + 1)))
        self.beta_delta = self.beta_expbase / self.tau
        self.beta -= self.beta_delta

        # using homotopy method for optimization
        self.homotopy = False
        if cfg.has_option('morlpolicy', 'homotopy'):
            self.homotopy = cfg.getboolean('morlpolicy', 'homotopy')

        self.epsilon_delta = (self.epsilon - 0.05) / self.episode_num

        self.episodecount = 0

        # construct the models
        self.state_dim = self.n_in
        self.summaryaction = SummaryAction.SummaryAction(domainString)
        if action_names is None:
            self.action_names = self.summaryaction.action_names
        else:
            self.action_names = action_names
        self.action_dim = len(self.action_names)
        self.stats = [0 for _ in range(self.action_dim)]
        self.reward_dim = self.n_rew

        model = None
        if self.algorithm == 'naive':
            model = naive.NaiveLinearCQN(self.state_dim, self.action_dim,
                                         self.reward_dim)
        elif self.algorithm == 'envelope':
            model = envelope.EnvelopeLinearCQN(self.state_dim, self.action_dim,
                                               self.reward_dim)

        self.model_ = model
        self.model = copy.deepcopy(model)

        # initialize memory
        self.trans_mem = deque()
        self.trans = namedtuple('trans',
                                ['s', 'a', 's_', 'r', 'd', 'ms', 'ms_'])
        self.priority_mem = deque()
        self.mem_last_state = None
        self.mem_last_action = None
        self.mem_last_mask = None
        self.mem_cur_state = None
        self.mem_cur_action = None
        self.mem_cur_mask = None

        if self.optimizer == 'Adam':
            self.optimizer = optim.Adam(self.model_.parameters(), lr=self.lr)
        elif self.optimizer == 'RMSprop':
            self.optimizer = optim.RMSprop(self.model_.parameters(),
                                           lr=self.lr)

        try:
            self.loadPolicy(self.in_policy_file)
        except Exception:
            logger.info("No previous model found...")

        self.w_kept = None
        self.update_count = 0
        if self.is_training:
            self.model_.train()
        if use_cuda:
            self.model.cuda()
            self.model_.cuda()

        self.monitor = None
Example #8
    def __init__(self,
                 in_policy_file,
                 out_policy_file,
                 domainString='CamRestaurants',
                 is_training=False,
                 action_names=None):
        super(DQNPolicy, self).__init__(domainString, is_training)

        tf.reset_default_graph()

        self.domainString = domainString
        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
        self.in_policy_file = in_policy_file
        self.out_policy_file = out_policy_file
        self.is_training = is_training
        self.accum_belief = []

        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
        self.prev_state_check = None

        # pw: Use turn info for predictions
        # action vector creation
        action_names = [
        ]  # hardcoded to include slots for specific actions (request, confirm, select)
        action_names += [
            "request(food)", "request(area)", "request(pricerange)",
            "confirm(food)", "confirm(area)", "confirm(pricerange)",
            "select(food)", "select(area)", "select(pricerange)", "inform",
            "inform_byname", "inform_alternatives", "bye", "repeat", "reqmore",
            "restart"
        ]
        num_actions = len(action_names)
        self.prev_state = None

        # parameter settings
        if 0:  #cfg.has_option('dqnpolicy', 'n_in'): #ic304: this was giving me a weird error, disabled it until i can check it deeper
            self.n_in = cfg.getint('dqnpolicy', 'n_in')
        else:
            self.n_in = self.get_n_in(domainString)

        self.learning_rate = 0.001
        if cfg.has_option('dqnpolicy', 'learning_rate'):
            self.learning_rate = cfg.getfloat('dqnpolicy', 'learning_rate')

        self.tau = 0.001
        if cfg.has_option('dqnpolicy', 'tau'):
            self.tau = cfg.getfloat('dqnpolicy', 'tau')

        self.randomseed = 1234
        if cfg.has_option('GENERAL', 'seed'):
            self.randomseed = cfg.getint('GENERAL', 'seed')

        self.gamma = 1.0
        if cfg.has_option('dqnpolicy', 'gamma'):
            self.gamma = cfg.getfloat('dqnpolicy', 'gamma')

        self.regularisation = 'l2'
        if cfg.has_option('dqnpolicy', 'regularisation'):
            self.regularisation = cfg.get('dqnpolicy', 'regularisation')

        self.exploration_type = 'e-greedy'  # Boltzman
        if cfg.has_option('dqnpolicy', 'exploration_type'):
            self.exploration_type = cfg.get('dqnpolicy', 'exploration_type')

        self.episodeNum = 1000
        if cfg.has_option('dqnpolicy', 'episodeNum'):
            self.episodeNum = cfg.getfloat('dqnpolicy', 'episodeNum')

        self.maxiter = 5000
        if cfg.has_option('dqnpolicy', 'maxiter'):
            self.maxiter = cfg.getfloat('dqnpolicy', 'maxiter')

        self.epsilon = 1
        if cfg.has_option('dqnpolicy', 'epsilon'):
            self.epsilon = cfg.getfloat('dqnpolicy', 'epsilon')

        self.epsilon_start = 1
        if cfg.has_option('dqnpolicy', 'epsilon_start'):
            self.epsilon_start = cfg.getfloat('dqnpolicy', 'epsilon_start')

        self.epsilon_end = 1
        if cfg.has_option('dqnpolicy', 'epsilon_end'):
            self.epsilon_end = cfg.getfloat('dqnpolicy', 'epsilon_end')

        self.save_step = 100
        if cfg.has_option('policy', 'save_step'):
            self.save_step = cfg.getint('policy', 'save_step')

        self.priorProbStart = 1.0
        if cfg.has_option('dqnpolicy', 'prior_sample_prob_start'):
            self.priorProbStart = cfg.getfloat('dqnpolicy',
                                               'prior_sample_prob_start')

        self.priorProbEnd = 0.1
        if cfg.has_option('dqnpolicy', 'prior_sample_prob_end'):
            self.priorProbEnd = cfg.getfloat('dqnpolicy',
                                             'prior_sample_prob_end')

        self.policyfeatures = []
        if cfg.has_option('dqnpolicy', 'features'):
            logger.info('Features: ' + str(cfg.get('dqnpolicy', 'features')))
            self.policyfeatures = json.loads(cfg.get('dqnpolicy', 'features'))

        self.max_k = 5
        if cfg.has_option('dqnpolicy', 'max_k'):
            self.max_k = cfg.getint('dqnpolicy', 'max_k')

        self.learning_algorithm = 'drl'
        if cfg.has_option('dqnpolicy', 'learning_algorithm'):
            self.learning_algorithm = cfg.get('dqnpolicy',
                                              'learning_algorithm')
            logger.info('Learning algorithm: ' + self.learning_algorithm)

        self.minibatch_size = 32
        if cfg.has_option('dqnpolicy', 'minibatch_size'):
            self.minibatch_size = cfg.getint('dqnpolicy', 'minibatch_size')

        self.capacity = 1000
        if cfg.has_option('dqnpolicy', 'capacity'):
            self.capacity = cfg.getint('dqnpolicy', 'capacity')

        self.replay_type = 'vanilla'
        if cfg.has_option('dqnpolicy', 'replay_type'):
            self.replay_type = cfg.get('dqnpolicy', 'replay_type')

        self.architecture = 'vanilla'
        if cfg.has_option('dqnpolicy', 'architecture'):
            self.architecture = cfg.get('dqnpolicy', 'architecture')
            if self.architecture == 'dip':
                self.architecture = 'dip2'

        self.q_update = 'single'
        if cfg.has_option('dqnpolicy', 'q_update'):
            self.q_update = cfg.get('dqnpolicy', 'q_update')

        self.h1_size = 130
        if cfg.has_option('dqnpolicy', 'h1_size'):
            self.h1_size = cfg.getint('dqnpolicy', 'h1_size')

        self.h2_size = 130
        if cfg.has_option('dqnpolicy', 'h2_size'):
            self.h2_size = cfg.getint('dqnpolicy', 'h2_size')

        self.training_frequency = 2
        if cfg.has_option('dqnpolicy', 'training_frequency'):
            self.training_frequency = cfg.getint('dqnpolicy',
                                                 'training_frequency')

        # domain specific parameter settings (overrides general policy parameter settings)
        if cfg.has_option('dqnpolicy_' + domainString, 'n_in'):
            self.n_in = cfg.getint('dqnpolicy_' + domainString, 'n_in')

        if cfg.has_option('dqnpolicy_' + domainString, 'learning_rate'):
            self.learning_rate = cfg.getfloat('dqnpolicy_' + domainString,
                                              'learning_rate')

        if cfg.has_option('dqnpolicy_' + domainString, 'tau'):
            self.tau = cfg.getfloat('dqnpolicy_' + domainString, 'tau')

        if cfg.has_option('dqnpolicy_' + domainString, 'gamma'):
            self.gamma = cfg.getfloat('dqnpolicy_' + domainString, 'gamma')

        if cfg.has_option('dqnpolicy_' + domainString, 'regularisation'):
            self.regularisation = cfg.get('dqnpolicy_' + domainString,
                                          'regularisation')

        if cfg.has_option('dqnpolicy_' + domainString, 'exploration_type'):
            self.exploration_type = cfg.get('dqnpolicy_' + domainString,
                                            'exploration_type')

        if cfg.has_option('dqnpolicy_' + domainString, 'episodeNum'):
            self.episodeNum = cfg.getfloat('dqnpolicy_' + domainString,
                                           'episodeNum')

        if cfg.has_option('dqnpolicy_' + domainString, 'maxiter'):
            self.maxiter = cfg.getfloat('dqnpolicy_' + domainString, 'maxiter')

        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon'):
            self.epsilon = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon')

        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_start'):
            self.epsilon_start = cfg.getfloat('dqnpolicy_' + domainString,
                                              'epsilon_start')

        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_end'):
            self.epsilon_end = cfg.getfloat('dqnpolicy_' + domainString,
                                            'epsilon_end')

        if cfg.has_option('policy_' + domainString, 'save_step'):
            self.save_step = cfg.getint('policy_' + domainString, 'save_step')

        if cfg.has_option('dqnpolicy_' + domainString,
                          'prior_sample_prob_start'):
            self.priorProbStart = cfg.getfloat('dqnpolicy_' + domainString,
                                               'prior_sample_prob_start')

        if cfg.has_option('dqnpolicy_' + domainString,
                          'prior_sample_prob_end'):
            self.priorProbEnd = cfg.getfloat('dqnpolicy_' + domainString,
                                             'prior_sample_prob_end')

        if cfg.has_option('dqnpolicy_' + domainString, 'features'):
            logger.info('Features: ' +
                        str(cfg.get('dqnpolicy_' + domainString, 'features')))
            self.policyfeatures = json.loads(
                cfg.get('dqnpolicy_' + domainString, 'features'))

        if cfg.has_option('dqnpolicy_' + domainString, 'max_k'):
            self.max_k = cfg.getint('dqnpolicy_' + domainString, 'max_k')

        if cfg.has_option('dqnpolicy_' + domainString, 'learning_algorithm'):
            self.learning_algorithm = cfg.get('dqnpolicy_' + domainString,
                                              'learning_algorithm')
            logger.info('Learning algorithm: ' + self.learning_algorithm)

        if cfg.has_option('dqnpolicy_' + domainString, 'minibatch_size'):
            self.minibatch_size = cfg.getint('dqnpolicy_' + domainString,
                                             'minibatch_size')

        if cfg.has_option('dqnpolicy_' + domainString, 'capacity'):
            self.capacity = cfg.getint('dqnpolicy_' + domainString, 'capacity')

        if cfg.has_option('dqnpolicy_' + domainString, 'replay_type'):
            self.replay_type = cfg.get('dqnpolicy_' + domainString,
                                       'replay_type')

        if cfg.has_option('dqnpolicy_' + domainString, 'architecture'):
            self.architecture = cfg.get('dqnpolicy_' + domainString,
                                        'architecture')

        if cfg.has_option('dqnpolicy_' + domainString, 'q_update'):
            self.q_update = cfg.get('dqnpolicy_' + domainString, 'q_update')

        if cfg.has_option('dqnpolicy_' + domainString, 'h1_size'):
            self.h1_size = cfg.getint('dqnpolicy_' + domainString, 'h1_size')

        if cfg.has_option('dqnpolicy_' + domainString, 'h2_size'):
            self.h2_size = cfg.getint('dqnpolicy_' + domainString, 'h2_size')

        if cfg.has_option('dqnpolicy_' + domainString, 'training_frequency'):
            self.training_frequency = cfg.getint('dqnpolicy_' + domainString,
                                                 'training_frequency')
        """
        self.shuffle = False
        if cfg.has_option('dqnpolicy_'+domainString, 'experience_replay'):
            self.shuffle = cfg.getboolean('dqnpolicy_'+domainString, 'experience_replay')
        if not self.shuffle:
            # If we don't use experience replay, we don't need to maintain
            # sliding window of experiences with maximum capacity.
            # We only need to maintain the data of minibatch_size
            self.capacity = self.minibatch_size
        """

        self.episode_ave_max_q = []

        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        policytype = 'dqn'
        self.dropout_rate = 0.
        if cfg.has_option('dqnpolicy', 'dropout_rate'):
            self.dropout_rate = cfg.getfloat('dqnpolicy', 'dropout_rate')
        if cfg.has_option('policy', 'policytype'):
            policytype = cfg.get('policy', 'policytype')
        if policytype != 'feudal':
            # init session
            self.sess = tf.Session()
            with tf.device("/cpu:0"):

                np.random.seed(self.randomseed)
                tf.set_random_seed(self.randomseed)

                # initialise a replay buffer
                if self.replay_type == 'vanilla':
                    self.episodes[self.domainString] = ReplayBuffer(
                        self.capacity, self.minibatch_size, self.randomseed)
                elif self.replay_type == 'prioritized':
                    self.episodes[self.domainString] = ReplayPrioritised(
                        self.capacity, self.minibatch_size, self.randomseed)
                self.samplecount = 0
                self.episodecount = 0

                # construct the models
                self.state_dim = self.n_in
                if self.architecture == 'dip2':
                    self.state_dim = 89
                self.summaryaction = SummaryAction.SummaryAction(domainString)
                if action_names is None:
                    self.action_names = self.summaryaction.action_names
                else:
                    self.action_names = action_names
                self.action_dim = len(self.action_names)
                action_bound = len(self.action_names)
                self.stats = [0 for _ in range(self.action_dim)]

                self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, \
                                            self.learning_rate, self.tau, action_bound, self.minibatch_size,
                                            self.architecture, self.h1_size,
                                            self.h2_size, dropout_rate=self.dropout_rate)

                # when all models are defined, init all variables
                init_op = tf.global_variables_initializer()
                self.sess.run(init_op)

                self.loadPolicy(self.in_policy_file)
                print 'loaded replay size: ', self.episodes[
                    self.domainString].size()

                self.dqn.update_target_network()
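
dqn.DeepQNetwork.update_target_network() is called here but its implementation is not shown; in this style of DQN code it is typically a soft (Polyak) update controlled by the tau value read from the config above. The following NumPy sketch illustrates that update rule and is an assumption, not the project's actual TensorFlow op.

import numpy as np

def soft_update(target_params, online_params, tau):
    # Polyak averaging: target <- tau * online + (1 - tau) * target.
    # With tau = 0.001 (the default above) the target network tracks the online
    # network slowly, which stabilises the Q-learning targets.
    return [tau * w + (1.0 - tau) * w_t
            for w, w_t in zip(online_params, target_params)]

# usage sketch with toy parameter arrays
online = [np.ones((4, 4)), np.zeros(4)]
target = [np.zeros((4, 4)), np.zeros(4)]
target = soft_update(target, online, tau=0.001)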
Example #9
    def __init__(self,
                 in_policy_file,
                 out_policy_file,
                 domainString='CamRestaurants',
                 is_training=False,
                 action_names=None,
                 slot=None):
        super(FeudalACERPolicy, self).__init__(in_policy_file, out_policy_file,
                                               domainString, is_training)

        tf.reset_default_graph()

        self.in_policy_file = in_policy_file
        self.out_policy_file = out_policy_file
        self.is_training = is_training
        self.accum_belief = []
        self.prev_state_check = None

        self.domainString = domainString
        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)

        self.features = 'dip'
        self.sd_enc_size = 80
        self.si_enc_size = 40
        self.dropout_rate = 0.
        if cfg.has_option('feudalpolicy', 'features'):
            self.features = cfg.get('feudalpolicy', 'features')
        if cfg.has_option('feudalpolicy', 'sd_enc_size'):
            self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size')
        if cfg.has_option('feudalpolicy', 'si_enc_size'):
            self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size')
        if cfg.has_option('feudalpolicy', 'dropout_rate') and self.is_training:
            self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate')
        self.actfreq_ds = False
        if cfg.has_option('feudalpolicy', 'actfreq_ds'):
            self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds')

        # init session
        self.sess = tf.Session()
        with tf.device("/cpu:0"):

            np.random.seed(self.randomseed)
            tf.set_random_seed(self.randomseed)

            # initialise a replay buffer
            if self.replay_type == 'vanilla':
                self.episodes[self.domainString] = ReplayBufferEpisode(
                    self.capacity, self.minibatch_size, self.randomseed)
            elif self.replay_type == 'prioritized':
                self.episodes[self.domainString] = ReplayPrioritisedEpisode(
                    self.capacity, self.minibatch_size, self.randomseed)
            #replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
            #self.episodes = []
            self.samplecount = 0
            self.episodecount = 0

            # construct the models
            self.state_dim = 89  # current DIP state dim
            self.summaryaction = policy.SummaryAction.SummaryAction(
                domainString)
            self.action_names = action_names
            self.action_dim = len(self.action_names)
            action_bound = len(self.action_names)
            self.stats = [0 for _ in range(self.action_dim)]

            self.global_mu = [0. for _ in range(self.action_dim)]

            if self.features == 'dip':
                if self.actfreq_ds:
                    if self.domainString == 'CamRestaurants':
                        self.state_dim += 9  #16
                    elif self.domainString == 'SFRestaurants':
                        self.state_dim += 9  #25
                    elif self.domainString == 'Laptops11':
                        self.state_dim += 9  #40
                self.acer = acer.ACERNetwork(self.sess, self.state_dim,
                                             self.action_dim, self.critic_lr,
                                             self.delta, self.c, self.alpha,
                                             self.h1_size, self.h2_size,
                                             self.is_training)
            elif self.features == 'learned' or self.features == 'rnn':
                si_state_dim = 72
                if self.actfreq_ds:
                    if self.domainString == 'CamRestaurants':
                        si_state_dim += 9  #16
                    elif self.domainString == 'SFRestaurants':
                        si_state_dim += 9  #25
                    elif self.domainString == 'Laptops11':
                        si_state_dim += 9  #40
                if self.domainString == 'CamRestaurants':
                    sd_state_dim = 158  #94
                elif self.domainString == 'SFRestaurants':
                    sd_state_dim = 158
                elif self.domainString == 'Laptops11':
                    sd_state_dim = 158  #13
                else:
                    logger.error(
                        'Domain {} not implemented in feudal-DQN yet'.format(
                            domainString)
                    )  # just find out the size of sd_state_dim for the new domain
                if 0:  #self.features == 'rnn':
                    self.acer = acer.RNNACERNetwork(self.sess,
                                                    si_state_dim,
                                                    sd_state_dim,
                                                    self.action_dim,
                                                    self.critic_lr,
                                                    self.delta,
                                                    self.c,
                                                    self.alpha,
                                                    self.h1_size,
                                                    self.h2_size,
                                                    self.is_training,
                                                    sd_enc_size=25,
                                                    si_enc_size=25,
                                                    dropout_rate=0.,
                                                    tn='normal',
                                                    slot='si')
                else:
                    self.state_dim = si_state_dim + sd_state_dim
                    self.acer = acer.ACERNetwork(self.sess, self.state_dim,
                                                 self.action_dim,
                                                 self.critic_lr, self.delta,
                                                 self.c, self.alpha,
                                                 self.h1_size, self.h2_size,
                                                 self.is_training)

            else:
                logger.error('features "{}" not implemented'.format(
                    self.features))

            # when all models are defined, init all variables
            init_op = tf.global_variables_initializer()
            self.sess.run(init_op)

            self.loadPolicy(self.in_policy_file)
            print 'loaded replay size: ', self.episodes[
                self.domainString].size()
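
Several of these constructors read epsilon_start, epsilon_end and maxiter, but the exploration schedule itself is implemented elsewhere. A common choice, and a hedged guess at how these values are used, is a linear anneal over training episodes:

def annealed_epsilon(episode, epsilon_start, epsilon_end, maxiter):
    # Linearly anneal epsilon from epsilon_start to epsilon_end over maxiter
    # episodes, then hold it at epsilon_end. This is an assumption about how the
    # configured values are consumed; the real schedule lives in the act logic.
    if maxiter <= 0:
        return epsilon_end
    frac = min(float(episode) / float(maxiter), 1.0)
    return epsilon_start + frac * (epsilon_end - epsilon_start)

# e.g. annealed_epsilon(2500, 1.0, 0.1, 5000) == 0.55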
Example #10
    def __init__(self,
                 in_policy_file,
                 out_policy_file,
                 domainString='CamRestaurants',
                 is_training=False,
                 action_names=None,
                 slot=None):
        super(FeudalDQNPolicy, self).__init__(in_policy_file, out_policy_file,
                                              domainString, is_training)

        tf.reset_default_graph()

        self.domainString = domainString
        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
        self.in_policy_file = in_policy_file
        self.out_policy_file = out_policy_file
        self.is_training = is_training
        self.accum_belief = []
        self.slot = slot
        self.features = 'dip'
        if cfg.has_option('feudalpolicy', 'features'):
            self.features = cfg.get('feudalpolicy', 'features')
        self.actfreq_ds = False
        if cfg.has_option('feudalpolicy', 'actfreq_ds'):
            self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds')

        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
        self.prev_state_check = None

        self.max_k = 5
        if cfg.has_option('dqnpolicy', 'max_k'):
            self.max_k = cfg.getint('dqnpolicy', 'max_k')

        self.capacity *= 4  # capacity was tuned for episode-based methods; scale it up for turn-based updates

        # init session
        self.sess = tf.Session()
        with tf.device("/cpu:0"):

            np.random.seed(self.randomseed)
            tf.set_random_seed(self.randomseed)

            # initialise a replay buffer
            if self.replay_type == 'vanilla':
                self.episodes[self.domainString] = ReplayBuffer(
                    self.capacity, self.minibatch_size, self.randomseed)
            elif self.replay_type == 'prioritized':
                self.episodes[self.domainString] = ReplayPrioritised(
                    self.capacity, self.minibatch_size, self.randomseed)
            self.samplecount = 0
            self.episodecount = 0

            # construct the models
            self.state_dim = 89  # current DIP state dim
            self.summaryaction = policy.SummaryAction.SummaryAction(
                domainString)
            self.action_names = action_names
            self.action_dim = len(self.action_names)
            action_bound = len(self.action_names)
            self.stats = [0 for _ in range(self.action_dim)]

            if self.features == 'learned' or self.features == 'rnn':
                si_state_dim = 72
                if self.actfreq_ds:
                    if self.domainString == 'CamRestaurants':
                        si_state_dim += 9  #16
                    elif self.domainString == 'SFRestaurants':
                        si_state_dim += 9  #25
                    elif self.domainString == 'Laptops11':
                        si_state_dim += 9  #40
                if self.domainString == 'CamRestaurants':
                    sd_state_dim = 158  #94
                elif self.domainString == 'SFRestaurants':
                    sd_state_dim = 158
                elif self.domainString == 'Laptops11':
                    sd_state_dim = 158  #13
                else:
                    logger.error(
                        'Domain {} not implemented in feudal-DQN yet'.format(
                            domainString)
                    )  # just find out the size of sd_state_dim for the new domain
                self.sd_enc_size = 50
                self.si_enc_size = 25
                self.dropout_rate = 0.
                if cfg.has_option('feudalpolicy', 'sd_enc_size'):
                    self.sd_enc_size = cfg.getint('feudalpolicy',
                                                  'sd_enc_size')
                if cfg.has_option('feudalpolicy', 'si_enc_size'):
                    self.si_enc_size = cfg.getint('feudalpolicy',
                                                  'si_enc_size')
                if cfg.has_option('feudalpolicy',
                                  'dropout_rate') and self.is_training:
                    self.dropout_rate = cfg.getfloat('feudalpolicy',
                                                     'dropout_rate')

                self.state_dim = si_state_dim + sd_state_dim
                if self.features == 'learned':
                    self.dqn = dqn.NNFDeepQNetwork(
                        self.sess,
                        si_state_dim,
                        sd_state_dim,
                        self.action_dim,
                        self.learning_rate,
                        self.tau,
                        action_bound,
                        self.minibatch_size,
                        self.architecture,
                        self.h1_size,
                        self.h2_size,
                        sd_enc_size=self.sd_enc_size,
                        si_enc_size=self.si_enc_size,
                        dropout_rate=self.dropout_rate)
                elif self.features == 'rnn':
                    self.dqn = dqn.RNNFDeepQNetwork(
                        self.sess,
                        si_state_dim,
                        sd_state_dim,
                        self.action_dim,
                        self.learning_rate,
                        self.tau,
                        action_bound,
                        self.minibatch_size,
                        self.architecture,
                        self.h1_size,
                        self.h2_size,
                        sd_enc_size=self.sd_enc_size,
                        si_enc_size=self.si_enc_size,
                        dropout_rate=self.dropout_rate,
                        slot=self.slot)
            else:  # self.features = 'dip'
                if self.actfreq_ds:
                    if self.domainString == 'CamRestaurants':
                        self.state_dim += 9  #16
                    elif self.domainString == 'SFRestaurants':
                        self.state_dim += 9  #25
                    elif self.domainString == 'Laptops11':
                        self.state_dim += 9  #40
                self.dqn = dqn.DeepQNetwork(self.sess,
                                            self.state_dim,
                                            self.action_dim,
                                            self.learning_rate,
                                            self.tau,
                                            action_bound,
                                            self.minibatch_size,
                                            self.architecture,
                                            self.h1_size,
                                            self.h2_size,
                                            dropout_rate=self.dropout_rate)

            # when all models are defined, init all variables (this might need to be moved to the main policy too)
            init_op = tf.global_variables_initializer()
            self.sess.run(init_op)

            self.loadPolicy(self.in_policy_file)
            print 'loaded replay size: ', self.episodes[
                self.domainString].size()

            self.dqn.update_target_network()
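
The ReplayBuffer and ReplayPrioritised classes instantiated above are imported from DRL but not shown. A minimal sketch of the uniform-buffer interface these policies rely on (capacity-bounded storage, size(), and random minibatch sampling) might look like the following; the method names other than size() are assumptions based on how the buffer is constructed above.

import random
from collections import deque

class SimpleReplayBuffer(object):
    # Hedged sketch of a uniform replay buffer: fixed capacity, random minibatches.
    def __init__(self, capacity, minibatch_size, seed):
        self.buffer = deque(maxlen=capacity)
        self.minibatch_size = minibatch_size
        random.seed(seed)

    def record(self, state, action, reward, next_state, terminal):
        self.buffer.append((state, action, reward, next_state, terminal))

    def size(self):
        return len(self.buffer)

    def sample_batch(self):
        n = min(self.minibatch_size, len(self.buffer))
        return random.sample(list(self.buffer), n)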
Example #11
    def __init__(self,
                 in_policy_file,
                 out_policy_file,
                 domainString='CamRestaurants',
                 is_training=False):
        super(BootstrappedDQNPolicy, self).__init__(domainString, is_training)

        self.domainString = domainString
        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
        self.in_policy_file = in_policy_file
        self.out_policy_file = out_policy_file
        self.is_training = is_training
        self.accum_belief = []

        self.prev_state_check = None

        # parameter settings
        self.n_in = 260
        if cfg.has_option('dqnpolicy_' + domainString, 'n_in'):
            self.n_in = cfg.getint('dqnpolicy_' + domainString, 'n_in')

        self.actor_lr = 0.0001
        if cfg.has_option('dqnpolicy_' + domainString, 'actor_lr'):
            self.actor_lr = cfg.getfloat('dqnpolicy_' + domainString,
                                         'actor_lr')

        self.critic_lr = 0.001
        if cfg.has_option('dqnpolicy_' + domainString, 'critic_lr'):
            self.critic_lr = cfg.getfloat('dqnpolicy_' + domainString,
                                          'critic_lr')

        self.tau = 0.001
        if cfg.has_option('dqnpolicy_' + domainString, 'tau'):
            self.tau = cfg.getfloat('dqnpolicy_' + domainString, 'tau')

        self.randomseed = 1234
        if cfg.has_option('GENERAL', 'seed'):
            self.randomseed = cfg.getint('GENERAL', 'seed')

        self.gamma = 1.0
        if cfg.has_option('dqnpolicy_' + domainString, 'gamma'):
            self.gamma = cfg.getfloat('dqnpolicy_' + domainString, 'gamma')

        self.regularisation = 'l2'
        if cfg.has_option('dqnpolicy_' + domainString, 'regularisation'):
            self.regularisation = cfg.get('dqnpolicy_' + domainString,
                                          'regularisation')

        self.learning_rate = 0.001
        if cfg.has_option('dqnpolicy_' + domainString, 'learning_rate'):
            self.learning_rate = cfg.getfloat('dqnpolicy_' + domainString,
                                              'learning_rate')

        self.exploration_type = 'e-greedy'  # Boltzman
        if cfg.has_option('dqnpolicy_' + domainString, 'exploration_type'):
            self.exploration_type = cfg.get('dqnpolicy_' + domainString,
                                            'exploration_type')

        self.episodeNum = 1000
        if cfg.has_option('dqnpolicy_' + domainString, 'episodeNum'):
            self.episodeNum = cfg.getfloat('dqnpolicy_' + domainString,
                                           'episodeNum')

        self.maxiter = 5000
        if cfg.has_option('dqnpolicy_' + domainString, 'maxiter'):
            self.maxiter = cfg.getfloat('dqnpolicy_' + domainString, 'maxiter')

        self.epsilon = 1
        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon'):
            self.epsilon = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon')

        self.epsilon_start = 1
        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_start'):
            self.epsilon_start = cfg.getfloat('dqnpolicy_' + domainString,
                                              'epsilon_start')

        self.epsilon_end = 1
        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_end'):
            self.epsilon_end = cfg.getfloat('dqnpolicy_' + domainString,
                                            'epsilon_end')

        self.priorProbStart = 1.0
        if cfg.has_option('dqnpolicy_' + domainString,
                          'prior_sample_prob_start'):
            self.priorProbStart = cfg.getfloat('dqnpolicy_' + domainString,
                                               'prior_sample_prob_start')

        self.save_step = 100
        if cfg.has_option('policy_' + domainString, 'save_step'):
            self.save_step = cfg.getint('policy_' + domainString, 'save_step')

        self.priorProbEnd = 0.1
        if cfg.has_option('dqnpolicy_' + domainString,
                          'prior_sample_prob_end'):
            self.priorProbEnd = cfg.getfloat('dqnpolicy_' + domainString,
                                             'prior_sample_prob_end')

        self.policyfeatures = []
        if cfg.has_option('dqnpolicy_' + domainString, 'features'):
            logger.info('Features: ' +
                        str(cfg.get('dqnpolicy_' + domainString, 'features')))
            self.policyfeatures = json.loads(
                cfg.get('dqnpolicy_' + domainString, 'features'))

        self.max_k = 5
        if cfg.has_option('dqnpolicy_' + domainString, 'max_k'):
            self.max_k = cfg.getint('dqnpolicy_' + domainString, 'max_k')

        self.learning_algorithm = 'drl'
        if cfg.has_option('dqnpolicy_' + domainString, 'learning_algorithm'):
            self.learning_algorithm = cfg.get('dqnpolicy_' + domainString,
                                              'learning_algorithm')
            logger.info('Learning algorithm: ' + self.learning_algorithm)

        self.minibatch_size = 32
        if cfg.has_option('dqnpolicy_' + domainString, 'minibatch_size'):
            self.minibatch_size = cfg.getint('dqnpolicy_' + domainString,
                                             'minibatch_size')

        self.capacity = 1000  # max(self.minibatch_size, 2000)
        if cfg.has_option('dqnpolicy_' + domainString, 'capacity'):
            self.capacity = max(
                cfg.getint('dqnpolicy_' + domainString, 'capacity'), 2000)

        self.replay_type = 'vanilla'
        if cfg.has_option('dqnpolicy_' + domainString, 'replay_type'):
            self.replay_type = cfg.get('dqnpolicy_' + domainString,
                                       'replay_type')

        self.architecture = 'vanilla'
        if cfg.has_option('dqnpolicy_' + domainString, 'architecture'):
            self.architecture = cfg.get('dqnpolicy_' + domainString,
                                        'architecture')

        self.q_update = 'single'
        if cfg.has_option('dqnpolicy_' + domainString, 'q_update'):
            self.q_update = cfg.get('dqnpolicy_' + domainString, 'q_update')

        self.h1_size = 130
        if cfg.has_option('dqnpolicy_' + domainString, 'h1_size'):
            self.h1_size = cfg.getint('dqnpolicy_' + domainString, 'h1_size')

        self.h2_size = 130
        if cfg.has_option('dqnpolicy_' + domainString, 'h2_size'):
            self.h2_size = cfg.getint('dqnpolicy_' + domainString, 'h2_size')

        self.no_heads = 3
        if cfg.has_option('dqnpolicy_' + domainString, 'no_head'):
            self.no_heads = cfg.getint('dqnpolicy_' + domainString, 'no_head')

        self.episode_ave_max_q = []

        os.environ["CUDA_VISIBLE_DEVICES"] = ""

        # initialize head
        self.head = None

        # init session
        self.sess = tf.Session()
        with tf.device("/cpu:0"):

            np.random.seed(self.randomseed)
            tf.set_random_seed(self.randomseed)

            # initialise a replay buffer
            if self.replay_type == 'vanilla':
                self.episodes[self.domainString] = ReplayBuffer(
                    self.capacity, self.minibatch_size, self.randomseed)
            elif self.replay_type == 'prioritized':
                self.episodes[self.domainString] = ReplayPrioritised(
                    self.capacity, self.minibatch_size, self.randomseed)
            # replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
            # self.episodes = []
            self.samplecount = 0
            self.episodecount = 0

            # construct the models
            self.state_dim = self.n_in
            self.summaryaction = SummaryAction.SummaryAction(domainString)
            self.action_dim = len(self.summaryaction.action_names)
            action_bound = len(self.summaryaction.action_names)
            self.stats = [0 for _ in range(self.action_dim)]

            self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, \
                                        self.critic_lr, self.tau, action_bound, self.architecture, self.h1_size,
                                        self.h2_size, self.no_heads, self.minibatch_size)

            # when all models are defined, init all variables
            init_op = tf.global_variables_initializer()
            self.sess.run(init_op)

            self.loadPolicy(self.in_policy_file)
            print 'loaded replay size: ', self.episodes[
                self.domainString].size()

            for head in range(self.no_heads):
                self.dqn.update_target_network(head)
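
BootstrappedDQNPolicy keeps self.no_heads independent Q-heads and a self.head slot initialised to None above. In bootstrapped DQN a head is usually sampled uniformly at the start of each episode and followed greedily for the whole dialogue; the episode logic is not part of this excerpt, so the sketch below is an assumption about that convention.

import random

def sample_head(no_heads, rng=random):
    # Pick one of the bootstrap heads uniformly at the start of an episode.
    return rng.randint(0, no_heads - 1)

# usage sketch with the default of 3 heads
head = sample_head(3)
assert head in (0, 1, 2)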
Example #12
    def __init__(self,
                 in_policy_file,
                 out_policy_file,
                 domainString='CamRestaurants',
                 is_training=False):
        super(ACERPolicy, self).__init__(domainString, is_training)

        tf.reset_default_graph()

        self.in_policy_file = in_policy_file
        self.out_policy_file = out_policy_file
        self.is_training = is_training
        self.accum_belief = []
        self.prev_state_check = None

        self.domainString = domainString
        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)

        #improvement==================================
        self.intrinsic_reward_method = None
        if cfg.has_option('scme', 'method'):
            self.intrinsic_reward_method = cfg.get('scme', 'method')
        #improvement==================================

        # parameter settings

        if 0:  # cfg.has_option('dqnpolicy', 'n_in'): #ic304: this was giving me a weird error, disabled it until i can check it deeper
            self.n_in = cfg.getint('dqnpolicy', 'n_in')
        else:
            self.n_in = self.get_n_in(domainString)

        self.actor_lr = 0.0001
        if cfg.has_option('dqnpolicy', 'actor_lr'):
            self.actor_lr = cfg.getfloat('dqnpolicy', 'actor_lr')

        self.critic_lr = 0.001
        if cfg.has_option('dqnpolicy', 'critic_lr'):
            self.critic_lr = cfg.getfloat('dqnpolicy', 'critic_lr')

        self.delta = 1.
        if cfg.has_option('dqnpolicy', 'delta'):
            self.delta = cfg.getfloat('dqnpolicy', 'delta')

        self.alpha = 0.99
        if cfg.has_option('dqnpolicy', 'beta'):
            self.alpha = cfg.getfloat('dqnpolicy', 'beta')

        self.c = 10.
        if cfg.has_option('dqnpolicy', 'is_threshold'):
            self.c = cfg.getfloat('dqnpolicy', 'is_threshold')

        self.randomseed = 1234
        if cfg.has_option('GENERAL', 'seed'):
            self.randomseed = cfg.getint('GENERAL', 'seed')

        self.gamma = 0.99
        if cfg.has_option('dqnpolicy', 'gamma'):
            self.gamma = cfg.getfloat('dqnpolicy', 'gamma')

        self.regularisation = 'l2'
        if cfg.has_option('dqnpolicy', 'regularisation'):
            self.regularisation = cfg.get('dqnpolicy', 'regularisation')

        self.learning_rate = 0.001
        if cfg.has_option('dqnpolicy', 'learning_rate'):
            self.learning_rate = cfg.getfloat('dqnpolicy', 'learning_rate')

        self.exploration_type = 'e-greedy'  # Boltzman
        if cfg.has_option('dqnpolicy', 'exploration_type'):
            self.exploration_type = cfg.get('dqnpolicy', 'exploration_type')

        self.episodeNum = 1000
        if cfg.has_option('dqnpolicy', 'episodeNum'):
            self.episodeNum = cfg.getfloat('dqnpolicy', 'episodeNum')

        self.maxiter = 5000
        if cfg.has_option('dqnpolicy', 'maxiter'):
            self.maxiter = cfg.getfloat('dqnpolicy', 'maxiter')

        self.curiosityreward = False
        if cfg.has_option('eval', 'curiosityreward'):
            self.curiosityreward = cfg.getboolean('eval', 'curiosityreward')

        self.epsilon = 1
        if cfg.has_option('dqnpolicy', 'epsilon'):
            self.epsilon = cfg.getfloat('dqnpolicy', 'epsilon')

        if not self.curiosityreward:  # no eps-greedy exploration when curious expl. is used
            self.epsilon_start = 1
            if cfg.has_option('dqnpolicy', 'epsilon_start'):
                self.epsilon_start = cfg.getfloat('dqnpolicy', 'epsilon_start')
        else:
            self.epsilon_start = 1
            if cfg.has_option('dqnpolicy', 'epsilon_start'):
                self.epsilon_start = cfg.getfloat('dqnpolicy', 'epsilon_start')

        self.epsilon_end = 1
        if cfg.has_option('dqnpolicy', 'epsilon_end'):
            self.epsilon_end = cfg.getfloat('dqnpolicy', 'epsilon_end')

        self.priorProbStart = 1.0
        if cfg.has_option('dqnpolicy', 'prior_sample_prob_start'):
            self.priorProbStart = cfg.getfloat('dqnpolicy',
                                               'prior_sample_prob_start')

        self.priorProbEnd = 0.1
        if cfg.has_option('dqnpolicy', 'prior_sample_prob_end'):
            self.priorProbEnd = cfg.getfloat('dqnpolicy',
                                             'prior_sample_prob_end')

        self.policyfeatures = []
        if cfg.has_option('dqnpolicy', 'features'):
            logger.info('Features: ' + str(cfg.get('dqnpolicy', 'features')))
            self.policyfeatures = json.loads(cfg.get('dqnpolicy', 'features'))

        self.max_k = 5
        if cfg.has_option('dqnpolicy', 'max_k'):
            self.max_k = cfg.getint('dqnpolicy', 'max_k')

        self.learning_algorithm = 'drl'
        if cfg.has_option('dqnpolicy', 'learning_algorithm'):
            self.learning_algorithm = cfg.get('dqnpolicy',
                                              'learning_algorithm')
            logger.info('Learning algorithm: ' + self.learning_algorithm)

        self.minibatch_size = 32
        if cfg.has_option('dqnpolicy', 'minibatch_size'):
            self.minibatch_size = cfg.getint('dqnpolicy', 'minibatch_size')

        self.capacity = 1000
        if cfg.has_option('dqnpolicy', 'capacity'):
            self.capacity = cfg.getint('dqnpolicy', 'capacity')

        self.replay_type = 'vanilla'
        if cfg.has_option('dqnpolicy', 'replay_type'):
            self.replay_type = cfg.get('dqnpolicy', 'replay_type')

        self.architecture = 'vanilla'
        if cfg.has_option('dqnpolicy', 'architecture'):
            self.architecture = cfg.get('dqnpolicy', 'architecture')

        self.q_update = 'single'
        if cfg.has_option('dqnpolicy', 'q_update'):
            self.q_update = cfg.get('dqnpolicy', 'q_update')

        self.h1_size = 130
        if cfg.has_option('dqnpolicy', 'h1_size'):
            self.h1_size = cfg.getint('dqnpolicy', 'h1_size')

        self.h2_size = 50
        if cfg.has_option('dqnpolicy', 'h2_size'):
            self.h2_size = cfg.getint('dqnpolicy', 'h2_size')

        self.save_step = 200
        if cfg.has_option('policy', 'save_step'):
            self.save_step = cfg.getint('policy', 'save_step')

        self.importance_sampling = 'soft'
        if cfg.has_option('dqnpolicy', 'importance_sampling'):
            self.importance_sampling = cfg.get('dqnpolicy',
                                               'importance_sampling')

        self.train_iters_per_episode = 1
        if cfg.has_option('dqnpolicy', 'train_iters_per_episode'):
            self.train_iters_per_episode = cfg.getint(
                'dqnpolicy', 'train_iters_per_episode')

        self.training_frequency = 2
        if cfg.has_option('dqnpolicy', 'training_frequency'):
            self.training_frequency = cfg.getint('dqnpolicy',
                                                 'training_frequency')

        # domain specific parameter settings (overrides general policy parameter settings)
        if cfg.has_option('dqnpolicy_' + domainString, 'n_in'):
            self.n_in = cfg.getint('dqnpolicy_' + domainString, 'n_in')

        if cfg.has_option('dqnpolicy_' + domainString, 'actor_lr'):
            self.actor_lr = cfg.getfloat('dqnpolicy_' + domainString,
                                         'actor_lr')

        if cfg.has_option('dqnpolicy_' + domainString, 'critic_lr'):
            self.critic_lr = cfg.getfloat('dqnpolicy_' + domainString,
                                          'critic_lr')

        if cfg.has_option('dqnpolicy_' + domainString, 'delta'):
            self.delta = cfg.getfloat('dqnpolicy_' + domainString, 'delta')

        if cfg.has_option('dqnpolicy_' + domainString, 'beta'):
            self.alpha = cfg.getfloat('dqnpolicy_' + domainString, 'beta')

        if cfg.has_option('dqnpolicy_' + domainString, 'is_threshold'):
            self.c = cfg.getfloat('dqnpolicy_' + domainString, 'is_threshold')

        if cfg.has_option('dqnpolicy_' + domainString, 'gamma'):
            self.gamma = cfg.getfloat('dqnpolicy_' + domainString, 'gamma')

        if cfg.has_option('dqnpolicy_' + domainString, 'regularisation'):
            self.regularisation = cfg.get('dqnpolicy_' + domainString,
                                          'regularisation')

        if cfg.has_option('dqnpolicy_' + domainString, 'learning_rate'):
            self.learning_rate = cfg.getfloat('dqnpolicy_' + domainString,
                                              'learning_rate')

        if cfg.has_option('dqnpolicy_' + domainString, 'exploration_type'):
            self.exploration_type = cfg.get('dqnpolicy_' + domainString,
                                            'exploration_type')

        if cfg.has_option('dqnpolicy_' + domainString, 'episodeNum'):
            self.episodeNum = cfg.getfloat('dqnpolicy_' + domainString,
                                           'episodeNum')

        if cfg.has_option('dqnpolicy_' + domainString, 'maxiter'):
            self.maxiter = cfg.getfloat('dqnpolicy_' + domainString, 'maxiter')

        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon'):
            self.epsilon = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon')

        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_start'):
            self.epsilon_start = cfg.getfloat('dqnpolicy_' + domainString,
                                              'epsilon_start')

        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_end'):
            self.epsilon_end = cfg.getfloat('dqnpolicy_' + domainString,
                                            'epsilon_end')

        if cfg.has_option('dqnpolicy_' + domainString,
                          'prior_sample_prob_start'):
            self.priorProbStart = cfg.getfloat('dqnpolicy_' + domainString,
                                               'prior_sample_prob_start')

        if cfg.has_option('dqnpolicy_' + domainString,
                          'prior_sample_prob_end'):
            self.priorProbEnd = cfg.getfloat('dqnpolicy_' + domainString,
                                             'prior_sample_prob_end')

        if cfg.has_option('dqnpolicy_' + domainString, 'features'):
            logger.info('Features: ' +
                        str(cfg.get('dqnpolicy_' + domainString, 'features')))
            self.policyfeatures = json.loads(
                cfg.get('dqnpolicy_' + domainString, 'features'))

        if cfg.has_option('dqnpolicy_' + domainString, 'max_k'):
            self.max_k = cfg.getint('dqnpolicy_' + domainString, 'max_k')

        if cfg.has_option('dqnpolicy_' + domainString, 'learning_algorithm'):
            self.learning_algorithm = cfg.get('dqnpolicy_' + domainString,
                                              'learning_algorithm')
            logger.info('Learning algorithm: ' + self.learning_algorithm)

        if cfg.has_option('dqnpolicy_' + domainString, 'minibatch_size'):
            self.minibatch_size = cfg.getint('dqnpolicy_' + domainString,
                                             'minibatch_size')

        if cfg.has_option('dqnpolicy_' + domainString, 'capacity'):
            self.capacity = cfg.getint('dqnpolicy_' + domainString, 'capacity')

        if cfg.has_option('dqnpolicy_' + domainString, 'replay_type'):
            self.replay_type = cfg.get('dqnpolicy_' + domainString,
                                       'replay_type')

        if cfg.has_option('dqnpolicy_' + domainString, 'architecture'):
            self.architecture = cfg.get('dqnpolicy_' + domainString,
                                        'architecture')

        if cfg.has_option('dqnpolicy_' + domainString, 'q_update'):
            self.q_update = cfg.get('dqnpolicy_' + domainString, 'q_update')

        if cfg.has_option('dqnpolicy_' + domainString, 'h1_size'):
            self.h1_size = cfg.getint('dqnpolicy_' + domainString, 'h1_size')

        if cfg.has_option('dqnpolicy_' + domainString, 'h2_size'):
            self.h2_size = cfg.getint('dqnpolicy_' + domainString, 'h2_size')

        if cfg.has_option('policy_' + domainString, 'save_step'):
            self.save_step = cfg.getint('policy_' + domainString, 'save_step')

        if cfg.has_option('dqnpolicy_' + domainString, 'importance_sampling'):
            self.importance_sampling = cfg.get('dqnpolicy_' + domainString,
                                               'importance_sampling')

        if cfg.has_option('dqnpolicy_' + domainString,
                          'train_iters_per_episode'):
            self.train_iters_per_episode = cfg.getint(
                'dqnpolicy_' + domainString, 'train_iters_per_episode')

        if cfg.has_option('dqnpolicy_' + domainString, 'training_frequency'):
            self.training_frequency = cfg.getint('dqnpolicy_' + domainString,
                                                 'training_frequency')

        self.episode_ct = 0

        self.episode_ave_max_q = []
        self.mu_prob = 0.  # behavioral policy

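        # Hide all GPUs from TensorFlow so the whole policy runs on the CPU.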
        os.environ["CUDA_VISIBLE_DEVICES"] = ""

        # init session
        self.sess = tf.Session()

        with tf.device("/cpu:0"):

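            # Seed NumPy and TensorFlow so weight initialisation and replay
            # sampling are reproducible across runs.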
            np.random.seed(self.randomseed)
            tf.set_random_seed(self.randomseed)

            # initialise a replay buffer
            if self.replay_type == 'vanilla':
                self.episodes[self.domainString] = ReplayBufferEpisode(
                    self.capacity, self.minibatch_size, self.randomseed)
            elif self.replay_type == 'prioritized':
                self.episodes[self.domainString] = ReplayPrioritisedEpisode(
                    self.capacity, self.minibatch_size, self.randomseed)
            #replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
            #self.episodes = []
            self.samplecount = 0
            self.episodecount = 0

            # construct the models
            self.state_dim = self.n_in
            self.summaryaction = SummaryAction.SummaryAction(domainString)
            self.action_dim = len(self.summaryaction.action_names)
            action_bound = len(self.summaryaction.action_names)
            self.stats = [0 for _ in range(self.action_dim)]
            self.global_mu = [0. for _ in range(self.action_dim)]

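            # ACER actor-critic network; delta, c and alpha hold the values read
            # from the 'delta', 'is_threshold' and 'beta' config options above.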
            self.acer = acer.ACERNetwork(self.sess, self.state_dim, self.action_dim, \
                self.critic_lr, self.delta, self.c, self.alpha, self.h1_size, self.h2_size, self.is_training, self.randomseed)

            # when all models are defined, init all variables
            init_op = tf.global_variables_initializer()
            self.sess.run(init_op)

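            # Restore the network weights (and, where available, the replay
            # buffer) saved under the incoming policy file name.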
            self.loadPolicy(self.in_policy_file)
            print 'loaded replay size: ', self.episodes[
                self.domainString].size()

            #improvement==================================
            # initialise the chosen intrinsic-reward model, if one is configured
            if self.intrinsic_reward_method == 'vime':
                self.vime_model = vime(self.state_dim, self.action_dim)
                self.vime_model.load_model('model/vime_model/' +
                                           self.in_policy_file)

            elif self.intrinsic_reward_method == 'cme':
                self.cme_model = cme(self.state_dim, self.action_dim)
                self.cme_model.load_model('model/cme_model/' +
                                          self.in_policy_file)

            elif self.intrinsic_reward_method == 'scme':
                self.scme_model = scme(self.state_dim, self.action_dim)
                self.scme_model.load_model('model/scme_model/' +
                                           self.in_policy_file)

    def __init__(self,
                 in_policy_file,
                 out_policy_file,
                 domainString='CamRestaurants',
                 is_training=False,
                 action_names=None):
        super(DQNPolicy, self).__init__(domainString, is_training)

        tf.reset_default_graph()

        self.domainString = domainString
        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
        self.in_policy_file = in_policy_file
        self.out_policy_file = out_policy_file
        self.is_training = is_training
        self.accum_belief = []
        self.prev_state_check = None

        #improvement==================================
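        # Selects the intrinsic-reward (curiosity) model: 'vime', 'cme' or
        # 'scme'; None disables intrinsic rewards.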
        self.intrinsic_reward_method = None
        self.conf = ConfigParser.ConfigParser()
        if utils.Settings.config.has_option('scme', 'method'):
            self.intrinsic_reward_method = utils.Settings.config.get(
                'scme', 'method')
        #improvement==================================

        # parameter settings
        if 0:  # cfg.has_option('dqnpolicy', 'n_in'): ic304: this was raising an odd error, so it is disabled until it can be investigated properly
            self.n_in = cfg.getint('dqnpolicy', 'n_in')
        else:
            self.n_in = self.get_n_in(domainString)

        self.learning_rate = 0.001
        if utils.Settings.config.has_option('dqnpolicy', 'learning_rate'):
            self.learning_rate = utils.Settings.config.getfloat(
                'dqnpolicy', 'learning_rate')

        self.tau = 0.001
        if utils.Settings.config.has_option('dqnpolicy', 'tau'):
            self.tau = utils.Settings.config.getfloat('dqnpolicy', 'tau')

        # self.randomseed = 1234  # TODO: the cfg import no longer works, therefore all cfg accesses were changed to utils.Settings.config.
        # if cfg.has_option('GENERAL', 'seed'):
        #     self.randomseed = cfg.getint('GENERAL', 'seed')  # the same pattern is used below; this is kept only as an example

        self.randomseed = 1234
        if utils.Settings.config.has_option('GENERAL', 'seed'):
            self.randomseed = utils.Settings.config.getint('GENERAL', 'seed')

        self.gamma = 1.0
        if utils.Settings.config.has_option('dqnpolicy', 'gamma'):
            self.gamma = utils.Settings.config.getfloat('dqnpolicy', 'gamma')

        self.regularisation = 'l2'
        if utils.Settings.config.has_option('dqnpolicy', 'regularisation'):
            self.regularisation = utils.Settings.config.get(
                'dqnpolicy', 'regularisation')

        self.exploration_type = 'e-greedy'  # Boltzman
        if utils.Settings.config.has_option('dqnpolicy', 'exploration_type'):
            self.exploration_type = utils.Settings.config.get(
                'dqnpolicy', 'exploration_type')

        self.episodeNum = 1000
        if utils.Settings.config.has_option('dqnpolicy', 'episodeNum'):
            self.episodeNum = utils.Settings.config.getfloat(
                'dqnpolicy', 'episodeNum')

        self.maxiter = 5000
        if utils.Settings.config.has_option('dqnpolicy', 'maxiter'):
            self.maxiter = utils.Settings.config.getfloat(
                'dqnpolicy', 'maxiter')

        self.epsilon = 1
        if utils.Settings.config.has_option('dqnpolicy', 'epsilon'):
            self.epsilon = utils.Settings.config.getfloat(
                'dqnpolicy', 'epsilon')

        self.epsilon_start = 1
        if utils.Settings.config.has_option('dqnpolicy', 'epsilon_start'):
            self.epsilon_start = utils.Settings.config.getfloat(
                'dqnpolicy', 'epsilon_start')

        self.epsilon_end = 1
        if utils.Settings.config.has_option('dqnpolicy', 'epsilon_end'):
            self.epsilon_end = utils.Settings.config.getfloat(
                'dqnpolicy', 'epsilon_end')

        self.save_step = 100
        if utils.Settings.config.has_option('policy', 'save_step'):
            self.save_step = utils.Settings.config.getint(
                'policy', 'save_step')

        self.priorProbStart = 1.0
        if utils.Settings.config.has_option('dqnpolicy',
                                            'prior_sample_prob_start'):
            self.priorProbStart = utils.Settings.config.getfloat(
                'dqnpolicy', 'prior_sample_prob_start')

        self.priorProbEnd = 0.1
        if utils.Settings.config.has_option('dqnpolicy',
                                            'prior_sample_prob_end'):
            self.priorProbEnd = utils.Settings.config.getfloat(
                'dqnpolicy', 'prior_sample_prob_end')

        self.policyfeatures = []
        if utils.Settings.config.has_option('dqnpolicy', 'features'):
            logger.info(
                'Features: ' +
                str(utils.Settings.config.get('dqnpolicy', 'features')))
            self.policyfeatures = json.loads(
                utils.Settings.config.get('dqnpolicy', 'features'))

        self.max_k = 5
        if utils.Settings.config.has_option('dqnpolicy', 'max_k'):
            self.max_k = utils.Settings.config.getint('dqnpolicy', 'max_k')

        self.learning_algorithm = 'drl'
        if utils.Settings.config.has_option('dqnpolicy', 'learning_algorithm'):
            self.learning_algorithm = utils.Settings.config.get(
                'dqnpolicy', 'learning_algorithm')
            logger.info('Learning algorithm: ' + self.learning_algorithm)

        self.minibatch_size = 32
        if utils.Settings.config.has_option('dqnpolicy', 'minibatch_size'):
            self.minibatch_size = utils.Settings.config.getint(
                'dqnpolicy', 'minibatch_size')

        self.capacity = 1000
        if utils.Settings.config.has_option('dqnpolicy', 'capacity'):
            self.capacity = utils.Settings.config.getint(
                'dqnpolicy', 'capacity')

        self.replay_type = 'vanilla'
        if utils.Settings.config.has_option('dqnpolicy', 'replay_type'):
            self.replay_type = utils.Settings.config.get(
                'dqnpolicy', 'replay_type')

        self.architecture = 'vanilla'
        if utils.Settings.config.has_option('dqnpolicy', 'architecture'):
            self.architecture = utils.Settings.config.get(
                'dqnpolicy', 'architecture')
            if self.architecture == 'dip':
                self.architecture = 'dip2'

        self.q_update = 'single'
        if utils.Settings.config.has_option('dqnpolicy', 'q_update'):
            self.q_update = utils.Settings.config.get('dqnpolicy', 'q_update')

        self.h1_size = 130
        if utils.Settings.config.has_option('dqnpolicy', 'h1_size'):
            self.h1_size = utils.Settings.config.getint('dqnpolicy', 'h1_size')

        self.h2_size = 130
        if utils.Settings.config.has_option('dqnpolicy', 'h2_size'):
            self.h2_size = utils.Settings.config.getint('dqnpolicy', 'h2_size')

        self.training_frequency = 2
        if utils.Settings.config.has_option('dqnpolicy', 'training_frequency'):
            self.training_frequency = utils.Settings.config.getint(
                'dqnpolicy', 'training_frequency')

        # domain specific parameter settings (overrides general policy parameter settings)
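        # Illustrative only: a hypothetical per-domain override section, e.g.
        #
        #   [dqnpolicy_CamRestaurants]
        #   h1_size = 300
        #   epsilon_start = 0.3
        #   replay_type = prioritized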
        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'n_in'):
            self.n_in = utils.Settings.config.getint(
                'dqnpolicy_' + domainString, 'n_in')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'learning_rate'):
            self.learning_rate = utils.Settings.config.getfloat(
                'dqnpolicy_' + domainString, 'learning_rate')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'tau'):
            self.tau = utils.Settings.config.getfloat(
                'dqnpolicy_' + domainString, 'tau')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'gamma'):
            self.gamma = utils.Settings.config.getfloat(
                'dqnpolicy_' + domainString, 'gamma')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'regularisation'):
            self.regularisation = utils.Settings.config.get(
                'dqnpolicy_' + domainString, 'regularisation')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'exploration_type'):
            self.exploration_type = utils.Settings.config.get(
                'dqnpolicy_' + domainString, 'exploration_type')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'episodeNum'):
            self.episodeNum = utils.Settings.config.getfloat(
                'dqnpolicy_' + domainString, 'episodeNum')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'maxiter'):
            self.maxiter = utils.Settings.config.getfloat(
                'dqnpolicy_' + domainString, 'maxiter')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'epsilon'):
            self.epsilon = utils.Settings.config.getfloat(
                'dqnpolicy_' + domainString, 'epsilon')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'epsilon_start'):
            self.epsilon_start = utils.Settings.config.getfloat(
                'dqnpolicy_' + domainString, 'epsilon_start')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'epsilon_end'):
            self.epsilon_end = utils.Settings.config.getfloat(
                'dqnpolicy_' + domainString, 'epsilon_end')

        if utils.Settings.config.has_option('policy_' + domainString,
                                            'save_step'):
            self.save_step = utils.Settings.config.getint(
                'policy_' + domainString, 'save_step')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'prior_sample_prob_start'):
            self.priorProbStart = utils.Settings.config.getfloat(
                'dqnpolicy_' + domainString, 'prior_sample_prob_start')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'prior_sample_prob_end'):
            self.priorProbEnd = utils.Settings.config.getfloat(
                'dqnpolicy_' + domainString, 'prior_sample_prob_end')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'features'):
            logger.info('Features: ' + str(
                utils.Settings.config.get('dqnpolicy_' +
                                          domainString, 'features')))
            self.policyfeatures = json.loads(
                utils.Settings.config.get('dqnpolicy_' + domainString,
                                          'features'))

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'max_k'):
            self.max_k = utils.Settings.config.getint(
                'dqnpolicy_' + domainString, 'max_k')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'learning_algorithm'):
            self.learning_algorithm = utils.Settings.config.get(
                'dqnpolicy_' + domainString, 'learning_algorithm')
            logger.info('Learning algorithm: ' + self.learning_algorithm)

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'minibatch_size'):
            self.minibatch_size = utils.Settings.config.getint(
                'dqnpolicy_' + domainString, 'minibatch_size')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'capacity'):
            self.capacity = utils.Settings.config.getint(
                'dqnpolicy_' + domainString, 'capacity')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'replay_type'):
            self.replay_type = utils.Settings.config.get(
                'dqnpolicy_' + domainString, 'replay_type')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'architecture'):
            self.architecture = utils.Settings.config.get(
                'dqnpolicy_' + domainString, 'architecture')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'q_update'):
            self.q_update = utils.Settings.config.get(
                'dqnpolicy_' + domainString, 'q_update')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'h1_size'):
            self.h1_size = utils.Settings.config.getint(
                'dqnpolicy_' + domainString, 'h1_size')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'h2_size'):
            self.h2_size = utils.Settings.config.getint(
                'dqnpolicy_' + domainString, 'h2_size')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'training_frequency'):
            self.training_frequency = utils.Settings.config.getint(
                'dqnpolicy_' + domainString, 'training_frequency')
        """
        self.shuffle = False
        if cfg.has_option('dqnpolicy_'+domainString, 'experience_replay'):
            self.shuffle = cfg.getboolean('dqnpolicy_'+domainString, 'experience_replay')
        if not self.shuffle:
            # If we don't use experience replay, we don't need to maintain
            # sliding window of experiences with maximum capacity.
            # We only need to maintain the data of minibatch_size
            self.capacity = self.minibatch_size
        """

        self.episode_ave_max_q = []
        self.curiositypred_loss = []

        #os.environ["CUDA_VISIBLE_DEVICES"] = ""
        policytype = 'dqn'
        self.dropout_rate = 0.
        if utils.Settings.config.has_option('dqnpolicy', 'dropout_rate'):
            self.dropout_rate = utils.Settings.config.getfloat(
                'dqnpolicy', 'dropout_rate')
        if utils.Settings.config.has_option('policy', 'policytype'):
            policytype = utils.Settings.config.get('policy', 'policytype')
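        # When this policy is wrapped by a feudal policy, the wrapper presumably
        # manages the TF session and networks, so they are not built here.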
        if policytype != 'feudal':
            self.sess = tf.Session()

            with tf.device("/cpu:0"):

                np.random.seed(self.randomseed)
                tf.set_random_seed(self.randomseed)
                # initialise a replay buffer
                if self.replay_type == 'vanilla':
                    self.episodes[self.domainString] = ReplayBuffer(
                        self.capacity, self.minibatch_size, self.randomseed)
                elif self.replay_type == 'prioritized':
                    self.episodes[self.domainString] = ReplayPrioritised(
                        self.capacity, self.minibatch_size, self.randomseed)
                self.samplecount = 0
                self.episodecount = 0

                # construct the models
                self.state_dim = self.n_in
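                # The 'dip2' architecture works on a fixed-size DIP feature
                # vector (89 dimensions here) rather than the full belief state.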
                if self.architecture == 'dip2':
                    self.state_dim = 89
                self.summaryaction = SummaryAction.SummaryAction(domainString)
                if action_names is None:
                    self.action_names = self.summaryaction.action_names
                else:
                    self.action_names = action_names
                self.action_dim = len(self.action_names)
                action_bound = len(self.action_names)
                self.stats = [0 for _ in range(self.action_dim)]

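                # Main Q-network plus a target network; tau is the soft-update
                # rate used when the target network is synchronised.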
                self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, \
                                            self.learning_rate, self.tau, action_bound, self.minibatch_size,
                                            self.architecture, self.h1_size,
                                            self.h2_size, dropout_rate=self.dropout_rate)

                #self.curiosityFunctions = scme(self.sess, self.state_dim, self.action_dim, self.randomseed)

                # when all models are defined, init all variables
                init_op = tf.global_variables_initializer()
                self.sess.run(init_op)

                self.loadPolicy(self.in_policy_file)
                print 'loaded replay size: ', self.episodes[
                    self.domainString].size()

                #improvement==================================
                # initialise the chosen intrinsic-reward model, if one is configured
                if self.intrinsic_reward_method == 'vime':
                    self.vime_model = vime(self.state_dim, self.action_dim)
                    self.vime_model.load_model('model/vime_model/' +
                                               self.in_policy_file)

                elif self.intrinsic_reward_method == 'cme':
                    self.cme_model = cme(self.state_dim, self.action_dim)
                    self.cme_model.load_model('model/cme_model/' +
                                              self.in_policy_file)

                elif self.intrinsic_reward_method == 'scme':
                    self.scme_model = scme(self.state_dim, self.action_dim)
                    self.scme_model.load_model('model/scme_model/' +
                                               self.in_policy_file)
                #improvement==================================

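                # Synchronise the target network with the freshly initialised
                # (or loaded) main network.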
                self.dqn.update_target_network()