Example No. 1
    def __init__(self, args):
        """ Agent's Parameters """
        self.n = args.n
        self.sigma = args.sigma
        self.beta = args.beta
        self.number_of_tilings = args.tilings
        self.alpha = np.float64(args.alpha) / self.number_of_tilings
        """ Experiment Configuration """
        self.config = Config()
        self.summary = {}  # self.summary will contain the keys: return_per_episode, steps_per_episode
        self.config.save_summary = True

        " Environment Parameters  "
        self.config.max_actions = 100000
        self.config.num_actions = 3  # Number of actions in Mountain Car
        self.config.obs_dims = [2]  # Dimensions of the observations experienced by the agent

        " TileCoder Parameters "
        self.config.num_tilings = self.number_of_tilings
        self.config.tiling_side_length = 8
        self.config.num_dims = 2
        self.config.alpha = self.alpha
        # Mountain Car state ranges: position in [-1.2, 0.5], velocity in [-0.07, 0.07]
        self.config.state_space_range = np.array([0.5 + 1.2, 0.07 + 0.07],
                                                 dtype=np.float64)

        " Policies Parameters "
        self.config.target_policy = Config()
        self.config.target_policy.initial_epsilon = 0.1
        self.config.target_policy.anneal_epsilon = False
        self.config.target_policy.annealing_period = 0
        self.config.target_policy.final_epsilon = 0.1
        self.config.anneal_steps_count = 0

        " QSigma Agent "
        self.config.n = self.n
        self.config.gamma = 1
        self.config.beta = self.beta
        self.config.sigma = self.sigma
        self.config.use_er_buffer = False
        self.config.initial_rand_steps = 0
        self.config.rand_steps_count = 0

        " Environment "
        self.env = MountainCliff(config=self.config, summary=self.summary)
        """ Policies """
        self.target_policy = EpsilonGreedyPolicy(self.config,
                                                 behaviour_policy=False)
        """ TileCoder """
        self.function_approximator = TileCoderFA(self.config)
        """ RL Agent """
        self.agent = QSigma(function_approximator=self.function_approximator,
                            target_policy=self.target_policy,
                            behaviour_policy=self.target_policy,
                            environment=self.env,
                            config=self.config,
                            summary=self.summary)
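
The configuration above mostly parameterizes the n-step Q(sigma) return that the QSigma agent estimates with the tile coder: n is the backup length, sigma interpolates between tree-backup and Sarsa-style updates, gamma is the discount, and alpha is the per-tiling step size. As a point of reference, a minimal sketch of that return computed backwards over a stored n-step segment follows; the function name and array layout are illustrative rather than this repository's API, and beta (which here appears to control how sigma decays over time) is omitted.

import numpy as np

def n_step_qsigma_return(rewards, q_sa, v_bar, pi_a, rho, sigma, gamma):
    """Backward pass for the n-step Q(sigma) return (De Asis et al., 2018).

    All arrays have length n, and index k refers to step t + k + 1:
      rewards[k] = R_{t+k+1}
      q_sa[k]    = Q(S_{t+k+1}, A_{t+k+1})
      v_bar[k]   = sum_a pi(a | S_{t+k+1}) * Q(S_{t+k+1}, a)
      pi_a[k]    = pi(A_{t+k+1} | S_{t+k+1})
      rho[k]     = importance-sampling ratio pi / b for the taken action
    sigma = 0 recovers tree backup; sigma = 1 recovers importance-sampled Sarsa.
    """
    G = q_sa[-1]  # bootstrap: G_{t+n:t+n} = Q(S_{t+n}, A_{t+n})
    for k in range(len(rewards) - 1, -1, -1):
        weight = sigma * rho[k] + (1.0 - sigma) * pi_a[k]
        G = rewards[k] + gamma * (weight * (G - q_sa[k]) + v_bar[k])
    return G

# On-policy check with n = 3, sigma = 0.5, gamma = 1 (rho = 1 everywhere):
G = n_step_qsigma_return(rewards=np.full(3, -1.0), q_sa=np.array([-8.0, -7.5, -7.0]),
                         v_bar=np.array([-8.2, -7.6, -7.1]), pi_a=np.full(3, 0.95),
                         rho=np.ones(3), sigma=0.5, gamma=1.0)
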
Example No. 2
    def setUp(self):
        config = Config()
        homepath = "/home/jfernando/"
        self.games_directory = homepath + "PycharmProjects/RL_Experiments/Experiments_Engine/Environments/Arcade_Learning_Environment/Supported_Roms/"
        self.rom_name = "seaquest.bin"
        self.summary = {}
        config.save_summary = True

        """ Environment Parameters """
        config.display_screen = False
        config.frame_skip = 4
        config.agent_render = False
        config.repeat_action_probability = 0.25
        config.frame_stack = 4

        config.num_actions = 18     # Number of actions in the ALE
        config.obs_dims = [config.frame_stack, 84, 84]  # [stack_size, height, width]

        " Models Parameters "
        config.dim_out = [32, 64, 64, 512]
        config.filter_dims = [8, 4, 3]
        config.strides = [4, 2, 1]
        config.gate_fun = tf.nn.relu
        config.conv_layers = 3
        config.full_layers = 1
        config.max_pool = False
        config.frames_format = "NHWC"   # NCHW doesn't work on CPU in TensorFlow

        " Policies Parameters "
        " Target Policy "
        config.target_policy = Config()
        config.target_policy.initial_epsilon = 0.1
        config.target_policy.anneal_epsilon = False
        " Behaviour Policy "
        config.behaviour_policy = Config()
        config.behaviour_policy.initial_epsilon = 0.2
        config.behaviour_policy.anneal_epsilon = True
        config.behaviour_policy.final_epsilon = 0.1
        config.behaviour_policy.annealing_period = 100

        " Experience Replay Buffer Parameters "
        config.buff_sz = 100000
        config.batch_sz = 32
        config.env_state_dims = (84, 84)    # Dimensions of a frame
        config.reward_clipping = True

        " QSigma Agent Parameters "
        config.n = 3
        config.gamma = 0.99
        config.beta = 1.0
        config.sigma = 0.5
        config.use_er_buffer = True
        config.initial_rand_steps = 50
        config.rand_steps_count = 0

        " Neural Network "
        config.alpha = 0.00025
        config.tnetwork_update_freq = 10000

        " Agent's Parameters "
        self.n = 3
        self.gamma = 0.99

        " Environment "
        self.env = ALE_Environment(config=config, games_directory=self.games_directory, rom_filename=self.rom_name,
                                   summary=self.summary)

        self.target_network = Model_nCPmFO(config=config, name="target")
        self.update_network = Model_nCPmFO(config=config, name="update")

        """ Target Policy """
        self.target_policy = EpsilonGreedyPolicy(config, behaviour_policy=False)

        """ Behaviour Policy """
        self.behavior_policy = EpsilonGreedyPolicy(config, behaviour_policy=True)

        """ Return Function """
        return_function = OffPolicyQSigmaReturnFunction(config=config, tpolicy=self.target_policy)

        """ Experience Replay Buffer """
        er_buffer = OffPolicyQSigmaExperienceReplayBuffer(config, return_function=return_function)

        """ Neural Network """
        alpha = 0.00025
        tnetwork_update_freq = 10000
        batch_size = 32
        optimizer = lambda lr: tf.train.RMSPropOptimizer(learning_rate=lr, decay=0.95, epsilon=0.01, momentum=0.95)
        tf_sess = tf.Session()
        self.function_approximator = NeuralNetwork_wER_FA(optimizer=optimizer, target_network=self.target_network,
                                                          update_network=self.update_network, er_buffer=er_buffer,
                                                          tf_session=tf_sess, config=config, summary=self.summary)

        """ RL Agent """
        self.agent = QSigma(environment=self.env, function_approximator=self.function_approximator,
                            target_policy=self.target_policy, behaviour_policy=self.behavior_policy,
                            er_buffer=er_buffer, config=config, summary=self.summary)

        variables = self.target_network.get_variables_as_list(tf_session=tf_sess)
        total_parameters = sum(np.array(variable).size for variable in variables)
        print("The total number of parameters in the network is:", total_parameters)
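
In this example the behaviour policy anneals epsilon from 0.2 down to 0.1 over 100 steps while the target policy keeps a fixed epsilon of 0.1. A linear schedule consistent with those keys is sketched below; the exact schedule inside EpsilonGreedyPolicy is not reproduced here, so treat the helper as an assumption.

import numpy as np

def annealed_epsilon(step, initial_epsilon=0.2, final_epsilon=0.1,
                     annealing_period=100, anneal=True):
    """Linear anneal from initial_epsilon to final_epsilon over annealing_period steps.

    Mirrors the behaviour-policy keys above; the real EpsilonGreedyPolicy may differ.
    """
    if not anneal:
        return initial_epsilon
    fraction = min(step / float(annealing_period), 1.0)
    return initial_epsilon + fraction * (final_epsilon - initial_epsilon)

def epsilon_greedy_action(q_values, epsilon, rng=np.random):
    """Greedy action with probability 1 - epsilon, otherwise a uniformly random one."""
    if rng.random_sample() < epsilon:
        return rng.randint(len(q_values))
    return int(np.argmax(q_values))
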
Example No. 3
    def __init__(self,
                 experiment_parameters,
                 restore=False,
                 restore_data_dir=""):
        self.tf_sess = tf.Session()
        self.optimizer = lambda lr: tf.train.RMSPropOptimizer(learning_rate=lr,
                                                              decay=0.95,
                                                              epsilon=0.01,
                                                              momentum=0.95,
                                                              centered=True)
        """ Agent's Parameters """
        self.n = experiment_parameters["n"]
        self.sigma = experiment_parameters["sigma"]
        ###### sigma decay parameters ######
        self.beta = experiment_parameters["beta"]
        self.decay_type = experiment_parameters["decay_type"]
        self.decay_freq = experiment_parameters["decay_freq"]
        self.sigma_min = experiment_parameters['sigma_min']
        ####################################
        self.target_epsilon = experiment_parameters['target_epsilon']
        self.compute_bprobabilities = experiment_parameters[
            'compute_bprobabilities']
        self.anneal_epsilon = experiment_parameters['anneal_epsilon']
        self.store_sigma = experiment_parameters['store_sigma']
        self.tnetwork_update_freq = experiment_parameters[
            'tnetwork_update_freq']

        if restore:
            with open(os.path.join(restore_data_dir, 'experiment_config.p'),
                      mode='rb') as experiment_config_file:
                self.config = pickle.load(experiment_config_file)
            with open(os.path.join(restore_data_dir, "summary.p"),
                      mode='rb') as summary_file:
                self.summary = pickle.load(summary_file)
        else:
            """ Experiment Configuration """
            self.config = Config()
            self.summary = {}
            self.config.save_summary = True

            " Environment Parameters  "
            self.config.max_actions = 5000
            self.config.num_actions = 3  # Number of actions in Mountain Car
            self.config.obs_dims = [2]  # Dimensions of the observations experienced by the agent

            " Model Parameters "
            self.config.dim_out = [1000]
            self.config.gate_fun = tf.nn.relu
            self.config.full_layers = 1

            " Neural Network Parameters "
            self.config.alpha = 0.00025
            self.config.batch_sz = 32
            self.config.tnetwork_update_freq = self.tnetwork_update_freq  # Default: 0.05 * buff_sz = 1,000

            " Experience Replay Buffer Parameters "
            self.config.buff_sz = 20000
            self.config.env_state_dims = [2]  # Dimensions of the environment's states
            self.config.obs_dtype = np.float32
            ###### sigma decay parameters ######
            self.config.sigma_decay = self.beta
            self.config.decay_type = self.decay_type
            self.config.decay_freq = self.decay_freq
            self.config.sigma_min = self.sigma_min
            ####################################
            self.config.sigma = self.sigma
            self.config.store_bprobs = not self.compute_bprobabilities
            self.config.store_sigma = self.store_sigma
            self.config.store_return = not self.anneal_epsilon

            " Policies Parameters "
            self.config.target_policy = Config()
            self.config.target_policy.initial_epsilon = self.target_epsilon
            self.config.target_policy.anneal_epsilon = False
            self.config.behaviour_policy = Config()
            if self.anneal_epsilon:
                self.config.behaviour_policy.initial_epsilon = 1
                self.config.behaviour_policy.final_epsilon = 0.1
                self.config.behaviour_policy.anneal_epsilon = True
                self.config.behaviour_policy.annealing_period = 20000  # Buffer size
            else:
                self.config.behaviour_policy.initial_epsilon = 0.1
                self.config.behaviour_policy.anneal_epsilon = False
                self.config.behaviour_policy.annealing_period = 20000  # Buffer size

            " QSigma Agent "
            self.config.n = self.n
            self.config.gamma = 1
            self.config.use_er_buffer = True
            self.config.initial_rand_steps = 1000  # 0.05 * buffer_size

            " QSigma Return Function "
            self.config.compute_bprobs = self.compute_bprobabilities
            self.config.onpolicy = self.compute_bprobabilities and not self.anneal_epsilon

        self.config.rand_steps_count = 0

        " Environment "
        self.env = MountainCar(config=self.config, summary=self.summary)

        " Models "
        self.tnetwork = Model_mFO(config=self.config, name='target')
        self.unetwork = Model_mFO(config=self.config, name='update')
        """ Policies """
        self.target_policy = EpsilonGreedyPolicy(self.config,
                                                 behaviour_policy=False)
        self.behaviour_policy = EpsilonGreedyPolicy(self.config,
                                                    behaviour_policy=True)
        """ QSigma Return Function """
        self.rl_return_fun = QSigmaReturnFunction(
            config=self.config,
            tpolicy=self.target_policy,
            bpolicy=self.behaviour_policy)
        """ QSigma Replay Buffer """
        self.qsigma_erp = QSigmaExperienceReplayBuffer(
            config=self.config, return_function=self.rl_return_fun)
        """ Neural Network """
        self.function_approximator = NeuralNetwork_wER_FA(
            optimizer=self.optimizer,
            target_network=self.tnetwork,
            update_network=self.unetwork,
            er_buffer=self.qsigma_erp,
            tf_session=self.tf_sess,
            config=self.config,
            summary=self.summary)
        """ RL Agent """
        self.agent = QSigma(function_approximator=self.function_approximator,
                            target_policy=self.target_policy,
                            behaviour_policy=self.behaviour_policy,
                            environment=self.env,
                            er_buffer=self.qsigma_erp,
                            config=self.config,
                            summary=self.summary)

        # number_of_parameters = 0
        # for variable in self.tnetwork.get_variables_as_list(self.tf_sess):
        #     number_of_parameters += np.array(variable).flatten().size
        # print("The number of parameters in the network is:", number_of_parameters)  # Answer: 6003

        if restore:
            saver = tf.train.Saver()
            sourcepath = os.path.join(restore_data_dir, "agent_graph.ckpt")
            saver.restore(self.tf_sess, sourcepath)
            print("Model restored from file: %s" % sourcepath)
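
Example No. 3 adds a sigma-decay schedule on top of the replay-buffer agent through sigma_decay (beta), decay_type, decay_freq and sigma_min. One plausible reading of those keys, decaying sigma once every decay_freq updates and flooring it at sigma_min, is sketched below; it is an assumption about the intended behaviour rather than the repository's actual implementation, and the decay_type names are hypothetical.

def decayed_sigma(initial_sigma, beta, decay_type, decay_freq, sigma_min, step_count):
    """Illustrative sigma schedule consistent with the config keys above.

    Applies one decay every `decay_freq` steps: multiplicative for
    'exponential', fixed subtraction for 'linear'; never drops below sigma_min.
    """
    num_decays = step_count // decay_freq
    if decay_type == "exponential":
        sigma = initial_sigma * (beta ** num_decays)
    elif decay_type == "linear":
        sigma = initial_sigma - beta * num_decays
    else:
        sigma = initial_sigma
    return max(sigma, sigma_min)

# e.g. beta = 0.95, decay_freq = 1000: sigma halves roughly every 13,500 steps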