Example #1
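All five snippets below are excerpts from TF1-era experiment scripts and omit their imports. A minimal sketch of what they assume (the project-specific class names appear in the snippets themselves, but their module paths are not shown in the source and are left as comments rather than guessed):

import os
import pickle
import numpy as np
import tensorflow as tf
# Project-specific classes used below (module paths not shown in the source):
# Config, ALE_Environment, MountainCar, Model_nCPmFO, Model_mFO,
# EpsilonGreedyPolicy, QSigmaReturnFunction, OffPolicyQSigmaReturnFunction,
# QSigmaExperienceReplayBuffer, OffPolicyQSigmaExperienceReplayBuffer,
# NeuralNetwork_wER_FA, QSigma, ReplayBufferAgent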
    def setUp(self):
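        # Test fixture: wires together the full Q(sigma) pipeline on the ALE
        # Seaquest ROM -- environment, target and update networks, epsilon-greedy
        # target/behaviour policies, off-policy return function, experience
        # replay buffer, neural-network function approximator, and QSigma agent.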
        config = Config()
        homepath = "/home/jfernando/"
        self.games_directory = homepath + "PycharmProjects/RL_Experiments/Experiments_Engine/Environments/Arcade_Learning_Environment/Supported_Roms/"
        self.rom_name = "seaquest.bin"
        self.summary = {}
        config.save_summary = True

        """ Environment Parameters """
        config.display_screen = False
        config.frame_skip = 4
        config.agent_render = False
        config.repeat_action_probability = 0.25
        config.frame_stack = 4

        config.num_actions = 18     # Number of actions in the ALE
        config.obs_dims = [config.frame_stack, 84, 84]  # [stack_size, height, width]

        " Models Parameters "
        config.dim_out = [32, 64, 64, 512]
        config.filter_dims = [8, 4, 3]
        config.strides = [4, 2, 1]
        config.gate_fun = tf.nn.relu
        config.conv_layers = 3
        config.full_layers = 1
        config.max_pool = False
        config.frames_format = "NHWC"   # NCHW doesn't work on CPU in TensorFlow

        " Policies Parameters "
        " Target Policy "
        config.target_policy = Config()
        config.target_policy.initial_epsilon = 0.1
        config.target_policy.anneal_epsilon = False
        " Behaviour Policy "
        config.behaviour_policy = Config()
        config.behaviour_policy.initial_epsilon = 0.2
        config.behaviour_policy.anneal_epsilon = True
        config.behaviour_policy.final_epsilon = 0.1
        config.behaviour_policy.annealing_period = 100

        " Experience Replay Buffer Parameters "
        config.buff_sz = 100000
        config.batch_sz = 32
        config.env_state_dims = (84, 84)    # Dimensions of a frame
        config.reward_clipping = True

        " QSigma Agent Parameters "
        config.n = 3
        config.gamma = 0.99
        config.beta = 1.0
        config.sigma = 0.5
        config.use_er_buffer = True
        config.initial_rand_steps = 50
        config.rand_steps_count = 0

        " Neural Network "
        config.alpha = 0.00025
        config.tnetwork_update_freq = 10000

        " Agent's Parameters "
        self.n = 3
        self.gamma = 0.99

        " Environment "
        self.env = ALE_Environment(config=config, games_directory=self.games_directory, rom_filename=self.rom_name,
                                   summary=self.summary)

        self.target_network = Model_nCPmFO(config=config, name="target")
        self.update_network = Model_nCPmFO(config=config, name="update")

        """ Target Policy """
        self.target_policy = EpsilonGreedyPolicy(config, behaviour_policy=False)

        """ Behaviour Policy """
        self.behaviour_policy = EpsilonGreedyPolicy(config, behaviour_policy=True)

        """ Return Function """
        return_function = OffPolicyQSigmaReturnFunction(config=config, tpolicy=self.target_policy)

        """ Experience Replay Buffer """
        er_buffer = OffPolicyQSigmaExperienceReplayBuffer(config, return_function=return_function)

        """ Neural Network """
        alpha = 0.00025
        tnetwork_update_freq = 10000
        batch_size = 32
        optimizer = lambda lr: tf.train.RMSPropOptimizer(learning_rate=lr, decay=0.95, epsilon=0.01, momentum=0.95)
        tf_sess = tf.Session()
        self.function_approximator = NeuralNetwork_wER_FA(optimizer=optimizer, target_network=self.target_network,
                                                          update_network=self.update_network, er_buffer=er_buffer,
                                                          tf_session=tf_sess, config=config, summary=self.summary)

        """ RL Agent """
        self.agent = QSigma(environment=self.env, function_approximator=self.function_approximator,
                            target_policy=self.target_policy, behaviour_policy=self.behaviour_policy,
                            er_buffer=er_buffer, config=config, summary=self.summary)

        network_variables = self.target_network.get_variables_as_list(tf_session=tf_sess)
        total_parameters = 0
        for variable in network_variables:
            total_parameters += np.array(variable).size
        print("The total number of parameters in the network is:", total_parameters)
Example #2
    def __init__(self,
                 experiment_parameters,
                 restore=False,
                 restore_data_dir=""):
        self.tf_sess = tf.Session()
        self.optimizer = lambda lr: tf.train.RMSPropOptimizer(learning_rate=lr,
                                                              decay=0.95,
                                                              epsilon=0.01,
                                                              momentum=0.95,
                                                              centered=True)
        """ Agent's Parameters """
        self.n = experiment_parameters["n"]
        self.sigma = experiment_parameters["sigma"]
        ###### sigma decay parameters ######
        self.beta = experiment_parameters["beta"]
        self.decay_type = experiment_parameters["decay_type"]
        self.decay_freq = experiment_parameters["decay_freq"]
        self.sigma_min = experiment_parameters['sigma_min']
        ####################################
        self.target_epsilon = experiment_parameters['target_epsilon']
        self.compute_bprobabilities = experiment_parameters[
            'compute_bprobabilities']
        self.anneal_epsilon = experiment_parameters['anneal_epsilon']
        self.store_sigma = experiment_parameters['store_sigma']
        self.tnetwork_update_freq = experiment_parameters[
            'tnetwork_update_freq']

        if restore:
            with open(os.path.join(restore_data_dir, 'experiment_config.p'),
                      mode='rb') as experiment_config_file:
                self.config = pickle.load(experiment_config_file)
            with open(os.path.join(restore_data_dir, "summary.p"),
                      mode='rb') as summary_file:
                self.summary = pickle.load(summary_file)
        else:
            """ Experiment Configuration """
            self.config = Config()
            self.summary = {}
            self.config.save_summary = True

            " Environment Parameters  "
            self.config.max_actions = 5000
            self.config.num_actions = 3  # Number of actions in Mountain Car
            self.config.obs_dims = [
                2
            ]  # Dimensions of the observations experienced by the agent

            " Model Parameters "
            self.config.dim_out = [1000]
            self.config.gate_fun = tf.nn.relu
            self.config.full_layers = 1

            " Neural Network Parameters "
            self.config.alpha = 0.00025
            self.config.batch_sz = 32
            self.config.tnetwork_update_freq = self.tnetwork_update_freq  # Default: 0.05 * buff_sz = 1,000

            " Experience Replay Buffer Parameters "
            self.config.buff_sz = 20000
            self.config.env_state_dims = [
                2
            ]  # Dimensions of the environment's states
            self.config.obs_dtype = np.float32
            ###### sigma decay parameters ######
            self.config.sigma_decay = self.beta
            self.config.decay_type = self.decay_type
            self.config.decay_freq = self.decay_freq
            self.config.sigma_min = self.sigma_min
            ####################################
            self.config.sigma = self.sigma
            self.config.store_bprobs = not self.compute_bprobabilities
            self.config.store_sigma = self.store_sigma
            self.config.store_return = not self.anneal_epsilon

            " Policies Parameters "
            self.config.target_policy = Config()
            self.config.target_policy.initial_epsilon = self.target_epsilon
            self.config.target_policy.anneal_epsilon = False
            self.config.behaviour_policy = Config()
            if self.anneal_epsilon:
                self.config.behaviour_policy.initial_epsilon = 1
                self.config.behaviour_policy.final_epsilon = 0.1
                self.config.behaviour_policy.anneal_epsilon = True
                self.config.behaviour_policy.annealing_period = 20000  # Buffer size
            else:
                self.config.behaviour_policy.initial_epsilon = 0.1
                self.config.behaviour_policy.anneal_epsilon = False
                self.config.behaviour_policy.annealing_period = 20000  # Buffer size

            " QSigma Agent "
            self.config.n = self.n
            self.config.gamma = 1
            self.config.use_er_buffer = True
            self.config.initial_rand_steps = 1000  # 0.05 * buffer_size

            " QSigma Return Function "
            self.config.compute_bprobs = self.compute_bprobabilities
            self.config.onpolicy = self.compute_bprobabilities and not self.anneal_epsilon

        self.config.rand_steps_count = 0

        " Environment "
        self.env = MountainCar(config=self.config, summary=self.summary)

        " Models "
        self.tnetwork = Model_mFO(config=self.config, name='target')
        self.unetwork = Model_mFO(config=self.config, name='update')
        """ Policies """
        self.target_policy = EpsilonGreedyPolicy(self.config,
                                                 behaviour_policy=False)
        self.behaviour_policy = EpsilonGreedyPolicy(self.config,
                                                    behaviour_policy=True)
        """ QSigma Return Function """
        self.rl_return_fun = QSigmaReturnFunction(
            config=self.config,
            tpolicy=self.target_policy,
            bpolicy=self.behaviour_policy)
        """ QSigma Replay Buffer """
        self.qsigma_erp = QSigmaExperienceReplayBuffer(
            config=self.config, return_function=self.rl_return_fun)
        """ Neural Network """
        self.function_approximator = NeuralNetwork_wER_FA(
            optimizer=self.optimizer,
            target_network=self.tnetwork,
            update_network=self.unetwork,
            er_buffer=self.qsigma_erp,
            tf_session=self.tf_sess,
            config=self.config,
            summary=self.summary)
        """ RL Agent """
        self.agent = QSigma(function_approximator=self.function_approximator,
                            target_policy=self.target_policy,
                            behaviour_policy=self.behaviour_policy,
                            environment=self.env,
                            er_buffer=self.qsigma_erp,
                            config=self.config,
                            summary=self.summary)

        # number_of_parameters = 0
        # for variable in self.tnetwork.get_variables_as_list(self.tf_sess):
        #     number_of_parameters += np.array(variable).flatten().size
        # print("The number of parameters in the network is:", number_of_parameters)  # Answer: 6003

        if restore:
            saver = tf.train.Saver()
            sourcepath = os.path.join(restore_data_dir, "agent_graph.ckpt")
            saver.restore(self.tf_sess, sourcepath)
            print("Model restored from file: %s" % sourcepath)
Example #3
    def __init__(self, experiment_arguments, dir_name):
        homepath = "/home/jfernando/"
        self.games_directory = homepath + "PycharmProjects/RL_Experiments/Experiments_Engine/Environments/Arcade_Learning_Environment/Supported_Roms/"
        self.rom_name = "seaquest.bin"

        self.optimizer = lambda lr: tf.train.RMSPropOptimizer(
            lr, decay=0.95, momentum=0.95, epsilon=0.01, centered=True)
        self.sess = tf.Session()
        if experiment_arguments.restore_agent:
            with open(os.path.join(dir_name, 'experiment_config.p'),
                      mode='rb') as experiment_config_file:
                self.config = pickle.load(experiment_config_file)
            with open(os.path.join(dir_name, 'summary.p'),
                      mode='rb') as summary_file:
                self.summary = pickle.load(summary_file)
        else:
            self.config = Config()
            self.summary = {
                'frames_per_episode': [],
                'return_per_episode': [],
                'cumulative_loss': [],
                'training_steps': []
            }
            self.config.save_summary = True
            """ Environment Parameters """
            self.config.display_screen = False
            self.config.frame_skip = 5
            self.config.agent_render = False
            self.config.repeat_action_probability = 0.25
            self.config.frame_stack = 4
            self.config.num_actions = 18  # Number of actions in the ALE
            self.config.obs_dims = [4, 84, 84]  # [stack_size, height, width]

            " Models Parameters "
            self.config.dim_out = [32, 64, 64, 512]
            self.config.filter_dims = [8, 4, 3]
            self.config.strides = [4, 2, 1]
            self.config.gate_fun = tf.nn.relu
            self.config.conv_layers = 3
            self.config.full_layers = 1
            self.config.max_pool = False
            # NCHW doesn't work on CPU in TensorFlow, but it's more efficient on a GPU
            self.config.frames_format = experiment_arguments.frame_format
            self.config.norm_factor = 255.0

            " Policies Parameters "
            " Target Policy "
            self.config.target_policy = Config()
            self.config.target_policy.initial_epsilon = 0.01
            self.config.target_policy.anneal_epsilon = False

            " Experience Replay Buffer Parameters "
            self.config.buff_sz = experiment_arguments.buffer_size
            self.config.batch_sz = 32
            self.config.env_state_dims = (84, 84)  # Dimensions of a frame
            self.config.reward_clipping = True
            self.config.obs_dtype = np.uint8
            self.config.sigma_decay = experiment_arguments.sigma_decay
            self.config.sigma = experiment_arguments.sigma
            self.config.store_bprobs = False
            self.config.store_sigma = True
            self.config.store_return = True

            " QSigma Return Parameters "
            self.config.n = experiment_arguments.n
            self.config.gamma = 0.99
            self.config.initial_rand_steps = 50000
            self.config.compute_bprobs = False
            self.config.onpolicy = True

            " Neural Network "
            self.config.alpha = 0.00025
            self.config.tnetwork_update_freq = 10000

        self.config.rand_steps_count = 0

        " Environment "
        self.env = ALE_Environment(games_directory=self.games_directory,
                                   summary=self.summary,
                                   rom_filename=self.rom_name,
                                   config=self.config)

        " Models "
        self.target_network = Model_nCPmFO(config=self.config, name='target')
        self.update_network = Model_nCPmFO(config=self.config, name='update')
        """ Policies """
        self.target_policy = EpsilonGreedyPolicy(self.config,
                                                 behaviour_policy=False)
        """ Return Function """
        self.return_function = QSigmaReturnFunction(config=self.config,
                                                    tpolicy=self.target_policy)
        """ Experience Replay Buffer """
        self.er_buffer = QSigmaExperienceReplayBuffer(
            config=self.config, return_function=self.return_function)
        """ Neural Network """
        self.function_approximator = NeuralNetwork_wER_FA(
            optimizer=self.optimizer,
            target_network=self.target_network,
            update_network=self.update_network,
            er_buffer=self.er_buffer,
            tf_session=self.sess,
            config=self.config,
            summary=self.summary)
        """ RL Agent """
        self.agent = ReplayBufferAgent(
            environment=self.env,
            function_approximator=self.function_approximator,
            behaviour_policy=self.target_policy,
            config=self.config,
            summary=self.summary,
            er_buffer=self.er_buffer)

        if experiment_arguments.restore_agent:
            saver = tf.train.Saver()
            sourcepath = os.path.join(dir_name, "agent_graph.ckpt")
            saver.restore(self.sess, sourcepath)
            print("Model restored from file: %s" % sourcepath)
Example #4
class ExperimentAgent():
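    # Q(sigma) agent with experience replay on Mountain Car: builds (or restores
    # from a checkpoint) the configuration, environment, value networks, policies,
    # return function, replay buffer and function approximator, and exposes
    # train/save/restore helpers.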
    def __init__(self,
                 experiment_parameters,
                 restore=False,
                 restore_data_dir=""):
        self.tf_sess = tf.Session()
        self.optimizer = lambda lr: tf.train.RMSPropOptimizer(learning_rate=lr,
                                                              decay=0.95,
                                                              epsilon=0.01,
                                                              momentum=0.95,
                                                              centered=True)
        """ Agent's Parameters """
        self.n = experiment_parameters["n"]
        self.sigma = experiment_parameters["sigma"]
        ###### sigma decay parameters ######
        self.beta = experiment_parameters["beta"]
        self.decay_type = experiment_parameters["decay_type"]
        self.decay_freq = experiment_parameters["decay_freq"]
        self.sigma_min = experiment_parameters['sigma_min']
        ####################################
        self.target_epsilon = experiment_parameters['target_epsilon']
        self.compute_bprobabilities = experiment_parameters[
            'compute_bprobabilities']
        self.anneal_epsilon = experiment_parameters['anneal_epsilon']
        self.store_sigma = experiment_parameters['store_sigma']
        self.tnetwork_update_freq = experiment_parameters[
            'tnetwork_update_freq']

        if restore:
            with open(os.path.join(restore_data_dir, 'experiment_config.p'),
                      mode='rb') as experiment_config_file:
                self.config = pickle.load(experiment_config_file)
            with open(os.path.join(restore_data_dir, "summary.p"),
                      mode='rb') as summary_file:
                self.summary = pickle.load(summary_file)
        else:
            """ Experiment Configuration """
            self.config = Config()
            self.summary = {}
            self.config.save_summary = True

            " Environment Parameters  "
            self.config.max_actions = 5000
            self.config.num_actions = 3  # Number of actions in Mountain Car
            self.config.obs_dims = [
                2
            ]  # Dimensions of the observations experienced by the agent

            " Model Parameters "
            self.config.dim_out = [1000]
            self.config.gate_fun = tf.nn.relu
            self.config.full_layers = 1

            " Neural Network Parameters "
            self.config.alpha = 0.00025
            self.config.batch_sz = 32
            self.config.tnetwork_update_freq = self.tnetwork_update_freq  # Default: 0.05 * buff_sz = 1,000

            " Experience Replay Buffer Parameters "
            self.config.buff_sz = 20000
            self.config.env_state_dims = [
                2
            ]  # Dimensions of the environment's states
            self.config.obs_dtype = np.float32
            ###### sigma decay parameters ######
            self.config.sigma_decay = self.beta
            self.config.decay_type = self.decay_type
            self.config.decay_freq = self.decay_freq
            self.config.sigma_min = self.sigma_min
            ####################################
            self.config.sigma = self.sigma
            self.config.store_bprobs = not self.compute_bprobabilities
            self.config.store_sigma = self.store_sigma
            self.config.store_return = not self.anneal_epsilon

            " Policies Parameters "
            self.config.target_policy = Config()
            self.config.target_policy.initial_epsilon = self.target_epsilon
            self.config.target_policy.anneal_epsilon = False
            self.config.behaviour_policy = Config()
            if self.anneal_epsilon:
                self.config.behaviour_policy.initial_epsilon = 1
                self.config.behaviour_policy.final_epsilon = 0.1
                self.config.behaviour_policy.anneal_epsilon = True
                self.config.behaviour_policy.annealing_period = 20000  # Buffer size
            else:
                self.config.behaviour_policy.initial_epsilon = 0.1
                self.config.behaviour_policy.anneal_epsilon = False
                self.config.behaviour_policy.annealing_period = 20000  # Buffer size

            " QSigma Agent "
            self.config.n = self.n
            self.config.gamma = 1
            self.config.use_er_buffer = True
            self.config.initial_rand_steps = 1000  # 0.05 * buffer_size

            " QSigma Return Function "
            self.config.compute_bprobs = self.compute_bprobabilities
            self.config.onpolicy = self.compute_bprobabilities and not self.anneal_epsilon

        self.config.rand_steps_count = 0

        " Environment "
        self.env = MountainCar(config=self.config, summary=self.summary)

        " Models "
        self.tnetwork = Model_mFO(config=self.config, name='target')
        self.unetwork = Model_mFO(config=self.config, name='update')
        """ Policies """
        self.target_policy = EpsilonGreedyPolicy(self.config,
                                                 behaviour_policy=False)
        self.behaviour_policy = EpsilonGreedyPolicy(self.config,
                                                    behaviour_policy=True)
        """ QSigma Return Function """
        self.rl_return_fun = QSigmaReturnFunction(
            config=self.config,
            tpolicy=self.target_policy,
            bpolicy=self.behaviour_policy)
        """ QSigma Replay Buffer """
        self.qsigma_erp = QSigmaExperienceReplayBuffer(
            config=self.config, return_function=self.rl_return_fun)
        """ Neural Network """
        self.function_approximator = NeuralNetwork_wER_FA(
            optimizer=self.optimizer,
            target_network=self.tnetwork,
            update_network=self.unetwork,
            er_buffer=self.qsigma_erp,
            tf_session=self.tf_sess,
            config=self.config,
            summary=self.summary)
        """ RL Agent """
        self.agent = QSigma(function_approximator=self.function_approximator,
                            target_policy=self.target_policy,
                            behaviour_policy=self.behaviour_policy,
                            environment=self.env,
                            er_buffer=self.qsigma_erp,
                            config=self.config,
                            summary=self.summary)

        # number_of_parameters = 0
        # for variable in self.tnetwork.get_variables_as_list(self.tf_sess):
        #     number_of_parameters += np.array(variable).flatten().size
        # print("The number of parameters in the network is:", number_of_parameters)  # Answer: 6003

        if restore:
            saver = tf.train.Saver()
            sourcepath = os.path.join(restore_data_dir, "agent_graph.ckpt")
            saver.restore(self.tf_sess, sourcepath)
            print("Model restored from file: %s" % sourcepath)

    def train(self):
        self.agent.train(num_episodes=1)
        self.function_approximator.store_in_summary()

    def get_number_of_frames(self):
        return np.sum(self.summary['steps_per_episode'])

    def get_episode_number(self):
        return len(self.summary['steps_per_episode'])

    def get_train_data(self):
        return_per_episode = self.summary['return_per_episode']
        nn_loss = self.summary['cumulative_loss']
        return return_per_episode, nn_loss

    def save_agent(self, dir_name):
        with open(os.path.join(dir_name, 'experiment_config.p'),
                  mode='wb') as experiment_config_file:
            pickle.dump(self.config, experiment_config_file)
        with open(os.path.join(dir_name, "summary.p"),
                  mode='wb') as summary_file:
            pickle.dump(self.summary, summary_file)
        saver = tf.train.Saver()
        save_path = saver.save(self.tf_sess,
                               os.path.join(dir_name, "agent_graph.ckpt"))
        print("Model saved in file: %s" % save_path)

    def save_results(self, dir_name):
        env_info = np.cumsum(self.summary['steps_per_episode'])
        return_per_episode = self.summary['return_per_episode']
        total_loss_per_episode = self.summary['cumulative_loss']
        results = {
            'return_per_episode': return_per_episode,
            'env_info': env_info,
            'total_loss_per_episode': total_loss_per_episode
        }
        with open(os.path.join(dir_name, 'results.p'),
                  mode="wb") as results_file:
            pickle.dump(results, results_file)

    def save_parameters(self, dir_name):
        txt_file_pathname = os.path.join(dir_name, "agent_parameters.txt")
        params_txt = open(txt_file_pathname, "w")
        params_txt.write("# Agent #\n")
        params_txt.write("\tn = " + str(self.config.n) + "\n")
        params_txt.write("\tgamma = " + str(self.config.gamma) + "\n")
        params_txt.write("\tsigma = " + str(self.config.sigma) + "\n")
        params_txt.write("\tbeta = " + str(self.config.sigma_decay) + "\n")
        params_txt.write("\trandom steps before training = " +
                         str(self.config.initial_rand_steps) + "\n")
        params_txt.write("\tcompute behaviour policy's probabilities = " +
                         str(self.config.compute_bprobs) + "\n")
        params_txt.write("\n")

        assert isinstance(self.target_policy, EpsilonGreedyPolicy)
        params_txt.write("# Target Policy #\n")
        params_txt.write("\tinitial epsilon = " +
                         str(self.config.target_policy.initial_epsilon) + "\n")
        params_txt.write("\tfinal epsilon = " +
                         str(self.config.target_policy.final_epsilon) + "\n")
        params_txt.write("\n")

        assert isinstance(self.behaviour_policy, EpsilonGreedyPolicy)
        params_txt.write("# Behaviour Policy #\n")
        params_txt.write("\tinitial epsilon = " +
                         str(self.config.behaviour_policy.initial_epsilon) +
                         "\n")
        params_txt.write("\tanneal epsilon = " +
                         str(self.config.behaviour_policy.anneal_epsilon) +
                         "\n")
        params_txt.write("\tfinal epsilon = " +
                         str(self.config.behaviour_policy.final_epsilon) +
                         "\n")
        params_txt.write("\tannealing period = " +
                         str(self.config.behaviour_policy.annealing_period) +
                         "\n")
        params_txt.write("\n")

        params_txt.write(
            "# Function Approximator: Neural Network with Experience Replay #\n"
        )
        params_txt.write("\talpha = " + str(self.config.alpha) + "\n")
        params_txt.write("\ttarget network update frequency = " +
                         str(self.config.tnetwork_update_freq) + "\n")
        params_txt.write("\tbatch size = " + str(self.config.batch_sz) + "\n")
        params_txt.write("\tbuffer size = " + str(self.config.buff_sz) + "\n")
        params_txt.write("\tfully connected layers = " +
                         str(self.config.full_layers) + "\n")
        params_txt.write("\toutput dimensions per layer = " +
                         str(self.config.dim_out) + "\n")
        params_txt.write("\tgate function = " + str(self.config.gate_fun) +
                         "\n")
        params_txt.write("\n")

        params_txt.write("\tstore_sigma = " + str(self.store_sigma))

        params_txt.close()
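A driver for this class might look like the sketch below. It is hypothetical, not the original experiment script; the parameter values and the output directory name are illustrative only.

if __name__ == "__main__":
    # Hypothetical driver for the ExperimentAgent class above.
    params = {"n": 3, "sigma": 0.5, "beta": 1.0, "decay_type": "exponential",
              "decay_freq": 1000, "sigma_min": 0.0, "target_epsilon": 0.1,
              "compute_bprobabilities": True, "anneal_epsilon": True,
              "store_sigma": False, "tnetwork_update_freq": 1000}
    results_dir = "mountain_car_results"   # hypothetical output directory
    os.makedirs(results_dir, exist_ok=True)
    experiment = ExperimentAgent(params, restore=False)
    for _ in range(500):                   # train() runs one episode per call
        experiment.train()
    print("episodes completed:", experiment.get_episode_number())
    experiment.save_parameters(results_dir)
    experiment.save_results(results_dir)
    experiment.save_agent(results_dir)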
Example #5
class ExperimentAgent:
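    # Q(sigma) agent with experience replay on the ALE Seaquest ROM. The return
    # is computed on-policy with a stored sigma, and the epsilon-greedy target
    # policy also serves as the behaviour policy.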
    def __init__(self, experiment_arguments, dir_name):
        homepath = "/home/jfernando/"
        self.games_directory = homepath + "PycharmProjects/RL_Experiments/Experiments_Engine/Environments/Arcade_Learning_Environment/Supported_Roms/"
        self.rom_name = "seaquest.bin"

        self.optimizer = lambda lr: tf.train.RMSPropOptimizer(
            lr, decay=0.95, momentum=0.95, epsilon=0.01, centered=True)
        self.sess = tf.Session()
        if experiment_arguments.restore_agent:
            with open(os.path.join(dir_name, 'experiment_config.p'),
                      mode='rb') as experiment_config_file:
                self.config = pickle.load(experiment_config_file)
            with open(os.path.join(dir_name, 'summary.p'),
                      mode='rb') as summary_file:
                self.summary = pickle.load(summary_file)
        else:
            self.config = Config()
            self.summary = {
                'frames_per_episode': [],
                'return_per_episode': [],
                'cumulative_loss': [],
                'training_steps': []
            }
            self.config.save_summary = True
            """ Environment Parameters """
            self.config.display_screen = False
            self.config.frame_skip = 5
            self.config.agent_render = False
            self.config.repeat_action_probability = 0.25
            self.config.frame_stack = 4
            self.config.num_actions = 18  # Number of actions in the ALE
            self.config.obs_dims = [4, 84, 84]  # [stack_size, height, width]

            " Models Parameters "
            self.config.dim_out = [32, 64, 64, 512]
            self.config.filter_dims = [8, 4, 3]
            self.config.strides = [4, 2, 1]
            self.config.gate_fun = tf.nn.relu
            self.config.conv_layers = 3
            self.config.full_layers = 1
            self.config.max_pool = False
            # NCHW doesn't work on CPU in TensorFlow, but it's more efficient on a GPU
            self.config.frames_format = experiment_arguments.frame_format
            self.config.norm_factor = 255.0

            " Policies Parameters "
            " Target Policy "
            self.config.target_policy = Config()
            self.config.target_policy.initial_epsilon = 0.01
            self.config.target_policy.anneal_epsilon = False

            " Experience Replay Buffer Parameters "
            self.config.buff_sz = experiment_arguments.buffer_size
            self.config.batch_sz = 32
            self.config.env_state_dims = (84, 84)  # Dimensions of a frame
            self.config.reward_clipping = True
            self.config.obs_dtype = np.uint8
            self.config.sigma_decay = experiment_arguments.sigma_decay
            self.config.sigma = experiment_arguments.sigma
            self.config.store_bprobs = False
            self.config.store_sigma = True
            self.config.store_return = True

            " QSigma Return Parameters "
            self.config.n = experiment_arguments.n
            self.config.gamma = 0.99
            self.config.initial_rand_steps = 50000
            self.config.compute_bprobs = False
            self.config.onpolicy = True

            " Neural Network "
            self.config.alpha = 0.00025
            self.config.tnetwork_update_freq = 10000

        self.config.rand_steps_count = 0

        " Environment "
        self.env = ALE_Environment(games_directory=self.games_directory,
                                   summary=self.summary,
                                   rom_filename=self.rom_name,
                                   config=self.config)

        " Models "
        self.target_network = Model_nCPmFO(config=self.config, name='target')
        self.update_network = Model_nCPmFO(config=self.config, name='update')
        """ Policies """
        self.target_policy = EpsilonGreedyPolicy(self.config,
                                                 behaviour_policy=False)
        """ Return Function """
        self.return_function = QSigmaReturnFunction(config=self.config,
                                                    tpolicy=self.target_policy)
        """ Experience Replay Buffer """
        self.er_buffer = QSigmaExperienceReplayBuffer(
            config=self.config, return_function=self.return_function)
        """ Neural Network """
        self.function_approximator = NeuralNetwork_wER_FA(
            optimizer=self.optimizer,
            target_network=self.target_network,
            update_network=self.update_network,
            er_buffer=self.er_buffer,
            tf_session=self.sess,
            config=self.config,
            summary=self.summary)
        """ RL Agent """
        self.agent = ReplayBufferAgent(
            environment=self.env,
            function_approximator=self.function_approximator,
            behaviour_policy=self.target_policy,
            config=self.config,
            summary=self.summary,
            er_buffer=self.er_buffer)

        if experiment_arguments.restore_agent:
            saver = tf.train.Saver()
            sourcepath = os.path.join(dir_name, "agent_graph.ckpt")
            saver.restore(self.sess, sourcepath)
            print("Model restored from file: %s" % sourcepath)

    def train(self):
        self.agent.train()
        self.function_approximator.store_in_summary()

    def save_agent(self, dir_name):
        # Note: the replay buffer is always discarded and reinitialized when the
        # agent is restored, so the agent again takes its initial random steps.
        with open(os.path.join(dir_name, 'experiment_config.p'),
                  mode='wb') as experiment_config_file:
            pickle.dump(self.config, experiment_config_file)
        saver = tf.train.Saver()
        save_path = saver.save(self.sess,
                               os.path.join(dir_name, "agent_graph.ckpt"))
        print("Model saved in file: %s" % save_path)

    def save_results(self, dir_name):
        with open(os.path.join(dir_name, "summary.p"),
                  mode='wb') as summary_file:
            pickle.dump(self.summary, summary_file)

    def get_training_data(self):
        return_per_episode = self.summary['return_per_episode']
        environment_data = np.cumsum(self.summary['frames_per_episode'])
        model_loss = self.summary['cumulative_loss']
        return return_per_episode, environment_data, model_loss

    def get_number_of_frames(self):
        return np.sum(self.summary['frames_per_episode'])

    def save_parameters(self, dir_name):
        txt_file_pathname = os.path.join(dir_name, "agent_parameters.txt")
        with open(txt_file_pathname, mode="w") as params_txt:
            params_txt.write("# Agent #\n")
            params_txt.write("\tn = " + str(self.return_function.n) + "\n")
            params_txt.write("\tgamma = " + str(self.return_function.gamma) +
                             "\n")
            params_txt.write("\tsigma = " + str(self.return_function.sigma) +
                             "\n")
            params_txt.write("\tbeta = " +
                             str(self.return_function.sigma_decay) + "\n")
            params_txt.write("\trandom steps before training = " +
                             str(self.agent.initial_rand_steps) + "\n")
            params_txt.write("\n")

            params_txt.write("# Target Policy #\n")
            params_txt.write("\tinitial epsilon = " +
                             str(self.target_policy.initial_epsilon) + "\n")
            params_txt.write("\tfinal epsilon = " +
                             str(self.target_policy.final_epsilon) + "\n")
            params_txt.write("\n")

            assert isinstance(self.er_buffer, QSigmaExperienceReplayBuffer)
            assert isinstance(self.function_approximator, NeuralNetwork_wER_FA)
            assert isinstance(self.target_network, Model_nCPmFO)
            assert isinstance(self.update_network, Model_nCPmFO)
            params_txt.write(
                "# Function Approximator: Neural Network with Experience Replay #\n"
            )
            params_txt.write("\talpha = " +
                             str(self.function_approximator.alpha) + "\n")
            params_txt.write(
                "\ttarget network update frequency = " +
                str(self.function_approximator.tnetwork_update_freq) + "\n")
            params_txt.write("\tbatch size = " + str(self.er_buffer.batch_sz) +
                             "\n")
            params_txt.write("\tbuffer size = " + str(self.er_buffer.buff_sz) +
                             "\n")
            params_txt.write("\n")
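A driver for the ALE agent might look like the following sketch. It is hypothetical, not the original experiment script; the argument values and the output directory name are illustrative only.

if __name__ == "__main__":
    from argparse import Namespace

    # Hypothetical driver for the ALE ExperimentAgent class above.
    args = Namespace(restore_agent=False, frame_format="NHWC",
                     buffer_size=100000, sigma_decay=1.0, sigma=0.5, n=3)
    results_dir = "seaquest_results"       # hypothetical output directory
    os.makedirs(results_dir, exist_ok=True)
    experiment = ExperimentAgent(args, results_dir)
    for _ in range(10):                    # each call runs the agent's training routine
        experiment.train()
    print("frames so far:", experiment.get_number_of_frames())
    experiment.save_parameters(results_dir)
    experiment.save_results(results_dir)
    experiment.save_agent(results_dir)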