def __init__(self, experiment_parameters, run_results_dir):
    self.run_results_dir = run_results_dir
    self.num_tilings = check_attribute_else_default(experiment_parameters, 'num_tilings', 32)
    self.tiling_length = check_attribute_else_default(experiment_parameters, 'tiling_length', 10)
    self.learning_rate = check_attribute_else_default(experiment_parameters, 'learning_rate', 0.001)
    self.environment_name = check_attribute_else_default(experiment_parameters, 'env', 'mountain_car',
                                                         choices=['mountain_car', 'catcher'])
    self.verbose = experiment_parameters.verbose

    self.config = Config()
    self.config.store_summary = True
    self.summary = {}

    """ Parameters for the Environment """
    self.config.max_actions = ENVIRONMENT_DICTIONARY[self.environment_name]['max_actions']
    self.config.norm_state = True

    """ Parameters for the Function Approximator """
    self.config.state_dims = ENVIRONMENT_DICTIONARY[self.environment_name]['state_dims']
    self.config.num_actions = ENVIRONMENT_DICTIONARY[self.environment_name]['num_actions']
    self.config.gamma = 1.0
    self.config.epsilon = 0.1
    # divide the step size by the number of tilings so that the combined update
    # across all active tiles matches the nominal learning rate
    self.config.lr = self.learning_rate / self.num_tilings
    self.config.num_tilings = self.num_tilings
    self.config.tiling_length = self.tiling_length
    self.config.scaling_factor = 1/2
    self.config.scaling_offset = 1

    self.env = ENVIRONMENT_DICTIONARY[self.environment_name]['class'](config=self.config, summary=self.summary)
    self.fa = TileCoderFA(config=self.config)
    self.rl_agent = Agent(environment=self.env, function_approximator=self.fa, config=self.config,
                          summary=self.summary)
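# Worked example of the step-size scaling above (a sketch using the defaults):
# with learning_rate = 0.001 and num_tilings = 32, each active tile is updated
# with lr = 0.001 / 32 = 3.125e-05. Since a tile-coded state activates one tile
# per tiling, the 32 active tiles jointly receive a total update of 0.001, so
# the effective step size stays at the nominal learning rate regardless of the
# number of tilings.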
def __init__(self, config, return_function):
    """
    Parameters:
    Name:           Type:       Default:    Description: (Omitted when self-explanatory)
    buff_sz         int         10          buffer size
    batch_sz        int         1
    env_state_dims  list        [2,2]       dimensions of the observations to be stored in the buffer
    obs_dtype       np.type     np.uint8    the data type of the observations
    """
    assert isinstance(config, Config)
    self.config = config
    self.buff_sz = check_attribute_else_default(self.config, 'buff_sz', 10)
    self.batch_sz = check_attribute_else_default(self.config, 'batch_sz', 1)
    self.env_state_dims = list(check_attribute_else_default(self.config, 'env_state_dims', [2, 2]))
    self.obs_dtype = check_attribute_else_default(self.config, 'obs_dtype', np.uint8)

    """ Parameters for Return Function """
    assert isinstance(return_function, TDZeroReturnFunction)
    self.return_function = return_function

    """ Parameters to keep track of the current state of the buffer """
    self.current_index = 0
    self.full_buffer = False

    """ Circular Buffers """
    self.state = CircularBuffer(self.buff_sz, shape=tuple(self.env_state_dims), dtype=self.obs_dtype)
    self.reward = CircularBuffer(self.buff_sz, shape=(), dtype=np.int32)
    self.terminate = CircularBuffer(self.buff_sz, shape=(), dtype=bool)
    self.timeout = CircularBuffer(self.buff_sz, shape=(), dtype=bool)
    self.estimated_return = CircularBuffer(self.buff_sz, shape=(), dtype=np.float64)
    self.up_to_date = CircularBuffer(self.buff_sz, shape=(), dtype=bool)
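# Hypothetical construction sketch for this buffer. The class name
# ExperienceReplayBuffer and the policy object below are assumptions for
# illustration; only the config attributes and the TDZeroReturnFunction type
# check come from the constructor above:
#
#   config = Config()
#   config.buff_sz = 10000
#   config.batch_sz = 32
#   config.env_state_dims = [2]          # e.g., mountain car observations
#   config.obs_dtype = np.float64
#   return_function = TDZeroReturnFunction(tpolicy=some_target_policy, config=config)
#   er_buffer = ExperienceReplayBuffer(config, return_function)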
def __init__(self, config=None, summary=None):
    assert isinstance(config, Config)
    """
    Parameters:
    Name:           Type        Default:    Description(omitted when self-explanatory):
    max_actions     int         1000        The max number of actions executed before forcing a time out
    save_summary    bool        False       Whether to save a summary of the environment
    """
    self.max_actions = check_attribute_else_default(config, 'max_actions', default_value=1000)
    self.save_summary = check_attribute_else_default(config, 'save_summary', default_value=False)
    self.summary = summary
    if self.save_summary:
        assert isinstance(self.summary, dict)
        check_dict_else_default(self.summary, "steps_per_episode", [])

    " Inner state of the environment "
    self.step_count = 0
    self.current_state = self.reset()
    self.actions = np.array([0, 1, 2], dtype=int)   # 0 = backward, 1 = coast, 2 = forward
    self.high = np.array([0.5, 0.07], dtype=np.float32)
    self.low = np.array([-1.2, -0.07], dtype=np.float32)
    self.action_dictionary = {0: -1,    # accelerate backwards
                              1: 0,     # coast
                              2: 1}     # accelerate forwards
def __init__(self, environment, function_approximator, behaviour_policy, er_buffer,
             config=None, summary=None, reshape=True):
    """
    Summary Name: return_per_episode
    """
    self.config = config or Config()
    assert isinstance(self.config, Config)
    """
    Parameters in config:
    Name:                   Type:       Default:    Description: (Omitted when self-explanatory)
    save_summary            bool        False       save the summary of the agent (return per episode)
    er_start_size           int         0           number of steps sampled before training starts
    er_init_steps_count     int         0           number of initial steps taken so far
    fixed_tpolicy           bool        False       whether the policy is fixed (e.g., a function of the state)
                                                    or changes over time (e.g., epsilon-greedy or a function
                                                    of the q-values)
    """
    self.save_summary = check_attribute_else_default(self.config, 'save_summary', False)
    self.er_start_size = check_attribute_else_default(self.config, 'er_start_size', 0)
    check_attribute_else_default(self.config, 'er_init_steps_count', 0)
    self.fixed_tpolicy = check_attribute_else_default(self.config, 'fixed_tpolicy', False)
    if self.save_summary:
        assert isinstance(summary, dict)
        self.summary = summary
        check_dict_else_default(self.summary, 'return_per_episode', [])

    " Other Parameters "
    # Behaviour Policy
    self.bpolicy = behaviour_policy
    # Experience Replay Buffer
    self.er_buffer = er_buffer
    # Function Approximator: used to approximate the Q-Values
    self.fa = function_approximator
    # Environment that the agent is interacting with
    self.env = environment
    # Summaries
    self.cumulative_reward = 0
    # Whether to reshape the mountain car observations
    self.reshape = reshape
def __init__(self, config=None): """ Parameters in config: Name: Type: Default: Description: (Omitted when self-explanatory) num_actions int 3 Number of actions available to the agent epsilon float 0.1 Epsilon before annealing """ self.config = config or Config() assert isinstance(config, Config) self.num_actions = check_attribute_else_default( self.config, 'num_actions', 3) self.epsilon = check_attribute_else_default(self.config, 'epsilon', 0.1) self.p_random = (self.epsilon / self.num_actions) self.p_optimal = self.p_random + (1 - self.epsilon)
def __init__(self, environment, function_approximator, config=None, summary=None):
    self.config = config or Config()
    assert isinstance(self.config, Config)
    """
    Parameters in config:
    Name:           Type:       Default:    Description: (Omitted when self-explanatory)
    store_summary   bool        False       store the summary of the agent (return per episode)
    """
    self.store_summary = check_attribute_else_default(self.config, 'store_summary', False)
    if self.store_summary:
        assert isinstance(summary, dict)
        self.summary = summary
        check_dict_else_default(self.summary, 'return_per_episode', [])

    " Other Parameters "
    # Function Approximator: used to approximate the Q-Values
    self.fa = function_approximator
    # Environment that the agent is interacting with
    self.env = environment
    # Summaries
    self.cumulative_reward = 0
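# A minimal sketch of the episode loop this agent is built for; every method
# name other than the attributes initialized above is hypothetical:
#
#   def train_episode(self):
#       state = self.env.reset()
#       terminal = False
#       while not terminal:
#           action = self.fa.choose_action(state)
#           state, reward, terminal, timeout = self.env.step(action)
#           self.fa.update(state, action, reward, terminal)
#           self.cumulative_reward += reward
#       if self.store_summary:
#           self.summary['return_per_episode'].append(self.cumulative_reward)
#           self.cumulative_reward = 0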
def __init__(self, experiment_parameters, run_results_dir):
    self.run_results_dir = run_results_dir
    self.tnet_update_freq = check_attribute_else_default(experiment_parameters, 'tnet_update_freq', 1)
    self.buffer_size = check_attribute_else_default(experiment_parameters, 'buffer_size', 10000)
    self.learning_rate = check_attribute_else_default(experiment_parameters, 'lr', 0.001)
    self.environment_name = check_attribute_else_default(experiment_parameters, 'env', 'mountain_car',
                                                         choices=['mountain_car', 'catcher', 'puddle_world'])
    self.verbose = experiment_parameters.verbose

    self.config = Config()
    self.config.store_summary = True
    # stored in summary: 'return_per_episode', 'loss_per_step', 'steps_per_episode', 'reward_per_step'
    self.summary = {}
    self.config.number_of_steps = ENVIRONMENT_DICTIONARY[self.environment_name]['number_of_steps']

    """ Parameters for the Environment """
    self.config.max_episode_length = ENVIRONMENT_DICTIONARY[self.environment_name]['max_episode_length']
    self.config.norm_state = True
    self.config.current_step = 0

    """ Parameters for the Function Approximator """
    self.config.state_dims = ENVIRONMENT_DICTIONARY[self.environment_name]['state_dims']
    self.config.num_actions = ENVIRONMENT_DICTIONARY[self.environment_name]['num_actions']
    self.config.gamma = 1.0
    self.config.epsilon = 0.1
    self.config.optim = "adam"
    self.config.lr = self.learning_rate
    self.config.batch_size = 32

    # DQN parameters
    self.config.buffer_size = self.buffer_size
    self.config.tnet_update_freq = self.tnet_update_freq

    self.env = ENVIRONMENT_DICTIONARY[self.environment_name]['class'](config=self.config, summary=self.summary)
    self.fa = VanillaDQN(config=self.config, summary=self.summary)
    self.rl_agent = Agent(environment=self.env, function_approximator=self.fa, config=self.config,
                          summary=self.summary)
def __init__(self, tpolicy, config=None):
    assert isinstance(config, Config)
    """
    Parameters in config:
    Name:       Type:       Default:    Description: (Omitted when self-explanatory)
    gamma       float       1.0         the discount factor
    onpolicy    bool        True        whether to compute the on-policy or the off-policy return,
                                        i.e., whether to skip or compute the importance sampling ratio
    """
    self.gamma = check_attribute_else_default(config, 'gamma', 1.0)
    self.onpolicy = check_attribute_else_default(config, 'onpolicy', True)

    """
    Other Parameters:
    tpolicy - The target policy
    """
    self.tpolicy = tpolicy
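# A sketch of how gamma and onpolicy would typically enter a one-step return;
# the method and its arguments below are hypothetical, not taken from the source:
#
#   def one_step_return(self, reward, next_state, next_action, next_state_value, bprobabilities):
#       rho = 1.0
#       if not self.onpolicy:
#           # per-decision importance sampling ratio: probability of the action
#           # under the target policy over its probability under the behaviour policy
#           rho = self.tpolicy.probability(next_state, next_action) / bprobabilities[next_action]
#       return reward + self.gamma * rho * next_state_value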
def __init__(self, experiment_parameters, run_results_dir):
    self.run_results_dir = run_results_dir
    self.buffer_size = check_attribute_else_default(experiment_parameters, 'buffer_size', 20000)
    self.tnet_update_freq = check_attribute_else_default(experiment_parameters, 'tnet_update_freq', 10)
    self.environment_name = check_attribute_else_default(experiment_parameters, 'env', 'mountain_car',
                                                         choices=['mountain_car', 'catcher'])
    self.verbose = experiment_parameters.verbose
    # parameters specific to the parameter sweep
    self.learning_rate = check_attribute_else_default(experiment_parameters, 'lr', 0.001)
    self.dropout_probability = check_attribute_else_default(experiment_parameters, 'dropout_probability', 0.1)

    self.config = Config()
    self.config.store_summary = True
    # stored in summary: 'return_per_episode', 'loss_per_step', 'steps_per_episode', 'reward_per_step'
    self.summary = {}
    self.config.number_of_steps = ENVIRONMENT_DICTIONARY[self.environment_name]['number_of_steps']

    """ Parameters for the Environment """
    # Same for every experiment
    self.config.max_episode_length = ENVIRONMENT_DICTIONARY[self.environment_name]['max_episode_length']
    self.config.norm_state = True
    self.config.current_step = 0

    """ Parameters for the Function Approximator """
    # Same for every experiment
    self.config.state_dims = ENVIRONMENT_DICTIONARY[self.environment_name]['state_dims']
    self.config.num_actions = ENVIRONMENT_DICTIONARY[self.environment_name]['num_actions']
    self.config.gamma = 1.0
    self.config.epsilon = 0.1
    self.config.optim = "adam"
    self.config.batch_size = 32
    # Selected after finding the best parameter combinations for DQN with a given buffer size
    self.config.buffer_size = self.buffer_size
    self.config.tnet_update_freq = self.tnet_update_freq
    # These are the parameters that we are sweeping over
    self.config.lr = self.learning_rate
    self.config.dropout_probability = self.dropout_probability

    self.env = ENVIRONMENT_DICTIONARY[self.environment_name]['class'](config=self.config, summary=self.summary)
    self.fa = DropoutNeuralNetwork(config=self.config, summary=self.summary)
    self.rl_agent = Agent(environment=self.env, function_approximator=self.fa, config=self.config,
                          summary=self.summary)
def __init__(self, config=None, summary=None):
    assert isinstance(config, Config)
    """
    Parameters:
    Name:           Type        Default:    Description(omitted when self-explanatory):
    max_actions     int         500         The max number of actions executed before forcing a time out
    save_summary    bool        False       Whether to save a summary of the environment
    """
    self.max_actions = check_attribute_else_default(config, 'max_actions', default_value=500)
    self.save_summary = check_attribute_else_default(config, 'save_summary', default_value=False)
    self.summary = summary
    if self.save_summary:
        assert isinstance(self.summary, dict)
        check_dict_else_default(self.summary, "steps_per_episode", [])

    " Inner state of the environment "
    self.step_count = 0
    self.openai_env = gym.make('Acrobot-v1')
    self.actions = np.array([0, 1, 2], dtype=np.int8)
    # observation bounds: the two joint angles lie in [0, 2*pi] and the two
    # angular velocities in [-4*pi, 4*pi] and [-9*pi, 9*pi], respectively
    self.high = np.array([np.pi * 2, np.pi * 2, 12.56637096, 28.27433395], dtype=np.float64)
    self.low = np.array([0.0, 0.0, -12.56637096, -28.27433395], dtype=np.float64)
    self.current_state = self.reset()
def __init__(self, config, summary=None): """ Parameters: Name: Type Default: Description(omitted when self-explanatory): max_actions int 1000 The max number of actions executed before forcing a time out norm_state bool True Normalize the state to [-1,1] store_summary bool False Whether to store the summary of the environment """ self.norm_state = check_attribute_else_default(config, 'norm_state', True) self.max_actions = check_attribute_else_default(config, 'max_actions', 1000) self.store_summary = check_attribute_else_default(config, 'store_summary', False) self.summary = summary if self.store_summary: assert isinstance(self.summary, dict) check_dict_else_default(self.summary, "steps_per_episode", []) self.num_actions = 3 self.state_dims = 4 " Inner state of the environment " self.step_count = 0 self.current_state = np.float64(np.random.uniform(low=-0.5, high=0.5, size=(4,))) self.MAX_VEL_1 = 4 * np.pi self.MAX_VEL_2 = 9 * np.pi self.MAX_THETA_1 = np.pi self.MAX_THETA_2 = np.pi self.m1 = 1.0 self.m2 = 1.0 self.l1 = 1.0 self.l2 = 1.0 self.lc1 = 0.5 self.lc2 = 0.5 self.I1 = 1.0 self.I2 = 1.0 self.g = 9.8 self.dt = 0.05 self.acrobotGoalPosition = 1.0
def __init__(self, config, summary=None):
    assert isinstance(config, Config)
    """
    Parameters:
    Name:               Type        Default:    Description(omitted when self-explanatory):
    # environment parameters
    max_episode_length  int         500000      The max number of actions executed before forcing a time out
    norm_state          bool        True        Normalize the state to [-1,1]
    # summary parameters
    store_summary       bool        False       Whether to store the summary of the environment
    number_of_steps     int         500000      Total number of environment steps
    """
    check_attribute_else_default(config, 'current_step', 0)
    self.config = config

    # environment related variables
    self.max_episode_length = check_attribute_else_default(config, 'max_episode_length', default_value=500000)
    self.norm_state = check_attribute_else_default(config, 'norm_state', default_value=True)

    # summary related variables
    self.store_summary = check_attribute_else_default(config, 'store_summary', default_value=False)
    self.number_of_steps = check_attribute_else_default(config, 'number_of_steps', default_value=500000)
    self.summary = summary
    if self.store_summary:
        assert isinstance(self.summary, dict)
        self.reward_per_step = np.zeros(self.number_of_steps, dtype=np.float64)
        check_dict_else_default(self.summary, "steps_per_episode", [])
        check_dict_else_default(self.summary, "reward_per_step", self.reward_per_step)

    # internal state of the environment
    self.episode_step_count = 0
    position = -0.6 + np.random.random() * 0.2
    velocity = 0.0
    self.current_state = np.array((position, velocity), dtype=np.float64)
    self.actions = np.array([0, 1, 2], dtype=int)   # 0 = backward, 1 = coast, 2 = forward
    self.high = np.array([0.5, 0.07], dtype=np.float64)
    self.low = np.array([-1.2, -0.07], dtype=np.float64)
    self.action_dictionary = {0: -1,    # accelerate backwards
                              1: 0,     # coast
                              2: 1}     # accelerate forwards
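# Hypothetical interaction sketch for this environment; the class name, step(),
# and its return signature are assumptions for illustration:
#
#   env = MountainCar(config=config, summary=summary)
#   state = env.current_state                       # (position, velocity)
#   next_state, reward, terminal = env.step(2)      # 2 -> accelerate forwards
#
# In the classic mountain car task the reward is -1 per step and an episode
# terminates once position reaches self.high[0] = 0.5.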
def __init__(self, config=None, name="default", SEED=None): assert isinstance(config, Config) """ Parameters in config: Name: Type: Default: Description: (Omitted when self-explanatory) dim_out list [10,10,10] the output dimensions of each layer, i.e. neurons obs_dims list [2] the dimensions of the observations seen by the agent num_actions int 3 the number of actions available to the agent gate_fun tf gate fun tf.nn.relu the gate function used across the whole network full_layers int 3 number of fully connected layers xavier_init bool True whether to use a variant of xavier initialization otherwise, matrices are initialized according to N(0, 0.1) and bias are initialized according to N(0, 0.1) """ self.dim_out = check_attribute_else_default(config, 'dim_out', [10, 10, 10]) self.obs_dims = check_attribute_else_default(config, 'obs_dims', [2]) self.num_actions = check_attribute_else_default( config, 'num_actions', 3) self.gate_fun = check_attribute_else_default(config, 'gate_fun', tf.nn.relu) self.full_layers = check_attribute_else_default( config, 'full_layers', 3) self.xavier_init = check_attribute_else_default( config, 'xavier_init', True) """ Other Parameters: name - name of the network. Should be a string. """ self.name = name tf.get_collection(self.name) " Dimensions " dim_in = [np.prod(self.obs_dims)] + self.dim_out[:-1] row_and_action_number = 2 " Placehodler " self.x_frames = tf.placeholder(tf.float32, shape=(None, dim_in[0])) # input frames self.x_actions = tf.placeholder( tf.int32, shape=(None, row_and_action_number)) # input actions self.y = tf.placeholder(tf.float32, shape=None) # target " Variables for Training " self.train_vars = [] " Fully Connected Layers " current_y_hat = self.x_frames for j in range(self.full_layers): # layer n + m: fully connected W, b, z_hat, y_hat = fully_connected_av( self.name, "full_" + str(j + 1), current_y_hat, dim_in[j], self.dim_out[j], tf.random_normal_initializer(stddev=1.0 / np.sqrt(dim_in[j]), seed=SEED), self.gate_fun, xavier_init=self.xavier_init) current_y_hat = y_hat tf.add_to_collection(self.name, W) tf.add_to_collection(self.name, b) self.train_vars.extend([W, b]) """ Output layer """ # output layer: fully connected W, b, z_hat, self.y_hat = fully_connected_av( self.name, "output_layer", current_y_hat, self.dim_out[-1], self.num_actions, tf.random_normal_initializer(stddev=1.0 / np.sqrt(self.dim_out[-1]), seed=SEED), linear_transfer, xavier_init=self.xavier_init) tf.add_to_collection(self.name, W) tf.add_to_collection(self.name, b) self.train_vars.extend([W, b]) self.train_vars = [self.train_vars] # Obtaining y_hat and Scaling by the Importance Sampling y_hat = tf.gather_nd(self.y_hat, self.x_actions) y = self.y # Temporal Difference Error self.td_error = tf.subtract(y, y_hat) # Loss self.train_loss = tf.reduce_mean(tf.pow(self.td_error, 2))
def __init__(self, optimizer, target_network, update_network, er_buffer, config=None,
             tf_session=None, summary=None):
    """
    Summary Names:
        cumulative_loss
        training_steps
    """
    assert isinstance(config, Config)
    self.config = config
    """
    Parameters in config:
    Name:                   Type:       Default:    Description: (Omitted when self-explanatory)
    alpha                   float       0.00025     step size parameter
    obs_dims                list        [4,84,84]   the dimensions of the observations
    tnetwork_update_freq    int         10,000      number of updates before updating the target network
    update_count            int         0           number of updates performed
    save_summary            bool        False       indicates whether to save a summary of training
    """
    self.alpha = check_attribute_else_default(self.config, 'alpha', 0.00025)
    self.obs_dims = check_attribute_else_default(self.config, 'obs_dims', [4, 84, 84])
    self.tnetwork_update_freq = check_attribute_else_default(self.config, 'tnetwork_update_freq', 10000)
    self.save_summary = check_attribute_else_default(self.config, 'save_summary', False)
    check_attribute_else_default(self.config, 'update_count', 0)

    self.summary = summary
    if self.save_summary:
        assert isinstance(self.summary, dict)
        check_dict_else_default(self.summary, 'cumulative_loss', [])
        check_dict_else_default(self.summary, 'training_steps', [])
        self.training_steps = 0
        self.cumulative_loss = 0

    """ Other Parameters """
    " Experience Replay Buffer and Return Function "
    self.er_buffer = er_buffer

    " Neural Network Models "
    self.target_network = target_network    # Target Network
    self.update_network = update_network    # Update Network

    " Training and Learning Evaluation: Tensorflow and variables initializer "
    self.optimizer = optimizer(self.alpha)
    self.sess = tf_session or tf.Session()

    " Train step "
    self.train_step = self.optimizer.minimize(self.update_network.train_loss,
                                              var_list=self.update_network.train_vars[0])

    " Initializing variables in the graph "
    for var in tf.global_variables():
        self.sess.run(var.initializer)

    " Copy Weights to Target Network Operator "
    unetwork_vars = tf.get_collection(self.update_network.name)
    tnetwork_vars = tf.get_collection(self.target_network.name)
    copy_ops = [target_var.assign(update_var)
                for target_var, update_var in zip(tnetwork_vars, unetwork_vars)]
    self.copy_to_target = tf.group(*copy_ops)
    self.sess.run(self.copy_to_target)
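# The copy operator assigns every update-network variable to its target-network
# counterpart in a single session call. A hypothetical training loop (the feed
# values are assumptions) would refresh the target network like this:
#
#   self.sess.run(self.train_step, feed_dict={...})   # one gradient step
#   self.config.update_count += 1
#   if self.config.update_count % self.tnetwork_update_freq == 0:
#       self.sess.run(self.copy_to_target)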
def __init__(self, config, summary=None):
    assert isinstance(config, Config)
    """
    Parameters:
    Name:               Type        Default:    Description(omitted when self-explanatory):
    max_episode_length  int         500000      The max number of steps executed in an episode
                                                before forcing a time out
    norm_state          bool        True        Normalize the state to [-1,1]
    display             bool        False       Whether to display the screen of the game
    init_lives          int         3           Number of lives at the start of the game
    store_summary       bool        False       Whether to store the summary of the environment
    number_of_steps     int         500000      Total number of environment steps
    """
    check_attribute_else_default(config, 'current_step', 0)
    self.config = config

    # environment parameters
    self.max_episode_length = check_attribute_else_default(config, 'max_episode_length', default_value=500000)
    self.norm_state = check_attribute_else_default(config, 'norm_state', default_value=True)
    self.display = check_attribute_else_default(config, 'display', default_value=False)
    self.init_lives = check_attribute_else_default(config, 'init_lives', default_value=3)

    # summary parameters
    self.store_summary = check_attribute_else_default(config, 'store_summary', default_value=False)
    self.summary = summary
    self.number_of_steps = check_attribute_else_default(config, 'number_of_steps', 500000)
    if self.store_summary:
        assert isinstance(self.summary, dict)
        self.reward_per_step = np.zeros(self.number_of_steps, dtype=np.float64)
        check_dict_else_default(self.summary, "steps_per_episode", [])
        check_dict_else_default(self.summary, "reward_per_step", self.reward_per_step)

    # setting up the original catcher environment with the specified parameters
    self.catcherOb = Catcher(init_lives=self.init_lives)
    if not self.display:
        # do not open a pygame window
        os.putenv('SDL_VIDEODRIVER', 'fbcon')
        os.environ["SDL_VIDEODRIVER"] = "dummy"
    if self.norm_state:
        self.pOb = PLE(self.catcherOb, fps=30, state_preprocessor=get_ob_normalize,
                       display_screen=self.display)
    else:
        self.pOb = PLE(self.catcherOb, fps=30, state_preprocessor=get_ob,
                       display_screen=self.display)
    self.pOb.init()

    # environment internal state
    self.actions = [97, None, 100]  # self.pOb.getActionSet() (left = 97, do nothing = None, right = 100)
    self.num_action = 3
    self.num_state = 4
    self.episode_step_count = 0
    self.pOb.reset_game()
    self.current_state = self.pOb.getGameState()
def __init__(self, experiment_parameters, run_results_dir):
    self.run_results_dir = run_results_dir
    self.buffer_size = check_attribute_else_default(experiment_parameters, 'buffer_size', 20000)
    self.method = check_attribute_else_default(experiment_parameters, 'method', 'DQN')
    self.environment_name = check_attribute_else_default(experiment_parameters, 'env', 'mountain_car',
                                                         choices=['mountain_car', 'catcher', 'puddle_world'])
    parameters_dictionary = BEST_PARAMETERS_DICTIONARY[self.environment_name][self.method][self.buffer_size]
    self.verbose = experiment_parameters.verbose

    self.config = Config()
    self.config.store_summary = True
    # stored in summary: 'return_per_episode', 'loss_per_step', 'steps_per_episode', 'reward_per_step'
    self.summary = {}
    self.config.number_of_steps = ENVIRONMENT_DICTIONARY[self.environment_name]['number_of_steps']

    """ Parameters for the Environment """
    self.config.max_episode_length = ENVIRONMENT_DICTIONARY[self.environment_name]['max_episode_length']
    self.config.norm_state = True
    self.config.current_step = 0

    """ Parameters for the Function Approximator """
    self.config.state_dims = ENVIRONMENT_DICTIONARY[self.environment_name]['state_dims']
    self.config.num_actions = ENVIRONMENT_DICTIONARY[self.environment_name]['num_actions']
    self.config.gamma = 1.0
    self.config.epsilon = 0.1
    self.config.optim = "adam"
    self.config.batch_size = 32

    # Parameters for any type of agent
    self.config.buffer_size = self.buffer_size
    self.config.lr = parameters_dictionary['LearningRate']
    self.config.tnet_update_freq = parameters_dictionary['Freq']

    if self.method in ['DRE', 'DRE_LB', 'DRG', 'DRG_LB']:
        # distributional regularizers, with or without a lower bound (LB) on beta,
        # and with use_gamma toggled for the DRG variants
        self.config.beta = parameters_dictionary['Beta']
        self.config.reg_factor = parameters_dictionary['RegFactor']
        self.config.use_gamma = False
        self.config.beta_lb = False
        if self.method in ['DRG', 'DRG_LB']:
            self.config.use_gamma = True
        if self.method in ['DRE_LB', 'DRG_LB']:
            self.config.beta_lb = True
        self.fa = DistRegNeuralNetwork(config=self.config, summary=self.summary)
    elif self.method in ['L1A', 'L1W', 'L2A', 'L2W']:
        # l1 or l2 regularization, applied to the activations (A) or the weights (W)
        self.config.reg_factor = parameters_dictionary['RegFactor']
        self.config.reg_method = 'l1'
        if self.method in ['L2A', 'L2W']:
            self.config.reg_method = 'l2'
        self.config.weights_reg = False
        if self.method in ['L1W', 'L2W']:
            self.config.weights_reg = True
        self.fa = RegularizedNeuralNetwork(config=self.config, summary=self.summary)
    elif self.method in ['DQN']:
        self.fa = VanillaDQN(config=self.config, summary=self.summary)
    elif self.method in ['Dropout']:
        self.config.dropout_probability = parameters_dictionary['DropoutProbability']
        self.fa = DropoutNeuralNetwork(config=self.config, summary=self.summary)
    else:
        raise ValueError("No configuration available for the given method.")

    self.env = ENVIRONMENT_DICTIONARY[self.environment_name]['class'](config=self.config, summary=self.summary)
    self.rl_agent = Agent(environment=self.env, function_approximator=self.fa, config=self.config,
                          summary=self.summary)
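# Hypothetical top-level usage; the argument parsing, class name, and the
# agent's entry point are assumptions, only the constructor signature above
# comes from the source:
#
#   experiment_parameters = parse_arguments()   # e.g., an argparse namespace with
#                                               # buffer_size, method, env, verbose
#   exp = Experiment(experiment_parameters, run_results_dir='./results')
#   exp.rl_agent.train(...)                     # run for config.number_of_steps steps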