def __init__(
    self,
    # Model
    name, device, parallel_interactions, buffer_observe, seed, execution, saver, summarizer,
    config, states, actions, preprocessing, exploration, variable_noise, l2_regularization,
    # TensorforceModel
    policy, memory, update, optimizer, objective, reward_estimation, baseline_policy,
    baseline_optimizer, baseline_objective, entropy_regularization, max_episode_timesteps
):
    preprocessed_states = OrderedDict(states)
    for state_name, state_spec in states.items():
        if preprocessing is None:
            layers = None
        elif state_name in preprocessing:
            layers = preprocessing[state_name]
        elif state_spec['type'] in preprocessing:
            layers = preprocessing[state_spec['type']]
        else:
            layers = None
        if layers is not None:
            preprocessed_states[state_name] = Preprocessor.output_spec(
                input_spec=state_spec, layers=layers
            )

    # Policy internals specification
    policy_cls, first_arg, kwargs = Module.get_module_class_and_kwargs(
        name='policy', module=policy, modules=policy_modules,
        states_spec=preprocessed_states, actions_spec=actions
    )
    if first_arg is None:
        internals = policy_cls.internals_spec(name='policy', **kwargs)
    else:
        internals = policy_cls.internals_spec(first_arg, name='policy', **kwargs)
    if any(internal.startswith('baseline-') for internal in internals):
        raise TensorforceError.value(
            name='model', argument='internals', value=list(internals),
            hint='starts with baseline-'
        )

    # Baseline internals specification
    if baseline_policy is None:
        pass
    else:
        baseline_cls, first_arg, kwargs = Module.get_module_class_and_kwargs(
            name='baseline', module=baseline_policy, modules=policy_modules,
            states_spec=preprocessed_states, actions_spec=actions
        )
        if first_arg is None:
            baseline_internals = baseline_cls.internals_spec(name='baseline', **kwargs)
        else:
            baseline_internals = baseline_cls.internals_spec(
                first_arg, name='baseline', **kwargs
            )
        for internal, spec in baseline_internals.items():
            if internal in internals:
                raise TensorforceError.collision(
                    name='model', value='internals', group1='policy', group2='baseline'
                )
            internals[internal] = spec

    super().__init__(
        # Model
        name=name, device=device, parallel_interactions=parallel_interactions,
        buffer_observe=buffer_observe, seed=seed, execution=execution, saver=saver,
        summarizer=summarizer, config=config, states=states, internals=internals,
        actions=actions, preprocessing=preprocessing, exploration=exploration,
        variable_noise=variable_noise, l2_regularization=l2_regularization
    )

    # Policy
    self.policy = self.add_module(
        name='policy', module=policy, modules=policy_modules, states_spec=self.states_spec,
        actions_spec=self.actions_spec
    )

    # Update mode
    if not all(key in ('batch_size', 'frequency', 'start', 'unit') for key in update):
        raise TensorforceError.value(
            name='agent', argument='update', value=list(update),
            hint='not from {batch_size,frequency,start,unit}'
        )
    # update: unit
    elif 'unit' not in update:
        raise TensorforceError.required(name='agent', argument='update[unit]')
    elif update['unit'] not in ('timesteps', 'episodes'):
        raise TensorforceError.value(
            name='agent', argument='update[unit]', value=update['unit'],
            hint='not in {timesteps,episodes}'
        )
    # update: batch_size
    elif 'batch_size' not in update:
        raise TensorforceError.required(name='agent', argument='update[batch_size]')

    self.update_unit = update['unit']
    self.update_batch_size = self.add_module(
        name='update-batch-size', module=update['batch_size'], modules=parameter_modules,
        is_trainable=False, dtype='long', min_value=1
    )
    if 'frequency' in update and update['frequency'] == 'never':
        self.update_frequency = None
    else:
        self.update_frequency = self.add_module(
            name='update-frequency', module=update.get('frequency', update['batch_size']),
            modules=parameter_modules, is_trainable=False, dtype='long', min_value=1,
            max_value=max(2, self.update_batch_size.max_value())
        )
    self.update_start = self.add_module(
        name='update-start', module=update.get('start', 0), modules=parameter_modules,
        is_trainable=False, dtype='long', min_value=0
    )

    # Optimizer
    self.optimizer = self.add_module(
        name='optimizer', module=optimizer, modules=optimizer_modules, is_trainable=False
    )

    # Objective
    self.objective = self.add_module(
        name='objective', module=objective, modules=objective_modules, is_trainable=False
    )

    # Baseline optimization overview:
    # Policy    Objective   Optimizer   Config
    #   n         n           n           estimate_horizon=False
    #   n         n           f           invalid!!!
    #   n         n           y           invalid!!!
    #   n         y           n           bl trainable, weighted 1.0
    #   n         y           f           bl trainable, weighted
    #   n         y           y           separate, use main policy
    #   y         n           n           bl trainable, estimate_advantage=True, equal horizon
    #   y         n           f           invalid!!!
    #   y         n           y           separate, use main objective
    #   y         y           n           bl trainable, weighted 1.0, equal horizon
    #   y         y           f           bl trainable, weighted, equal horizon
    #   y         y           y           separate

    # Baseline objective
    if baseline_objective is None:
        self.baseline_objective = None
    else:
        self.baseline_objective = self.add_module(
            name='baseline-objective', module=baseline_objective, modules=objective_modules,
            is_trainable=False, is_subscope=True
        )

    # Baseline optimizer
    if baseline_optimizer is None:
        self.baseline_optimizer = None
        if self.baseline_objective is None:
            self.baseline_loss_weight = None
        else:
            self.baseline_loss_weight = 1.0
    elif isinstance(baseline_optimizer, float):
        assert self.baseline_objective is not None
        self.baseline_optimizer = None
        self.baseline_loss_weight = baseline_optimizer
    else:
        assert self.baseline_objective is not None or baseline_policy is not None
        self.baseline_optimizer = self.add_module(
            name='baseline-optimizer', module=baseline_optimizer, modules=optimizer_modules,
            is_trainable=False, is_subscope=True
        )
        self.baseline_loss_weight = None

    # Baseline
    if (baseline_policy is not None or self.baseline_objective is not None) and \
            self.baseline_optimizer is None:
        # since otherwise not part of training
        assert self.baseline_objective is not None or \
            reward_estimation.get('estimate_advantage', True)
        is_trainable = True
    else:
        is_trainable = False
    if baseline_policy is None:
        self.baseline_policy = self.policy
        self.separate_baseline_policy = False
    else:
        self.baseline_policy = self.add_module(
            name='baseline', module=baseline_policy, modules=policy_modules,
            is_trainable=is_trainable, is_subscope=True, states_spec=self.states_spec,
            actions_spec=self.actions_spec
        )
        self.separate_baseline_policy = True

    # Estimator
    if not all(key in (
        'discount', 'estimate_actions', 'estimate_advantage', 'estimate_horizon',
        'estimate_terminal', 'horizon'
    ) for key in reward_estimation):
        raise TensorforceError.value(
            name='agent', argument='reward_estimation', value=reward_estimation,
            hint='not from {discount,estimate_actions,estimate_advantage,estimate_horizon,'
                 'estimate_terminal,horizon}'
        )
    if not self.separate_baseline_policy and self.baseline_optimizer is None and \
            self.baseline_objective is None:
        estimate_horizon = False
    else:
        estimate_horizon = 'late'
    if self.separate_baseline_policy and self.baseline_objective is None and \
            self.baseline_optimizer is None:
        estimate_advantage = True
    else:
        estimate_advantage = False
    self.estimator = self.add_module(
        name='estimator', module=Estimator, is_trainable=False, is_saved=False,
        values_spec=self.values_spec, horizon=reward_estimation['horizon'],
        discount=reward_estimation.get('discount', 1.0),
        estimate_horizon=reward_estimation.get('estimate_horizon', estimate_horizon),
        estimate_actions=reward_estimation.get('estimate_actions', False),
        estimate_terminal=reward_estimation.get('estimate_terminal', False),
        estimate_advantage=reward_estimation.get('estimate_advantage', estimate_advantage),
        # capacity=reward_estimation['capacity']
        min_capacity=self.buffer_observe,
        max_past_horizon=self.baseline_policy.max_past_horizon(is_optimization=False)
    )

    # Memory
    if self.update_unit == 'timesteps':
        policy_horizon = self.policy.max_past_horizon(is_optimization=True)
        baseline_horizon = self.baseline_policy.max_past_horizon(is_optimization=True) - \
            self.estimator.min_future_horizon()
        min_capacity = self.update_batch_size.max_value() + 1 + \
            self.estimator.max_future_horizon() + max(policy_horizon, baseline_horizon)
    elif self.update_unit == 'episodes':
        if max_episode_timesteps is None:
            min_capacity = 0
        else:
            min_capacity = (self.update_batch_size.max_value() + 1) * max_episode_timesteps
    else:
        assert False
    self.memory = self.add_module(
        name='memory', module=memory, modules=memory_modules, is_trainable=False,
        values_spec=self.values_spec, min_capacity=min_capacity
    )

    # Entropy regularization
    entropy_regularization = 0.0 if entropy_regularization is None else entropy_regularization
    self.entropy_regularization = self.add_module(
        name='entropy-regularization', module=entropy_regularization,
        modules=parameter_modules, is_trainable=False, dtype='float', min_value=0.0
    )

    # Internals initialization
    self.internals_init.update(self.policy.internals_init())
    self.internals_init.update(self.baseline_policy.internals_init())
    if any(internal_init is None for internal_init in self.internals_init.values()):
        raise TensorforceError.required(name='model', argument='internals_init')

    # Register global tensors
    Module.register_tensor(name='update', spec=dict(type='long', shape=()), batched=False)
    Module.register_tensor(
        name='dependency_starts', spec=dict(type='long', shape=()), batched=True
    )
    Module.register_tensor(
        name='dependency_lengths', spec=dict(type='long', shape=()), batched=True
    )
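
# Illustrative sketch (assumption, not part of the source): `update` and
# `reward_estimation` dicts of the shape validated by the constructor above. The key
# names come from the checks in the code; the concrete values are made up.
example_update = dict(
    unit='timesteps',  # required, one of {'timesteps', 'episodes'}
    batch_size=64,     # required
    frequency=32,      # optional, defaults to batch_size; 'never' disables updates
    start=1000         # optional, defaults to 0
)
example_reward_estimation = dict(
    horizon=20,               # required
    discount=0.99,            # optional, defaults to 1.0
    estimate_horizon='late',  # optional, default derived from the baseline configuration
    estimate_actions=False,   # optional, defaults to False
    estimate_terminal=False,  # optional, defaults to False
    estimate_advantage=False  # optional, default derived from the baseline configuration
)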
def __init__(
    # Environment
    self, states, actions, max_episode_timesteps=None,
    # TensorFlow etc
    parallel_interactions=1, buffer_observe=True, seed=None, recorder=None
):
    assert hasattr(self, 'spec')

    if seed is not None:
        assert isinstance(seed, int)
        random.seed(a=seed)
        np.random.seed(seed=seed)

    # States/actions specification
    self.states_spec = util.valid_values_spec(
        values_spec=states, value_type='state', return_normalized=True
    )
    self.actions_spec = util.valid_values_spec(
        values_spec=actions, value_type='action', return_normalized=True
    )
    self.max_episode_timesteps = max_episode_timesteps

    # Check for name overlap
    for name in self.states_spec:
        if name in self.actions_spec:
            raise TensorforceError.collision(
                name='name', value=name, group1='states', group2='actions'
            )

    # Parallel episodes
    if isinstance(parallel_interactions, int):
        if parallel_interactions <= 0:
            raise TensorforceError.value(
                name='parallel_interactions', value=parallel_interactions
            )
        self.parallel_interactions = parallel_interactions
    else:
        raise TensorforceError.type(name='parallel_interactions', value=parallel_interactions)

    # Buffer observe
    if isinstance(buffer_observe, bool):
        if not buffer_observe and self.parallel_interactions > 1:
            raise TensorforceError.unexpected()
        if self.max_episode_timesteps is None and self.parallel_interactions > 1:
            raise TensorforceError.unexpected()
        if not buffer_observe:
            self.buffer_observe = 1
        elif self.max_episode_timesteps is None:
            self.buffer_observe = 100
        else:
            self.buffer_observe = self.max_episode_timesteps
    elif isinstance(buffer_observe, int):
        if buffer_observe <= 0:
            raise TensorforceError.value(name='buffer_observe', value=buffer_observe)
        if self.parallel_interactions > 1:
            raise TensorforceError.unexpected()
        if self.max_episode_timesteps is None:
            self.buffer_observe = buffer_observe
        else:
            self.buffer_observe = min(buffer_observe, self.max_episode_timesteps)
    else:
        raise TensorforceError.type(name='buffer_observe', value=buffer_observe)

    # Parallel terminal/reward buffers
    self.terminal_buffers = np.ndarray(
        shape=(self.parallel_interactions, self.buffer_observe),
        dtype=util.np_dtype(dtype='long')
    )
    self.reward_buffers = np.ndarray(
        shape=(self.parallel_interactions, self.buffer_observe),
        dtype=util.np_dtype(dtype='float')
    )

    # Parallel buffer indices
    self.buffer_indices = np.zeros(
        shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='int')
    )

    self.timesteps = 0
    self.episodes = 0
    self.updates = 0

    # Recorder
    if recorder is None:
        pass
    elif not all(key in ('directory', 'frequency', 'max-traces') for key in recorder):
        raise TensorforceError.value(name='recorder', value=list(recorder))
    self.recorder_spec = recorder

    if self.recorder_spec is not None:
        self.record_states = OrderedDict(((name, list()) for name in self.states_spec))
        for name, spec in self.actions_spec.items():
            if spec['type'] == 'int':
                self.record_states[name + '_mask'] = list()
        self.record_actions = OrderedDict(((name, list()) for name in self.actions_spec))
        self.record_terminal = list()
        self.record_reward = list()
        self.num_episodes = 0
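
# Illustrative sketch (assumption, not part of the source): constructor arguments that
# satisfy the checks above. `MyAgent` is a hypothetical subclass providing the required
# `spec` attribute; the `num_actions` attribute name follows the docstring of the last
# constructor in this section.
agent = MyAgent(
    states=dict(type='float', shape=(8,)),
    actions=dict(type='int', shape=(), num_actions=4),
    max_episode_timesteps=500,  # must be given when parallel_interactions > 1
    parallel_interactions=2,
    buffer_observe=True,        # bool, or int > 0 when parallel_interactions == 1
    seed=42,
    recorder={'directory': 'traces', 'frequency': 1, 'max-traces': 100}
)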
def __init__(
    # Environment
    self, states, actions, max_episode_timesteps=None,
    # TensorFlow etc
    parallel_interactions=1, buffer_observe=True, seed=None, recorder=None
):
    assert hasattr(self, 'spec')

    if seed is not None:
        assert isinstance(seed, int)
        random.seed(a=seed)
        np.random.seed(seed=seed)

    # States/actions specification
    self.states_spec = util.valid_values_spec(
        values_spec=states, value_type='state', return_normalized=True
    )
    self.actions_spec = util.valid_values_spec(
        values_spec=actions, value_type='action', return_normalized=True
    )
    self.max_episode_timesteps = max_episode_timesteps

    # Check for name overlap
    for name in self.states_spec:
        if name in self.actions_spec:
            raise TensorforceError.collision(
                name='name', value=name, group1='states', group2='actions'
            )

    # Parallel episodes
    if isinstance(parallel_interactions, int):
        if parallel_interactions <= 0:
            raise TensorforceError.value(
                name='parallel_interactions', value=parallel_interactions
            )
        self.parallel_interactions = parallel_interactions
    else:
        raise TensorforceError.type(name='parallel_interactions', value=parallel_interactions)

    # Buffer observe
    if isinstance(buffer_observe, bool):
        if not buffer_observe and self.parallel_interactions > 1:
            raise TensorforceError.unexpected()
        if self.max_episode_timesteps is None and self.parallel_interactions > 1:
            raise TensorforceError.unexpected()
        if not buffer_observe:
            self.buffer_observe = 1
        elif self.max_episode_timesteps is None:
            self.buffer_observe = 100
        else:
            self.buffer_observe = self.max_episode_timesteps
    elif isinstance(buffer_observe, int):
        if buffer_observe <= 0:
            raise TensorforceError.value(name='buffer_observe', value=buffer_observe)
        if self.parallel_interactions > 1:
            raise TensorforceError.unexpected()
        if self.max_episode_timesteps is None:
            self.buffer_observe = buffer_observe
        else:
            self.buffer_observe = min(buffer_observe, self.max_episode_timesteps)
    else:
        raise TensorforceError.type(name='buffer_observe', value=buffer_observe)

    # Recorder
    if recorder is None:
        pass
    elif not all(key in ('directory', 'frequency', 'max-traces', 'start') for key in recorder):
        raise TensorforceError.value(name='recorder', value=list(recorder))
    self.recorder_spec = recorder if recorder is None else dict(recorder)

    self.is_initialized = False
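
# Illustrative sketch (assumption, not part of the source): a recorder specification
# accepted by this revision, which additionally permits a 'start' key. The key names are
# taken from the check above; the interpretation in the comments is assumed.
recorder = {
    'directory': 'traces',  # directory for recorded trace files
    'frequency': 10,        # assumed: record every 10th episode
    'start': 100,           # assumed: begin recording after 100 episodes
    'max-traces': 1000      # assumed: retain at most 1000 trace files
}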
def __init__(self, states, actions, parallel_interactions=1, buffer_observe=1000, seed=None):
    """
    Agent constructor.

    Args:
        states (specification): States specification, arbitrarily nested dictionary of
            state descriptions with the following attributes:
            - type ('bool' | 'int' | 'float'): state data type (default: 'float').
            - shape (int | iter[int]): state shape (required).
            - num_states (int > 0): number of discrete state values (required for type 'int').
            - min_value/max_value (float): minimum/maximum state value (optional for type
              'float').
        actions (specification): Actions specification, arbitrarily nested dictionary of
            action descriptions with the following attributes:
            - type ('bool' | 'int' | 'float'): action data type (required).
            - shape (int > 0 | iter[int > 0]): action shape (default: []).
            - num_actions (int > 0): number of discrete action values (required for type
              'int').
            - min_value/max_value (float): minimum/maximum action value (optional for type
              'float').
        parallel_interactions (int > 0): Maximum number of parallel interactions to support,
            for instance, to enable multiple parallel episodes, environments or (centrally
            controlled) agents within an environment.
        buffer_observe (int > 0): Maximum number of timesteps within an episode to buffer
            before executing internal observe operations, to reduce calls to TensorFlow for
            improved performance.
    """
    if seed is not None:
        assert isinstance(seed, int)
        random.seed(a=seed)
        np.random.seed(seed=seed)
        tf.random.set_random_seed(seed=seed)

    # States/actions specification
    self.states_spec = util.valid_values_spec(
        values_spec=states, value_type='state', return_normalized=True
    )
    self.actions_spec = util.valid_values_spec(
        values_spec=actions, value_type='action', return_normalized=True
    )

    # Check for name overlap
    for name in self.states_spec:
        if name in self.actions_spec:
            raise TensorforceError.collision(
                name='name', value=name, group1='states', group2='actions'
            )

    # Parallel episodes
    if isinstance(parallel_interactions, int):
        if parallel_interactions <= 0:
            raise TensorforceError.value(
                name='parallel_interactions', value=parallel_interactions
            )
        self.parallel_interactions = parallel_interactions
    else:
        raise TensorforceError.type(name='parallel_interactions', value=parallel_interactions)

    # Buffer observe
    if isinstance(buffer_observe, bool):
        # if update_mode['unit'] == 'episodes':
        #     self.buffer_observe = 1000 if buffer_observe else 1
        # else:
        #     self.buffer_observe = update_mode['batch_size']
        self.buffer_observe = 1000 if buffer_observe else 1
    elif isinstance(buffer_observe, int):
        if buffer_observe <= 0:
            raise TensorforceError.value(name='buffer_observe', value=buffer_observe)
        self.buffer_observe = buffer_observe
    else:
        raise TensorforceError.type(name='buffer_observe', value=buffer_observe)

    # Parallel terminal/reward buffers
    self.terminal_buffers = np.ndarray(
        shape=(self.parallel_interactions, self.buffer_observe),
        dtype=util.np_dtype(dtype='bool')
    )
    self.reward_buffers = np.ndarray(
        shape=(self.parallel_interactions, self.buffer_observe),
        dtype=util.np_dtype(dtype='float')
    )

    # Parallel buffer indices
    self.buffer_indices = np.zeros(
        shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='int')
    )

    self.timestep = 0
    self.episode = 0
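
# Illustrative sketch (assumption, not part of the source): nested states/actions
# specifications of the form described in the docstring above; the names and values
# are made up.
states = dict(
    position=dict(type='float', shape=(2,), min_value=-1.0, max_value=1.0),
    inventory=dict(type='int', shape=(4,), num_states=10)
)
actions = dict(
    move=dict(type='int', shape=(), num_actions=5),
    strength=dict(type='float', shape=(), min_value=0.0, max_value=1.0)
)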