def __init__(self, capacity=1000, alpha=1.0, beta=1.0):
    """
    Args:
        capacity (int): Maximum number of records to store.
        alpha (float): Degree to which prioritization is used (0.0 = uniform sampling).
        beta (float): Importance-sampling correction exponent.
    """
    super(ApexMemory, self).__init__()

    self.memory_values = []
    self.index = 0
    self.capacity = capacity
    self.size = 0
    self.max_priority = 1.0
    self.alpha = alpha
    self.beta = beta
    self.default_new_weight = np.power(self.max_priority, self.alpha)

    # Round the priority capacity up to the next power of two so the segment
    # trees form complete binary trees.
    self.priority_capacity = 1
    while self.priority_capacity < self.capacity:
        self.priority_capacity *= 2

    # Create segment trees, initialize with neutral elements.
    sum_values = [0.0 for _ in range_(2 * self.priority_capacity)]
    sum_segment_tree = MemSegmentTree(sum_values, self.priority_capacity, operator.add)
    min_values = [float('inf') for _ in range_(2 * self.priority_capacity)]
    min_segment_tree = MemSegmentTree(min_values, self.priority_capacity, min)
    self.merged_segment_tree = MinSumSegmentTree(
        sum_tree=sum_segment_tree,
        min_tree=min_segment_tree,
        capacity=self.priority_capacity
    )
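For context, a minimal sketch of why the memory keeps a sum tree and a min tree side by side: the sum tree answers prefix-sum queries for proportional sampling in O(log n), while the min tree tracks the smallest priority (needed for the maximal importance weight). The toy below is self-contained and assumes nothing about MemSegmentTree's actual API:

import random

def build_sum_tree(priorities, capacity):
    # Leaves live at indices [capacity, 2 * capacity); inner node i = child(2i) + child(2i + 1).
    values = [0.0] * (2 * capacity)
    for i, p in enumerate(priorities):
        values[capacity + i] = p
    for i in range(capacity - 1, 0, -1):
        values[i] = values[2 * i] + values[2 * i + 1]
    return values

def sample_index(tree, capacity):
    # Walk down from the root, choosing the child whose mass covers the random prefix sum.
    mass = random.random() * tree[1]
    i = 1
    while i < capacity:
        if tree[2 * i] > mass:
            i = 2 * i
        else:
            mass -= tree[2 * i]
            i = 2 * i + 1
    return i - capacity

tree = build_sum_tree([1.0, 4.0, 2.0, 1.0], capacity=4)
print(sample_index(tree, capacity=4))  # Index 1 is drawn with probability 4/8.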
def test_rlgraph_sampling(self):
    """
    Tests RLgraph's sampling performance.
    """
    memory = ApexMemory(
        capacity=self.capacity,
        alpha=1.0
    )
    records = [self.record_space.sample(size=1) for _ in range_(self.inserts)]
    for record in records:
        memory.insert_records((
            ray_compress(record['states']),
            record['actions'],
            record['reward'],
            record['terminals'],
            None
        ))

    start = time.monotonic()
    for _ in range_(self.samples):
        batch_tuple = memory.get_records(self.sample_batch_size)
    end = time.monotonic() - start
    tp = self.samples / end
    print('#### Testing RLGraph Prioritized Replay memory ####')
    print('Testing sampling performance:')
    print('Sampled {} batches, throughput: {} batches/s, total time: {} s'.format(
        self.samples, tp, end
    ))
def test_rlgraph_combined_ops(self):
    """
    Tests a combined workflow of insert, sample, update on the prioritized replay memory.
    """
    memory = ApexMemory(
        capacity=self.capacity,
        alpha=1.0
    )
    chunksize = 32
    chunks = int(self.inserts / chunksize)
    records = [self.record_space.sample(size=chunksize) for _ in range_(chunks)]
    loss_values = [np.random.random(size=self.sample_batch_size) for _ in range_(chunks)]

    start = time.monotonic()
    for chunk, loss in zip(records, loss_values):
        # Each record now is a chunk.
        for i in range_(chunksize):
            memory.insert_records((
                ray_compress(chunk['states'][i]),
                chunk['actions'][i],
                chunk['reward'][i],
                chunk['terminals'][i],
                None
            ))
        batch, indices, weights = memory.get_records(self.sample_batch_size)
        memory.update_records(indices, loss)
    end = time.monotonic() - start
    tp = len(records) / end
    print('RLGraph: Testing combined op performance:')
    print('Ran {} combined ops, throughput: {} combined ops/s, total time: {} s'.format(
        len(records), tp, end
    ))
def get_list_registry(from_space, capacity=None, initializer=0, flatten=True, add_batch_rank=False):
    """
    Creates a list storage for a space by providing an ordered dict mapping space names
    to empty lists.

    Args:
        from_space: Space to create the registry from.
        capacity (Optional[int]): Optional capacity to initialize the lists with.
        initializer (Optional[any]): Optional initializer for the lists if capacity is not None.
        flatten (bool): Whether to produce a FlattenedDataOp with auto-keys.
        add_batch_rank (Optional[bool,int]): If from_space is given and is True, will add a 0th rank (None) to
            the created variable. If it is an int, will add that int instead of None.
            Default: False.

    Returns:
        dict: Container dict mapping flattened space keys to (empty or pre-initialized) lists.
    """
    if flatten:
        if capacity is not None:
            var = from_space.flatten(
                custom_scope_separator="-", scope_separator_at_start=False,
                mapping=lambda k, primitive: [initializer for _ in range_(capacity)]
            )
        else:
            var = from_space.flatten(
                custom_scope_separator="-", scope_separator_at_start=False,
                mapping=lambda k, primitive: []
            )
    else:
        if capacity is not None:
            var = [initializer for _ in range_(capacity)]
        else:
            var = []
    return var
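A self-contained mimic of the flatten-with-mapping mechanic above (the real version delegates to from_space.flatten(); the flat key names below are illustrative):

def mimic_list_registry(flat_keys, capacity=None, initializer=0):
    # One list per flattened key, pre-filled when a capacity is given.
    return {
        key: [initializer for _ in range(capacity)] if capacity is not None else []
        for key in flat_keys
    }

registry = mimic_list_registry(["states", "rewards"], capacity=3, initializer=0.0)
print(registry)  # {'states': [0.0, 0.0, 0.0], 'rewards': [0.0, 0.0, 0.0]}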
def test_rlgraph_updating(self):
    """
    Tests RLGraph's update performance.
    """
    memory = ApexMemory(
        capacity=self.capacity,
        alpha=1.0
    )
    records = [self.record_space.sample(size=1) for _ in range_(self.inserts)]
    for record in records:
        memory.insert_records((
            record['states'],
            record['actions'],
            record['reward'],
            record['terminals'],
            None
        ))
    loss_values = [np.random.random(size=self.sample_batch_size) for _ in range_(self.samples)]
    indices = [np.random.randint(low=0, high=self.inserts, size=self.sample_batch_size)
               for _ in range_(self.samples)]

    start = time.monotonic()
    for index, loss in zip(indices, loss_values):
        memory.update_records(index, loss)
    end = time.monotonic() - start
    tp = len(indices) / end
    print('#### Testing RLGraph Prioritized Replay memory ####')
    print('Testing updating performance:')
    print('Updated {} loss batches, throughput: {} updates/s, total time: {} s'.format(
        len(indices), tp, end
    ))
def test_ray_sampling(self):
    """
    Tests Ray's sampling performance.
    """
    assert get_distributed_backend() == "ray"
    memory = PrioritizedReplayBuffer(
        size=self.capacity,
        alpha=1.0,
        clip_rewards=True
    )
    records = [self.record_space.sample(size=1) for _ in range_(self.inserts)]
    for record in records:
        memory.add(
            obs_t=ray_compress(record['states']),
            action=record['actions'],
            reward=record['reward'],
            obs_tp1=ray_compress(record['states']),
            done=record['terminals'],
            weight=None
        )

    start = time.monotonic()
    for _ in range_(self.samples):
        batch_tuple = memory.sample(self.sample_batch_size, beta=1.0)
    end = time.monotonic() - start
    tp = self.samples / end
    print('#### Testing Ray Prioritized Replay memory ####')
    print('Testing sampling performance:')
    print('Sampled {} batches, throughput: {} batches/s, total time: {} s'.format(
        self.samples, tp, end
    ))
def test_box_spaces(self):
    """
    Tests all BoxSpaces via a sample/contains loop. With and without batch-rank,
    different batch sizes, and different low/high combinations (including no bounds).
    """
    for class_ in [FloatBox, IntBox, BoolBox, TextBox]:
        for add_batch_rank in [False, True]:
            # TODO: Test time-rank more thoroughly.
            for add_time_rank in [False, True]:
                if class_ != BoolBox and class_ != TextBox:
                    for low, high in [(None, None), (-1.0, 10.0), ((1.0, 2.0), (3.0, 4.0)),
                                      (((1.0, 2.0, 3.0), (4.0, 5.0, 6.0)),
                                       ((7.0, 8.0, 9.0), (10.0, 11.0, 12.0)))]:
                        space = class_(low=low, high=high, add_batch_rank=add_batch_rank,
                                       add_time_rank=add_time_rank)
                        if add_batch_rank is False:
                            sample = space.sample()
                            self.assertTrue(space.contains(sample))
                        else:
                            for batch_size in range_(1, 4):
                                samples = space.sample(size=batch_size)
                                for s in samples:
                                    self.assertTrue(space.contains(s))
                        # TODO: test zero() method properly for all cases.
                        #all_0s = space.zeros()
                        #self.assertTrue(all(v == 0 for v in all_0s))
                else:
                    space = class_(add_batch_rank=add_batch_rank, add_time_rank=add_time_rank)
                    if add_batch_rank is False:
                        sample = space.sample()
                        self.assertTrue(space.contains(sample))
                    else:
                        for batch_size in range_(1, 4):
                            samples = space.sample(size=batch_size)
                            for s in samples:
                                self.assertTrue(space.contains(s))
def test_ray_updating(self):
    """
    Tests Ray's update performance.
    """
    assert get_distributed_backend() == "ray"
    memory = PrioritizedReplayBuffer(
        size=self.capacity,
        alpha=1.0,
        clip_rewards=True
    )
    records = [self.record_space.sample(size=1) for _ in range_(self.inserts)]
    for record in records:
        memory.add(
            obs_t=record['states'],
            action=record['actions'],
            reward=record['reward'],
            obs_tp1=record['states'],
            done=record['terminals'],
            weight=None
        )
    loss_values = [np.random.random(size=self.sample_batch_size) for _ in range_(self.samples)]
    indices = [np.random.randint(low=0, high=self.inserts, size=self.sample_batch_size)
               for _ in range_(self.samples)]

    start = time.monotonic()
    for index, loss in zip(indices, loss_values):
        memory.update_priorities(index, loss)
    end = time.monotonic() - start
    tp = len(indices) / end
    print('#### Testing Ray Prioritized Replay memory ####')
    print('Testing updating performance:')
    print('Updated {} loss batches, throughput: {} updates/s, total time: {} s'.format(
        len(indices), tp, end
    ))
def _graph_fn_call(self, inputs):
    """
    Resizes images, which may come in with or without a batch dimension.
    """
    if self.backend == "python" or get_backend() == "python":
        if isinstance(inputs, list):
            inputs = np.asarray(inputs)
        had_single_color_dim = (inputs.shape[-1] == 1)
        # Batch of samples.
        if inputs.ndim == 4:
            resized = []
            for i in range_(len(inputs)):
                resized.append(cv2.resize(
                    inputs[i], dsize=(self.width, self.height),
                    interpolation=self.cv2_interpolation
                ))
            resized = np.asarray(resized)
        # Single sample.
        else:
            resized = cv2.resize(inputs, dsize=(self.width, self.height),
                                 interpolation=self.cv2_interpolation)
        # cv2.resize removes the color rank if its dimension is 1 (e.g. grayscale): add it back here.
        if had_single_color_dim is True:
            resized = np.expand_dims(resized, axis=-1)
        return resized
    elif get_backend() == "pytorch":
        if isinstance(inputs, list):
            inputs = torch.tensor(inputs)
        had_single_color_dim = (inputs.shape[-1] == 1)
        # Batch of samples.
        if len(inputs.shape) == 4:
            resized = []
            for i in range_(len(inputs)):
                # Get numpy array to resize, then convert back.
                resized.append(cv2.resize(
                    inputs[i].numpy(), dsize=(self.width, self.height),
                    interpolation=self.cv2_interpolation
                ))
            resized = torch.tensor(resized)
        # Single sample.
        else:
            resized = cv2.resize(inputs.numpy(), dsize=(self.width, self.height),
                                 interpolation=self.cv2_interpolation)
            # cv2 returns a numpy array; convert back to a tensor.
            resized = torch.tensor(resized)
        # cv2.resize removes the color rank if its dimension is 1 (e.g. grayscale): add it back here.
        if had_single_color_dim is True:
            resized = torch.unsqueeze(resized, dim=-1)
        return resized
    elif get_backend() == "tf":
        return tf.image.resize_images(images=inputs, size=(self.width, self.height),
                                      method=self.tf_interpolation)
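The channel-restoring step above exists because OpenCV drops a trailing channel of size 1. A quick self-contained check:

import cv2
import numpy as np

gray = np.zeros((84, 84, 1), dtype=np.float32)
resized = cv2.resize(gray, dsize=(42, 42), interpolation=cv2.INTER_AREA)
print(resized.shape)                           # (42, 42): the color rank is gone.
print(np.expand_dims(resized, axis=-1).shape)  # (42, 42, 1): restored.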
def _truncate_n_step(self, states, actions, rewards, next_states, terminals, was_terminal=True):
    """
    Computes n-step truncation for exactly one episode segment of one environment.

    Returns:
        n-step truncated (shortened) version.
    """
    if self.n_step_adjustment > 1:
        new_len = len(states) - self.n_step_adjustment + 1
        # There are two cases. If the trajectory did not end in a terminal,
        # we just have to move states forward and truncate.
        if was_terminal:
            # We know the ONLY terminal is the last one.
            terminal_position = len(rewards) - 1
            for i in range_(len(rewards)):
                for j in range_(1, self.n_step_adjustment):
                    # Outside sample data: stop the inner loop.
                    if i + j >= len(next_states):
                        break
                    # Normal case: No terminal ahead (so far) in the n-step sequence.
                    if i + j < terminal_position:
                        next_states[i] = next_states[i + j]
                        rewards[i] += self.discount ** j * rewards[i + j]
                    # Terminal ahead: Don't go beyond it.
                    # Repeat it for the remaining n-steps and always assume r=0.0.
                    else:
                        next_states[i] = next_states[terminal_position]
                        terminals[i] = True
                        if i + j <= terminal_position:
                            rewards[i] += self.discount ** j * rewards[i + j]
        else:
            # We know this segment does not contain any terminals, so we simply
            # have to adjust next states and rewards.
            for i in range_(len(rewards) - self.n_step_adjustment + 1):
                for j in range_(1, self.n_step_adjustment):
                    next_states[i] = next_states[i + j]
                    rewards[i] += self.discount ** j * rewards[i + j]
        if self.agent.flat_action_space is not None:
            for arr in [states, rewards, next_states, terminals]:
                del arr[new_len:]
            # Delete container actions separately.
            for name in self.agent.flat_action_space.keys():
                del actions[name][new_len:]
        else:
            for arr in [states, actions, rewards, next_states, terminals]:
                del arr[new_len:]
    return states, actions, rewards, next_states, terminals
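To see what the reward loop computes, here is a minimal standalone example of the same accumulation for the no-terminal branch (discount 0.9, n_step_adjustment 3); only the first len - n + 1 entries survive truncation:

discount, n_step = 0.9, 3
rewards = [1.0, 1.0, 1.0, 1.0]

n_step_rewards = list(rewards)
for i in range(len(rewards) - n_step + 1):
    for j in range(1, n_step):
        n_step_rewards[i] += discount ** j * rewards[i + j]

# Each surviving entry is r_t + 0.9 * r_{t+1} + 0.81 * r_{t+2} = 2.71.
print(n_step_rewards[:len(rewards) - n_step + 1])  # [2.71, 2.71] (up to float rounding)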
def test_ray_prioritized_replay_insert(self):
    """
    Tests Ray's insert performance.
    """
    assert get_distributed_backend() == "ray"
    memory = PrioritizedReplayBuffer(
        size=self.capacity,
        alpha=1.0,
        clip_rewards=True
    )
    # Test individual inserts.
    records = [self.record_space.sample(size=1) for _ in range_(self.inserts)]

    start = time.monotonic()
    for record in records:
        memory.add(
            obs_t=record['states'],
            action=record['actions'],
            reward=record['reward'],
            obs_tp1=record['states'],
            done=record['terminals'],
            weight=None
        )
    end = time.monotonic() - start
    tp = len(records) / end
    print('#### Testing Ray Prioritized Replay memory ####')
    print('Testing insert performance:')
    print('Inserted {} separate records, throughput: {} records/s, total time: {} s'.format(
        len(records), tp, end
    ))

    memory = PrioritizedReplayBuffer(
        size=self.capacity,
        alpha=1.0,
        clip_rewards=True
    )
    # Test chunked inserts -> done via external for-loop in Ray.
    chunks = int(self.inserts / self.chunksize)
    records = [self.record_space.sample(size=self.chunksize) for _ in range_(chunks)]

    start = time.monotonic()
    for chunk in records:
        for i in range_(self.chunksize):
            memory.add(
                obs_t=chunk['states'][i],
                action=chunk['actions'][i],
                reward=chunk['reward'][i],
                obs_tp1=chunk['states'][i],
                done=chunk['terminals'][i],
                weight=None
            )
    end = time.monotonic() - start
    tp = len(records) * self.chunksize / end
    print('Testing chunked insert performance:')
    print('Inserted {} chunks, throughput: {} records/s, total time: {} s'.format(
        len(records), tp, end
    ))
def test_rlgraph_apex_insert(self):
    """
    Tests RLgraph's python memory insert performance.
    """
    memory = ApexMemory(
        capacity=self.capacity,
        alpha=1.0
    )
    # Test individual inserts.
    records = [self.record_space.sample(size=1) for _ in range_(self.inserts)]

    start = time.monotonic()
    for record in records:
        memory.insert_records((
            record['states'],
            record['actions'],
            record['reward'],
            record['terminals'],
            None
        ))
    end = time.monotonic() - start
    tp = len(records) / end
    print('#### Testing RLGraph python prioritized replay ####')
    print('Testing insert performance:')
    print('Inserted {} separate records, throughput: {} records/s, total time: {} s'.format(
        len(records), tp, end
    ))

    memory = ApexMemory(
        capacity=self.capacity,
        alpha=1.0
    )
    # Test chunked inserts.
    chunks = int(self.inserts / self.chunksize)
    records = [self.record_space.sample(size=self.chunksize) for _ in range_(chunks)]

    start = time.monotonic()
    for chunk in records:
        for i in range_(self.chunksize):
            memory.insert_records((
                chunk['states'][i],
                chunk['actions'][i],
                chunk['reward'][i],
                chunk['terminals'][i],
                None
            ))
    end = time.monotonic() - start
    tp = len(records) * self.chunksize / end
    print('Testing chunked insert performance:')
    print('Inserted {} chunks, throughput: {} records/s, total time: {} s'.format(
        len(records), tp, end
    ))
def __init__(self, agent, env_spec=None, num_environments=1, frameskip=1, render=False,
             worker_executes_exploration=True, exploration_epsilon=0.1, episode_finish_callback=None):
    """
    Args:
        agent (Agent): Agent to execute environment on.
        env_spec (Optional[Union[callable, dict]]): Either an environment spec or a callable
            returning a new environment.
        num_environments (int): How many single Environments should be run in parallel in a
            SequentialVectorEnv.
        frameskip (int): How often actions are repeated after retrieving them from the agent.
            This setting can be overwritten in the single calls to the different `execute_..` methods.
        render (bool): Whether to render the environment after each action. Default: False.
        worker_executes_exploration (bool): If True, the worker performs exploration by sampling.
        exploration_epsilon (Optional[float]): Epsilon to use if the worker executes exploration.
        episode_finish_callback (Optional[callable]): Optional callback invoked after each finished episode.
    """
    super(Worker, self).__init__()
    self.num_environments = num_environments
    self.logger = logging.getLogger(__name__)

    # A VectorEnv was passed in directly -> Use that one.
    if isinstance(env_spec, VectorEnv):
        self.vector_env = env_spec
        self.num_environments = self.vector_env.num_environments
        self.env_ids = ["env_{}".format(i) for i in range_(self.num_environments)]
    # `env_spec` is for single envs inside a SequentialVectorEnv.
    elif env_spec is not None:
        self.vector_env = SequentialVectorEnv(env_spec=env_spec, num_environments=self.num_environments)
        self.env_ids = ["env_{}".format(i) for i in range_(self.num_environments)]
    # No env_spec given.
    else:
        self.vector_env = None
        self.env_ids = []

    self.agent = agent
    self.frameskip = frameskip
    self.render = render

    # Update schedule if worker is performing updates.
    self.updating = None
    self.steps_before_update = None
    self.update_interval = None
    self.update_steps = None
    self.sync_interval = None
    self.episodes_since_update = 0
    self.update_mode = "time_steps"

    self.worker_executes_exploration = worker_executes_exploration
    self.exploration_epsilon = exploration_epsilon
    self.episode_finish_callback = episode_finish_callback
def test_copying_a_component(self):
    # Flatten a simple 2x2 FloatBox to (4,).
    space = FloatBox(shape=(2, 2), add_batch_rank=False)

    flatten_orig = Flatten()
    flatten_copy = flatten_orig.copy(scope="flatten-copy")
    component_to_test = Component(
        flatten_orig, flatten_copy,
        inputs=["input1", "input2"],
        outputs=["output1", "output2"],
        connections=[
            ["input1", ["flatten", "input"]],
            ["input2", ["flatten-copy", "input"]],
            [["flatten", "output"], "output1"],
            [["flatten-copy", "output"], "output2"]
        ]
    )
    test = ComponentTest(component=component_to_test, input_spaces=dict(input1=space, input2=space))

    input_ = dict(
        input1=np.array([[0.5, 2.0], [1.0, 2.0]]),
        input2=np.array([[1.0, 2.0], [3.0, 4.0]])
    )
    expected = dict(
        output1=np.array([0.5, 2.0, 1.0, 2.0]),
        output2=np.array([1.0, 2.0, 3.0, 4.0])
    )
    for i in range_(2):
        test.test(out_socket_names="output" + str(i + 1), inputs=input_,
                  expected_outputs=expected["output" + str(i + 1)])
def test_insert(self):
    """
    Simply tests insert op without checking internal logic.
    """
    memory = MemPrioritizedReplay(
        capacity=self.capacity,
        next_states=True,
        alpha=self.alpha,
        beta=self.beta
    )
    memory.create_variables(self.input_spaces)

    observation = memory.record_space_flat.sample(size=1)
    memory.insert_records(observation)

    # Test chunked insert.
    observation = memory.record_space_flat.sample(size=5)
    memory.insert_records(observation)

    # Also test Apex version.
    memory = ApexMemory(
        capacity=self.capacity,
        alpha=self.alpha,
        beta=self.beta
    )
    observation = self.apex_space.sample(size=5)
    for i in range_(5):
        memory.insert_records((
            observation['states'][i],
            observation['actions'][i],
            observation['reward'][i],
            observation['terminals'][i],
            observation['states'][i],
            observation["weights"][i]
        ))
def test_individual_env(self):
    env = Environment.from_spec(self.env_spec)
    agent = Agent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        config_from_path("configs/dqn_agent_for_pong.json"),
        state_space=env.state_space,
        # Try with "reduced" action space (actually only 3 actions: up, down, no-op).
        action_space=env.action_space
    )
    state = env.reset()
    start = time.monotonic()
    ep_length = 0
    for _ in range_(self.samples):
        action = agent.get_action(state)
        state, reward, terminal, info = env.step(action)
        ep_length += 1
        if terminal:
            print("reset after {} states".format(ep_length))
            env.reset()
            ep_length = 0
    runtime = time.monotonic() - start
    tp = self.samples / runtime

    print('Testing individual env {} performance:'.format(self.env_spec["gym_env"]))
    print('Ran {} steps, throughput: {} states/s, total time: {} s'.format(
        self.samples, tp, runtime
    ))
def test_simple_python_preprocessor_stack(self):
    """
    Tests a pure python preprocessor stack.
    """
    space = FloatBox(shape=(2,), add_batch_rank=True)
    # Python PreprocessorStack: multiplying by 0.5 and then dividing by 0.5 is the identity.
    multiply = dict(type="multiply", factor=0.5, scope="m")
    divide = dict(type="divide", divisor=0.5, scope="d")
    stack = PreprocessorStack(multiply, divide, backend="python")
    for sub_comp_scope in ["m", "d"]:
        stack.sub_components[sub_comp_scope].create_variables(input_spaces=dict(inputs=space))
    #test = ComponentTest(component=stack, input_spaces=dict(inputs=float))

    for _ in range_(3):
        # Call fake API-method directly (ok for PreprocessorStack).
        stack.reset()
        input_ = np.asarray([[1.0], [2.0], [3.0], [4.0]])
        expected = input_
        #test.test(("preprocess", input_), expected_outputs=expected)
        out = stack.preprocess(input_)
        recursive_assert_almost_equal(out, expected)

        input_ = space.sample()
        #test.test(("preprocess", input_), expected_outputs=input_)
        out = stack.preprocess(input_)
        recursive_assert_almost_equal(out, input_)
def observe(self, env_sample):
    """
    Observes experience(s).

    N.b.: For performance reasons, the data layout is slightly different for Ape-X.
    """
    records = env_sample.get_batch()
    num_records = len(records['states'])

    # TODO port to tf PR behaviour.
    if self.clip_rewards:
        rewards = np.sign(records["rewards"])
    else:
        rewards = records["rewards"]
    for i in range_(num_records):
        # If actions come as a dict with one vector per key, convert to a single dict per record.
        if isinstance(records["actions"], dict):
            action = {k: v[i] for k, v in records["actions"].items()}
        else:
            action = records["actions"][i]
        self.memory.insert_records((
            records["states"][i],
            action,
            rewards[i],
            records["terminals"][i],
            records["next_states"][i],
            records["importance_weights"][i]
        ))
def test_python_sequence_preprocessor(self):
    seq_len = 3
    space = FloatBox(shape=(1,), add_batch_rank=True)
    sequencer = Sequence(sequence_length=seq_len, batch_size=4, add_rank=True, backend="python")
    sequencer.create_variables(input_spaces=dict(preprocessing_inputs=space))
    #test = ComponentTest(component=sequencer, input_spaces=dict(apply=space))

    for _ in range_(3):
        sequencer._graph_fn_reset()
        self.assertEqual(sequencer.index, -1)

        input_ = np.asarray([[1.0], [2.0], [3.0], [4.0]])
        out = sequencer._graph_fn_apply(input_)
        self.assertEqual(sequencer.index, 0)
        recursive_assert_almost_equal(
            out, np.asarray([[[1.0, 1.0, 1.0]], [[2.0, 2.0, 2.0]], [[3.0, 3.0, 3.0]], [[4.0, 4.0, 4.0]]])
        )

        input_ = np.asarray([[1.1], [2.2], [3.3], [4.4]])
        out = sequencer._graph_fn_apply(input_)
        self.assertEqual(sequencer.index, 1)
        recursive_assert_almost_equal(
            out, np.asarray([[[1.0, 1.0, 1.1]], [[2.0, 2.0, 2.2]], [[3.0, 3.0, 3.3]], [[4.0, 4.0, 4.4]]])
        )

        input_ = np.asarray([[1.11], [2.22], [3.33], [4.44]])
        out = sequencer._graph_fn_apply(input_)
        self.assertEqual(sequencer.index, 2)
        recursive_assert_almost_equal(
            out, np.asarray([[[1.0, 1.1, 1.11]], [[2.0, 2.2, 2.22]], [[3.0, 3.3, 3.33]], [[4.0, 4.4, 4.44]]])
        )

        input_ = np.asarray([[10], [20], [30], [40]])
        out = sequencer._graph_fn_apply(input_)
        self.assertEqual(sequencer.index, 0)
        recursive_assert_almost_equal(
            out, np.asarray([[[1.1, 1.11, 10]], [[2.2, 2.22, 20]], [[3.3, 3.33, 30]], [[4.4, 4.44, 40]]])
        )
def update_if_necessary(self, timesteps_executed):
    """
    Calls update on the agent according to the update schedule set for this worker.

    Args:
        timesteps_executed (int): Timesteps executed thus far.

    Returns:
        float: The summed up loss (over all self.update_steps).
    """
    if self.updating:
        # Are we allowed to update?
        if timesteps_executed > self.steps_before_update and \
                (self.agent.observe_spec["buffer_enabled"] is False or  # No update before some data is in the buffer.
                 timesteps_executed >= self.agent.observe_spec["buffer_size"]) and \
                timesteps_executed % self.update_interval == 0:  # Update-frequency check.
            loss = 0
            for _ in range_(self.update_steps):
                loss += self.agent.update()
            return loss
    return None
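As a worked example of the three gate conditions above (all schedule values below are hypothetical): with steps_before_update=100, buffer_size=64 (buffering enabled), and update_interval=4, the first update fires at the first timestep past 100 that is also past the buffer size and divisible by the interval:

# Hypothetical schedule values, mirroring the three gate conditions above.
steps_before_update, buffer_size, update_interval = 100, 64, 4

update_steps = [
    t for t in range(1, 121)
    if t > steps_before_update and t >= buffer_size and t % update_interval == 0
]
print(update_steps)  # [104, 108, 112, 116, 120]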
def test_copying_a_component(self):
    # Flatten a simple 2x2 FloatBox to (4,).
    space = FloatBox(shape=(2, 2), add_batch_rank=False)

    flatten_orig = ReShape(flatten=True, scope="A")
    flatten_copy = flatten_orig.copy(scope="B")
    container = Component(flatten_orig, flatten_copy)

    @rlgraph_api(component=container)
    def flatten1(self, input_):
        return self.sub_components["A"].apply(input_)

    @rlgraph_api(component=container)
    def flatten2(self, input_):
        return self.sub_components["B"].apply(input_)

    test = ComponentTest(component=container, input_spaces=dict(input_=space))

    input_ = dict(
        input1=np.array([[0.5, 2.0], [1.0, 2.0]]),
        input2=np.array([[1.0, 2.0], [3.0, 4.0]])
    )
    expected = dict(
        output1=np.array([0.5, 2.0, 1.0, 2.0]),
        output2=np.array([1.0, 2.0, 3.0, 4.0])
    )
    for i in range_(1, 3):
        test.test(("flatten" + str(i), input_["input" + str(i)]),
                  expected_outputs=expected["output" + str(i)])
def __init__(self, num_environments, env_spec, num_background_envs=1, async_reset=False):
    """
    Args:
        num_environments (int): Number of environments to run sequentially.
        env_spec (Union[dict, callable]): An environment spec or a callable returning a new
            environment object.
        num_background_envs (Optional[int]): Number of environments asynchronously reset in the
            background. Needs to be calibrated depending on reset cost.
        async_reset (Optional[bool]): If True, resets envs asynchronously in another thread.
    """
    self.environments = []
    for _ in range_(num_environments):
        if isinstance(env_spec, dict):
            env = Environment.from_spec(env_spec)
        elif hasattr(env_spec, '__call__'):
            env = env_spec()
        else:
            raise ValueError(
                "Env_spec must be either a dict containing an environment spec or a callable "
                "returning a new environment object."
            )
        self.environments.append(env)

    super(SequentialVectorEnv, self).__init__(
        num_environments=num_environments,
        state_space=self.environments[0].state_space,
        action_space=self.environments[0].action_space
    )

    self.async_reset = async_reset
    if self.async_reset:
        self.resetter = ThreadedResetter(env_spec, num_background_envs)
    else:
        self.resetter = Resetter()
def create_remote_workers(self, cls, num_actors, agent_config, worker_spec, *args):
    """
    Creates Ray actors for remote execution.

    Args:
        cls (Union[RayValueWorker, RayPolicyWorker]): RayActor class.
        num_actors (int): Number of Ray actors to create.
        agent_config (dict): Agent config.
        worker_spec (dict): Worker spec.
        *args (any): Arguments for the RayActor class.

    Returns:
        list: Remote Ray actors.
    """
    workers = []
    cls_as_remote = cls.as_remote(num_cpus=self.num_cpus_per_worker,
                                  num_gpus=self.num_gpus_per_worker).remote

    # Create remote objects and schedule init tasks.
    ray_constant_exploration = worker_spec.get("ray_constant_exploration", False)
    for i in range_(num_actors):
        if ray_constant_exploration is True:
            exploration_val = worker_exploration(i, num_actors)
            worker_spec["ray_exploration"] = exploration_val
        worker = cls_as_remote(deepcopy(agent_config), worker_spec, *args)
        self.worker_ids[worker] = "worker_{}".format(i)
        workers.append(worker)
        self.logger.info("Successfully built agent num {}.".format(i))
    return workers
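For context on per-worker constant exploration: a common scheme (from the Ape-X paper) gives worker i of N the value epsilon^(1 + i/(N-1) * alpha), so exploration decreases geometrically across workers. The sketch below assumes `worker_exploration` follows this scheme; the actual helper may differ:

# Sketch of per-worker constant epsilons as in the Ape-X paper (epsilon=0.4, alpha=7);
# the real `worker_exploration` helper may use different constants or a different scheme.
def apex_exploration(worker_index, num_workers, epsilon=0.4, alpha=7.0):
    if num_workers <= 1:
        return epsilon
    return epsilon ** (1.0 + worker_index / (num_workers - 1.0) * alpha)

print([round(apex_exploration(i, 8), 5) for i in range(8)])
# Worker 0 explores the most (0.4); worker 7 the least (0.4 ** 8 ~= 0.00066).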
def test_gaussian_noise(self):
    real_mean = 10.0
    real_sd = 2.0
    noise_component = GaussianNoise(mean=real_mean, stddev=real_sd)
    test = ComponentTest(component=noise_component, input_spaces=None,
                         action_space=self.action_input_space)

    # Collect outputs in `collected` list to compare moments.
    collected = list()
    collect_outs = lambda component_test, outs: collected.append(outs)
    for _ in range_(1000):
        test.test(("get_noise", None), fn_test=collect_outs)

    test_mean = np.mean(collected)
    test_sd = np.std(collected)

    # The empirical mean should be within 2 sd of the real mean.
    self.assertGreater(real_mean, test_mean - test_sd * 2)
    self.assertLess(real_mean, test_mean + test_sd * 2)

    # The empirical sd should be within the 80%-120% interval.
    self.assertGreater(real_sd, test_sd * 0.8)
    self.assertLess(real_sd, test_sd * 1.2)
def test_sequence_preprocessor_with_container_space(self):
    # Test with no batch rank.
    space = Tuple(
        FloatBox(shape=(1,)),
        FloatBox(shape=(2, 2)),
        add_batch_rank=False
    )

    component_to_test = Sequence(sequence_length=4, add_rank=False)
    test = ComponentTest(component=component_to_test, input_spaces=dict(preprocessing_inputs=space))

    for i in range_(3):
        test.test("reset")
        test.test(
            ("apply", np.array([np.array([0.5]), np.array([[0.6, 0.7], [0.8, 0.9]])])),
            expected_outputs=(
                np.array([0.5, 0.5, 0.5, 0.5]),
                np.array([[0.6, 0.7] * 4, [0.8, 0.9] * 4])
            )
        )
        test.test(
            ("apply", np.array([np.array([0.6]), np.array([[1.1, 1.1], [1.1, 1.1]])])),
            expected_outputs=(
                np.array([0.5, 0.5, 0.5, 0.6]),
                np.array([[0.6, 0.7, 0.6, 0.7, 0.6, 0.7, 1.1, 1.1],
                          [0.8, 0.9, 0.8, 0.9, 0.8, 0.9, 1.1, 1.1]])
            )
        )
        test.test(
            ("apply", np.array([np.array([0.7]), np.array([[2.0, 2.1], [2.2, 2.3]])])),
            expected_outputs=(
                np.array([0.5, 0.5, 0.6, 0.7]),
                np.array([[0.6, 0.7, 0.6, 0.7, 1.1, 1.1, 2.0, 2.1],
                          [0.8, 0.9, 0.8, 0.9, 1.1, 1.1, 2.2, 2.3]])
            )
        )
def test_ornstein_uhlenbeck_noise(self):
    ou_theta = 0.15
    ou_mu = 10.0
    ou_sigma = 2.0
    noise_component = OrnsteinUhlenbeckNoise(theta=ou_theta, mu=ou_mu, sigma=ou_sigma)
    test = ComponentTest(component=noise_component, action_space=self.action_input_space)

    # Collect outputs in `collected` list to compare moments.
    collected = list()
    collect_outs = lambda component_test, outs: collected.append(outs)
    for _ in range_(1000):
        test.test(("get_noise", None), fn_test=collect_outs)

    test_mean = np.mean(collected)
    test_sd = np.std(collected)
    print("Moments: {} / {}".format(test_mean, test_sd))

    # The empirical mean should be within 2 sd of the real mean.
    self.assertGreater(ou_mu, test_mean - test_sd * 2)
    self.assertLess(ou_mu, test_mean + test_sd * 2)

    # The empirical sd should be within the 45%-200% interval.
    self.assertGreater(ou_sigma, test_sd * 0.45)
    self.assertLess(ou_sigma, test_sd * 2.0)
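The wide sd tolerance makes sense given the process under test. A common minimal discretization of an Ornstein-Uhlenbeck process (the component's exact update rule is an assumption here) is x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1); its samples are autocorrelated, so the stationary sd exceeds sigma and empirical moments converge slowly:

import numpy as np

def ou_samples(theta=0.15, mu=10.0, sigma=2.0, n=1000, seed=0):
    # Minimal discrete OU update: x += theta * (mu - x) + sigma * N(0, 1).
    rng = np.random.RandomState(seed)
    x, out = mu, []
    for _ in range(n):
        x += theta * (mu - x) + sigma * rng.normal()
        out.append(x)
    return np.asarray(out)

samples = ou_samples()
print(samples.mean(), samples.std())  # Mean near 10; sd well above sigma due to autocorrelation.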
def test_sequence_preprocessor(self):
    space = FloatBox(shape=(1,), add_batch_rank=True)
    sequencer = Sequence(sequence_length=3, add_rank=True)
    test = ComponentTest(component=sequencer, input_spaces=dict(preprocessing_inputs=space))

    variables = sequencer.get_variables("index", "buffer", global_scope=False)
    index, buffer = variables["index"], variables["buffer"]

    for _ in range_(3):
        test.test("reset")
        index_value, buffer_value = test.read_variable_values(index, buffer)
        self.assertEqual(index_value, -1)
        test.test(("apply", np.array([[0.1]])), expected_outputs=np.array([[[0.1, 0.1, 0.1]]]))
        index_value, buffer_value = test.read_variable_values(index, buffer)
        self.assertEqual(index_value, 0)
        test.test(("apply", np.array([[0.2]])), expected_outputs=np.array([[[0.1, 0.1, 0.2]]]))
        index_value, buffer_value = test.read_variable_values(index, buffer)
        self.assertEqual(index_value, 1)
        test.test(("apply", np.array([[0.3]])), expected_outputs=np.array([[[0.1, 0.2, 0.3]]]))
        index_value, buffer_value = test.read_variable_values(index, buffer)
        self.assertEqual(index_value, 2)
        test.test(("apply", np.array([[0.4]])), expected_outputs=np.array([[[0.2, 0.3, 0.4]]]))
        index_value, buffer_value = test.read_variable_values(index, buffer)
        self.assertEqual(index_value, 0)
        test.test(("apply", np.array([[0.5]])), expected_outputs=np.array([[[0.3, 0.4, 0.5]]]))
        index_value, buffer_value = test.read_variable_values(index, buffer)
        self.assertEqual(index_value, 1)
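The expected outputs follow from the Sequence preprocessor keeping a ring buffer of the last sequence_length inputs and emitting them oldest-first, with the first input after a reset filling the whole window. A standalone sketch of that mechanic (an assumption about the internals, but it matches the asserted behaviour):

from collections import deque

class TinySequencer:
    """Ring buffer emitting the last `sequence_length` inputs, oldest first."""
    def __init__(self, sequence_length=3):
        self.buffer = deque(maxlen=sequence_length)

    def apply(self, x):
        if not self.buffer:
            # First input after a reset fills the whole window.
            self.buffer.extend([x] * self.buffer.maxlen)
        else:
            self.buffer.append(x)
        return list(self.buffer)

seq = TinySequencer()
print(seq.apply(0.1))  # [0.1, 0.1, 0.1]
print(seq.apply(0.2))  # [0.1, 0.1, 0.2]
print(seq.apply(0.3))  # [0.1, 0.2, 0.3]
print(seq.apply(0.4))  # [0.2, 0.3, 0.4]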
def update_if_necessary(self):
    """
    Calls update on the agent according to the update schedule set for this worker.

    Returns:
        float: The summed up loss (over all self.update_steps).
    """
    if self.updating:
        # Are we allowed to update?
        if self.agent.timesteps > self.steps_before_update and \
                (self.agent.observe_spec["buffer_enabled"] is False or  # No update before some data is in the buffer.
                 self.agent.timesteps >= self.agent.observe_spec["buffer_size"]) and \
                self.agent.timesteps % self.update_interval == 0:  # Update-frequency check.
            loss = 0
            for _ in range_(self.update_steps):
                ret = self.agent.update()
                # Some agents return a (loss, ...) tuple, others a scalar loss.
                if isinstance(ret, tuple):
                    loss += ret[0]
                else:
                    loss += ret
            return loss
    return None
def create_variables(self, input_spaces, action_space=None):
    super(MemPrioritizedReplay, self).create_variables(input_spaces, action_space)

    self.priority_capacity = 1
    while self.priority_capacity < self.capacity:
        self.priority_capacity *= 2

    # Create segment trees, initialize with neutral elements.
    sum_values = [0.0 for _ in range_(2 * self.priority_capacity)]
    sum_segment_tree = MemSegmentTree(sum_values, self.priority_capacity, operator.add)
    min_values = [float('inf') for _ in range_(2 * self.priority_capacity)]
    min_segment_tree = MemSegmentTree(min_values, self.priority_capacity, min)
    self.merged_segment_tree = MinSumSegmentTree(
        sum_tree=sum_segment_tree,
        min_tree=min_segment_tree,
        capacity=self.priority_capacity
    )
def step(self, actions):
    states, rewards, terminals, infos = [], [], [], []
    for i in range_(self.num_envs):
        state, reward, terminal, info = self.environments[i].step(actions[i])
        states.append(state)
        rewards.append(reward)
        terminals.append(terminal)
        infos.append(info)
    return states, rewards, terminals, infos