def test_enqueue_dequeue(self):
    """
    Simply tests insert op without checking internal logic.
    """
    fifo_queue = FIFOQueue(capacity=self.capacity, record_space=self.record_space)
    test = ComponentTest(component=fifo_queue, input_spaces=self.input_spaces)

    first_record = self.record_space.sample(size=1)
    test.test(("insert_records", first_record), expected_outputs=None)
    test.test("get_size", expected_outputs=1)

    further_records = self.record_space.sample(size=5)
    test.test(("insert_records", further_records), expected_outputs=None)
    test.test("get_size", expected_outputs=6)

    expected = dict()
    for (k1, v1), (k2, v2) in zip(
            flatten_op(first_record).items(), flatten_op(further_records).items()):
        expected[k1] = np.concatenate((v1, v2[:4]))
    expected = unflatten_op(expected)

    test.test(("get_records", 5), expected_outputs=expected)
    test.test("get_size", expected_outputs=1)
def flatten_input_ops(self, *ops, **kwarg_ops):
    """
    Flattens all DataOps in `ops` into FlattenedDataOps with auto-key generation.
    Ops whose Sockets are not in `self.flatten_ops` (if it's a set) will be ignored.

    Args:
        *ops (op): The primitive ops to flatten.
        **kwarg_ops (op): More primitive ops to flatten (but by named key).

    Returns:
        Tuple[DataOp]: A new tuple with all ops (or those specified by `flatten_ops`) as FlattenedDataOp.
    """
    assert all(op is not None for op in ops)  # just make sure

    # The returned sequence of output ops.
    ret = []
    for i, op in enumerate(ops):
        if self.flatten_ops is True or (isinstance(self.flatten_ops, set) and i in self.flatten_ops):
            ret.append(flatten_op(op))
        else:
            ret.append(op)

    # Process kwargs, if given.
    kwarg_ret = {}
    if len(kwarg_ops) > 0:
        for key, op in kwarg_ops.items():
            if self.flatten_ops is True or (isinstance(self.flatten_ops, set) and key in self.flatten_ops):
                kwarg_ret[key] = flatten_op(op)
            else:
                kwarg_ret[key] = op

    # Always return a tuple for indexing into the return values.
    return tuple(ret), kwarg_ret
def flatten_input_ops(self, *ops, **kwarg_ops):
    """
    Flattens all DataOps in `ops` into FlattenedDataOps with auto-key generation.
    Ops whose Sockets are not in `self.flatten_ops` (if it's a set) will be ignored.

    Args:
        *ops (op): The primitive ops to flatten.
        **kwarg_ops (op): More primitive ops to flatten (but by named key).

    Returns:
        Tuple[DataOp]: A new tuple with all ops (or those specified by `flatten_ops`) as FlattenedDataOp.
    """
    assert all(op is not None for op in ops)  # just make sure

    flatten_alongside = None
    if isinstance(self.flatten_ops, str):
        flatten_alongside = self.component.__getattribute__(self.flatten_ops)

    # The returned sequence of output ops.
    ret = []
    for i, op in enumerate(ops):
        if self.flatten_ops is True or isinstance(self.flatten_ops, str) or \
                (isinstance(self.flatten_ops, (set, dict)) and i in self.flatten_ops):
            fa = flatten_alongside
            if isinstance(self.flatten_ops, dict):
                fa = self.component.__getattribute__(self.flatten_ops[i])
            if fa is not None:
                assert isinstance(fa, dict), \
                    "ERROR: Given `flatten_alongside` property ('{}') is not a dict!".format(fa)
            ret.append(flatten_op(op, flatten_alongside=fa))
        else:
            ret.append(op)

    # Process kwargs, if given.
    kwarg_ret = {}
    if len(kwarg_ops) > 0:
        for key, op in kwarg_ops.items():
            if self.flatten_ops is True or isinstance(self.flatten_ops, str) or \
                    (isinstance(self.flatten_ops, (set, dict)) and key in self.flatten_ops):
                fa = flatten_alongside
                if isinstance(self.flatten_ops, dict):
                    fa = self.component.__getattribute__(self.flatten_ops[key])
                if fa is not None:
                    assert isinstance(fa, dict), \
                        "ERROR: Given `flatten_alongside` property ('{}') is not a dict!".format(fa)
                kwarg_ret[key] = flatten_op(op, flatten_alongside=fa)
            else:
                kwarg_ret[key] = op

    # Always return a tuple for indexing into the return values.
    return tuple(ret), kwarg_ret
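# Illustrative sketch (not rlgraph's actual implementation) of what `flatten_op`'s
# auto-key generation does to a nested container op: every leaf lands in one flat
# dict under a path-like key. Key format and helper name here are hypothetical.
def sketch_flatten_op(op, key="", flat=None):
    flat = {} if flat is None else flat
    if isinstance(op, dict):
        for k in sorted(op):
            sketch_flatten_op(op[k], key + "/" + k, flat)
    else:
        flat[key] = op
    return flat

assert sketch_flatten_op({"a": 1, "b": {"c": 2}}) == {"/a": 1, "/b/c": 2}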
def _graph_fn_setup(self):
    enqueue_ops = list()

    if get_backend() == "tf":
        for data_producing_component in self.data_producing_components:
            record = getattr(data_producing_component, self.api_method_name)()
            if self.return_slot != -1:
                # Only care about one slot of the return values.
                record = record[self.return_slot]

            # TODO: specific for IMPALA problem: needs to be generalized.
            if self.internal_states_slicer is not None:
                outs = self.env_output_splitter.split(record)
                # Assume that internal_states are the last item coming from the env-stepper.
                initial_internal_states = self.internal_states_slicer.slice(outs[-1], 0)
                record = self.fifo_input_merger.merge(*(outs[:-1] + (initial_internal_states,)))
            else:
                terminals, states, actions, rewards, action_log_probs = self.env_output_splitter.split(record)
                record = self.fifo_input_merger.merge(terminals, states, actions, rewards, action_log_probs)

            # Create enqueue_op from api_return.
            # TODO: This is kind of cheating, as we are producing an op from a component that's not ours.
            enqueue_op = self.queue.queue.enqueue(flatten_op(record))
            enqueue_ops.append(enqueue_op)

        self.queue_runner = tf.train.QueueRunner(self.queue.queue, enqueue_ops)
        # Add to standard collection, so all queue-runners will be started after session creation.
        tf.train.add_queue_runner(self.queue_runner)

        return tf.no_op()
def test_capacity(self):
    """
    Tests if insert correctly blocks when capacity is reached.
    """
    fifo_queue = FIFOQueue(capacity=self.capacity, record_space=self.record_space)
    test = ComponentTest(component=fifo_queue, input_spaces=self.input_spaces)

    def run(expected_):
        # Wait 2 seconds.
        time.sleep(2)
        # Pull something out of the queue again to continue.
        test.test(("get_records", 2), expected_outputs=expected_)

    # Insert one more element than capacity.
    records = self.record_space.sample(size=self.capacity + 1)

    expected = dict()
    for key, value in flatten_op(records).items():
        expected[key] = value[:2]
    expected = unflatten_op(expected)

    # Start thread to save this one from getting stuck due to capacity overflow.
    thread = threading.Thread(target=run, args=(expected,))
    thread.start()

    print("Going over capacity: blocking ...")
    test.test(("insert_records", records), expected_outputs=None)
    print("Dequeued some items in another thread. Unblocked.")

    thread.join()
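# The same blocking pattern in miniature, using the standard library's queue.Queue
# instead of a TF FIFOQueue: a put() beyond capacity blocks until another thread
# frees a slot, mirroring the insert_records/get_records interplay above.
import queue
import threading
import time

q = queue.Queue(maxsize=2)
q.put(1)
q.put(2)  # queue is now at capacity

def drain():
    time.sleep(0.5)
    q.get()  # frees one slot; the blocked put() below can proceed

threading.Thread(target=drain).start()
q.put(3)  # blocks for ~0.5s until drain() runs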
def create_variables(self, input_spaces, action_space=None):
    self.in_space = input_spaces["preprocessing_inputs"]  # type: Space

    # Store the mapped output Spaces (per flat key).
    self.output_spaces = flatten_op(self.get_preprocessed_space(self.in_space))
    # Store time_major settings of incoming spaces.
    self.in_space_time_majors = self.in_space.flatten(mapping=lambda key, space: space.time_major)

    # Check whether we have to flatten the incoming categories of an IntBox into a FloatBox with additional
    # rank (categories rank). Store the dimension of this additional rank in the `self.num_categories` dict.
    if self.flatten is True:
        if self.flatten_categories is True:
            def mapping_func(key, space):
                if isinstance(space, IntBox):
                    # Must have global bounds (bounds valid for all axes).
                    if space.num_categories is False:
                        raise RLGraphError(
                            "ERROR: Cannot flatten categories if one of the IntBox spaces ({}={}) does "
                            "not have global bounds (its `num_categories` is False)!".format(key, space)
                        )
                    return space.num_categories
                # No categories. Keep as is.
                return 1

            self.num_categories = self.in_space.flatten(mapping=mapping_func)
        elif self.flatten_categories is not False:
            # TODO: adjust for input ContainerSpaces. For now only support single space (flat-key=="").
            self.num_categories = {"": self.flatten_categories}
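# What "flattening categories" amounts to for an IntBox with global bounds: the int
# tensor gains an extra (one-hot) rank of size `num_categories`. A plain numpy sketch:
import numpy as np

num_categories = 4
ints = np.array([0, 2, 3])
one_hot = np.eye(num_categories, dtype=np.float32)[ints]  # shape (3, 4)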
def _graph_fn_insert_records(self, records):
    flattened_records = flatten_op(records)
    flattened_stopped_records = {key: tf.stop_gradient(op) for key, op in flattened_records.items()}

    # Records is just one record.
    if self.only_insert_single_records is True:
        return self.queue.enqueue(flattened_stopped_records)
    # Insert many records (with batch rank).
    else:
        return self.queue.enqueue_many(flattened_stopped_records)
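# Compressed sketch of the graph this graph_fn builds (assumes TF 1.x): gradients must
# not flow back through records read from the queue, hence the per-flat-key
# stop_gradient. The queue key, dtype and shape below are made up for illustration.
import tensorflow as tf

q = tf.FIFOQueue(capacity=10, dtypes=[tf.float32], shapes=[()], names=["/rewards"])
rewards = tf.placeholder(tf.float32, shape=(None,))
enqueue_op = q.enqueue_many({"/rewards": tf.stop_gradient(rewards)})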
def _graph_fn_get_q_values(self, states, actions, target=False):
    backend = get_backend()

    flat_actions = flatten_op(actions)
    state_actions = [states]
    for flat_key, action_component in self._policy.action_space.flatten().items():
        state_actions.append(flat_actions[flat_key])

    if backend == "tf":
        state_actions = tf.concat(state_actions, axis=-1)
    elif backend == "pytorch":
        state_actions = torch.cat(state_actions, dim=-1)

    q_funcs = self._q_functions if target is False else self._target_q_functions
    return tuple(q.value_output(state_actions) for q in q_funcs)
def _graph_fn_reduce_over_sub_distributions(self, log_probs):
    params_space = next(iter(flatten_op(self.api_method_inputs["parameters"]).values()))
    num_ranks_to_keep = (1 if params_space.has_batch_rank else 0) + (1 if params_space.has_time_rank else 0)

    log_probs_list = []
    if get_backend() == "tf":
        for log_prob in log_probs.values():
            # Reduce sum over all ranks to get the joint log llh.
            log_prob = tf.reduce_sum(
                log_prob, axis=list(range(len(log_prob.shape) - 1, num_ranks_to_keep - 1, -1))
            )
            log_probs_list.append(log_prob)
        return tf.reduce_sum(tf.stack(log_probs_list, axis=0), axis=0)

    elif get_backend() == "pytorch":
        for log_prob in log_probs.values():
            # Reduce sum over all ranks to get the joint log llh.
            log_prob = torch.sum(
                log_prob, dim=list(range(len(log_prob.shape) - 1, num_ranks_to_keep - 1, -1))
            )
            log_probs_list.append(log_prob)
        return torch.sum(torch.stack(log_probs_list, dim=0), dim=0)
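# The axis bookkeeping above sums over every rank *except* the leading batch (and
# optional time) ranks. The same index arithmetic in plain numpy:
import numpy as np

log_prob = np.zeros((4, 5, 2, 3))       # (batch, time, d1, d2)
num_ranks_to_keep = 2                   # batch rank + time rank
axes = list(range(log_prob.ndim - 1, num_ranks_to_keep - 1, -1))  # [3, 2]
joint = log_prob.sum(axis=tuple(axes))  # shape (4, 5)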
def _graph_fn_stage(self, *inputs):
    """
    Stages all incoming ops (after flattening them).

    Args:
        inputs (DataOp): The incoming ops to be (flattened and) staged.

    Returns:
        DataOp: The staging op.
    """
    # Flatten inputs and stage them.
    # TODO: Build equivalent to nest.flatten()
    flattened_ops = list()
    for input_ in inputs:
        flat_list = list(flatten_op(input_).values())
        flattened_ops.extend(flat_list)
    stage_op = self.area.put(flattened_ops)
    return stage_op
def __init__(self, input_network_specs, post_network_spec=None, **kwargs):
    """
    Args:
        input_network_specs (Union[Dict[str,dict],Tuple[dict]]): A specification dict or tuple with values being
            the spec dicts for the single streams. The `call` method expects a dict input or a single tuple input
            (not as *args) in its first parameter.

        post_network_spec (Optional[]): The specification dict of the post-concat network or the post-concat
            network object itself.
    """
    super(MultiInputStreamNeuralNetwork, self).__init__(scope="multi-input-stream-nn", **kwargs)

    # Create all streams' networks.
    if isinstance(input_network_specs, dict):
        self.input_stream_nns = {}
        for i, (flat_key, nn_spec) in enumerate(flatten_op(input_network_specs).items()):
            self.input_stream_nns[flat_key] = NeuralNetwork.from_spec(
                nn_spec, scope="input-stream-nn-{}".format(i)
            )
        # Create the concat layer to merge all streams.
        self.concat_layer = ConcatLayer(dict_keys=list(self.input_stream_nns.keys()), axis=-1)
    else:
        assert isinstance(input_network_specs, (list, tuple)), \
            "ERROR: `input_network_specs` must be dict or tuple/list!"
        self.input_stream_nns = []
        for i, nn_spec in enumerate(input_network_specs):
            self.input_stream_nns.append(
                NeuralNetwork.from_spec(nn_spec, scope="input-stream-nn-{}".format(i))
            )
        # Create the concat layer to merge all streams.
        self.concat_layer = ConcatLayer(axis=-1)

    # Create the post-network (after the concat).
    self.post_nn = NeuralNetwork.from_spec(post_network_spec, scope="post-concat-nn")  # type: NeuralNetwork

    # Add all sub-Components.
    self.add_components(
        self.post_nn, self.concat_layer,
        *list(self.input_stream_nns.values() if isinstance(input_network_specs, dict) else self.input_stream_nns)
    )
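# Hypothetical usage sketch: two named input streams, each with its own dense stack,
# concatenated and fed into a post-network. The layer-spec dicts follow rlgraph's
# {"type": ..., "units": ...} convention; names and sizes here are made up.
multi_stream_nn = MultiInputStreamNeuralNetwork(
    input_network_specs=dict(
        image=[{"type": "dense", "units": 64}],
        text=[{"type": "dense", "units": 32}],
    ),
    post_network_spec=[{"type": "dense", "units": 16}],
)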
def _graph_fn_setup(self):
    enqueue_ops = list()

    if get_backend() == "tf":
        for data_producing_component in self.data_producing_components:
            record = getattr(data_producing_component, self.api_method_name)()
            if self.return_slot != -1:
                # Only care about one slot of the return values.
                record = record[self.return_slot]

            # TODO: specific for IMPALA problem: needs to be generalized.
            preprocessed_s, actions, rewards, returns, terminals, next_states, action_log_probs, \
                internal_states = self.env_output_splitter.split(record)

            last_next_state = self.next_states_slicer.slice(next_states, -1)
            initial_internal_states = self.internal_states_slicer.slice(internal_states, 0)

            record = self.fifo_input_merger.merge(
                preprocessed_s, actions, rewards, terminals, last_next_state, action_log_probs,
                initial_internal_states
            )

            # Create enqueue_op from api_return.
            # TODO: This is kind of cheating, as we are producing an op from a component that's not ours.
            enqueue_op = self.queue.queue.enqueue(flatten_op(record))
            enqueue_ops.append(enqueue_op)

        self.queue_runner = tf.train.QueueRunner(self.queue.queue, enqueue_ops)
        # Add to standard collection, so all queue-runners will be started after session creation.
        tf.train.add_queue_runner(self.queue_runner)

        return tf.no_op()
def __init__(self, preprocessors, **kwargs):
    """
    Args:
        preprocessors (dict): A (possibly nested) dict of preprocessor spec dicts, one per flat key of the
            input Spaces to preprocess.

    Raises:
        RLGraphError: If a sub-component is not a PreprocessLayer object.
    """
    # Create one separate PreprocessorStack per given key.
    # All other keys in an input will be passed through un-preprocessed.
    self.flattened_preprocessors = flatten_op(preprocessors)
    for i, (flat_key, spec) in enumerate(self.flattened_preprocessors.items()):
        self.flattened_preprocessors[flat_key] = PreprocessorStack.from_spec(
            spec, scope="preprocessor-stack-{}".format(i)
        )

    # NOTE: No automatic API-methods. Define them all ourselves.
    kwargs["api_methods"] = {}
    default_dict(kwargs, dict(scope=kwargs.pop("scope", "dict-preprocessor-stack")))

    super(DictPreprocessorStack, self).__init__(*list(self.flattened_preprocessors.values()), **kwargs)
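# Hypothetical usage sketch: preprocess only the "image" flat key of a dict input;
# any other key passes through un-preprocessed, per the comment above. The
# preprocessor type names are assumptions about the registered rlgraph preprocessors.
stack = DictPreprocessorStack(preprocessors=dict(
    image=[{"type": "grayscale"}, {"type": "divide", "divisor": 255}],
))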
def _graph_fn_entropy(self, distribution):
    params_space = next(iter(flatten_op(self.api_method_inputs["parameters"]).values()))
    num_ranks_to_keep = (1 if params_space.has_batch_rank else 0) + (1 if params_space.has_time_rank else 0)

    all_entropies = []
    if get_backend() == "tf":
        for key, distr in distribution.items():
            entropy = distr.entropy()
            # Reduce sum over all ranks to get the joint entropy.
            entropy = tf.reduce_sum(
                entropy, axis=list(range(len(entropy.shape) - 1, num_ranks_to_keep - 1, -1))
            )
            all_entropies.append(entropy)
        return tf.reduce_sum(tf.stack(all_entropies, axis=0), axis=0)

    elif get_backend() == "pytorch":
        for key, distr in distribution.items():
            entropy = distr.entropy()
            # Reduce sum over all ranks to get the joint entropy.
            entropy = torch.sum(
                entropy, dim=list(range(len(entropy.shape) - 1, num_ranks_to_keep - 1, -1))
            )
            all_entropies.append(entropy)
        # TODO: flatten all entropies (or expand in last dim) so we can concat, then reduce_sum to get the
        # joint entropy.
        return torch.sum(torch.stack(all_entropies, dim=0), dim=0)
def _graph_fn_get_q_values(self, preprocessed_states, actions, target=False):
    backend = get_backend()

    flat_actions = flatten_op(actions)
    actions = []
    for flat_key, action_component in self._policy.action_space.flatten().items():
        actions.append(flat_actions[flat_key])

    if backend == "tf":
        actions = tf.concat(actions, axis=-1)
    elif backend == "pytorch":
        actions = torch.cat(actions, dim=-1)

    q_funcs = self._q_functions if target is False else self._target_q_functions

    # We do not concat states yet because we might pass states through a conv stack before merging it
    # with actions.
    return tuple(q.state_action_value(preprocessed_states, actions) for q in q_funcs)
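# Sketch of the container-action handling above in plain numpy: flatten a dict action
# into per-key arrays, then concatenate along the last axis in a stable key order (the
# action-space flatten above fixes that order). Keys here are made up.
import numpy as np

flat_actions = {"/steer": np.zeros((8, 1)), "/throttle": np.ones((8, 1))}
merged = np.concatenate([flat_actions[k] for k in sorted(flat_actions)], axis=-1)  # (8, 2)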
def _graph_fn_step(self):
    if get_backend() == "tf":

        def scan_func(accum, time_delta):
            # Not needed: preprocessed-previous-states (tuple!)
            # `state` is a tuple as well. See comment in ctor for why tf cannot use ContainerSpaces here.
            internal_states = None
            state = accum[1]
            if self.has_rnn:
                internal_states = accum[-1]

            state = tuple(tf.convert_to_tensor(value=s) for s in state)
            flat_state = OrderedDict()
            for i, flat_key in enumerate(self.state_space_actor_flattened.keys()):
                # Add a simple (size 1) batch rank to the state so it'll pass through the NN.
                # - Also have to add a time-rank for RNN processing.
                expanded = state[i]
                for _ in range(1 if self.has_rnn is False else 2):
                    expanded = tf.expand_dims(input=expanded, axis=0)
                # Make None so it'll be recognized as batch-rank by the auto-Space detector.
                flat_state[flat_key] = tf.placeholder_with_default(
                    input=expanded,
                    shape=(None,) + ((None,) if self.has_rnn is True else ()) + self.state_space_actor_list[i].shape
                )

            # Recreate state as the original Space to pass it into the actor-component.
            state = unflatten_op(flat_state)

            # Get action and preprocessed state (as batch-size 1).
            out = (self.actor_component.get_preprocessed_state_and_action if self.add_action_probs is False
                   else self.actor_component.get_preprocessed_state_action_and_action_probs)(
                state,
                # Add simple batch rank to internal_states. <- None for non-RNN systems.
                None if internal_states is None else DataOpTuple(internal_states),
                time_step=self.time_step + time_delta,
                return_ops=True
            )

            # Get output depending on whether it contains internal_states or not.
            a = out["action"]
            action_probs = out.get("action_probs")
            current_internal_states = out.get("last_internal_states")

            # Strip the batch (and maybe time) ranks again from the action in case the Env doesn't like it.
            a_no_extra_ranks = a[0, 0] if self.has_rnn is True else a[0]

            # Step through the Env and collect next state (tuple!), reward and terminal as single values
            # (not batched).
            out = self.environment_server.step_for_env_stepper(a_no_extra_ranks)
            s_, r, t_ = out[:-2], out[-2], out[-1]
            r = tf.cast(r, dtype="float32")

            # Add a and/or r to next_state?
            if self.add_previous_action_to_state is True:
                assert isinstance(s_, tuple), "ERROR: Cannot add previous action to non tuple!"
                s_ = s_ + (a_no_extra_ranks,)
            if self.add_previous_reward_to_state is True:
                assert isinstance(s_, tuple), "ERROR: Cannot add previous reward to non tuple!"
                s_ = s_ + (r,)

            # Note: s_ is packed as tuple.
            ret = [t_, s_] + \
                ([a_no_extra_ranks] if self.add_action else []) + \
                ([r] if self.add_reward else []) + \
                ([(action_probs[0][0] if self.has_rnn is True else action_probs[0])]
                 if self.add_action_probs is True else []) + \
                ([tuple(current_internal_states)] if self.has_rnn is True else [])

            return tuple(ret)

        # Initialize the tf.scan run.
        initializer = [
            self.current_terminal.read_value(),  # whether the current state is terminal
            # current (raw) state (flattened components if ContainerSpace).
            tuple(map(lambda x: x.read_value(), self.current_state.values()))
        ]
        # Append actions and rewards if needed.
        if self.add_action:
            initializer.append(self.current_action.read_value())
        if self.add_reward:
            initializer.append(self.current_reward.read_value())
        # Append action probs if needed.
        if self.add_action_probs is True:
            initializer.append(self.current_action_probs.read_value())
        # Append internal states if needed.
        if self.current_internal_states is not None:
            initializer.append(tuple(
                tf.placeholder_with_default(
                    internal_s.read_value(),
                    shape=(None,) + tuple(internal_s.shape.as_list()[1:])
                ) for internal_s in self.current_internal_states.values()
            ))

        # Scan over n time-steps (tf.range produces the time_delta with respect to the current time_step).
        # NOTE: Set parallel_iterations to 1 to resolve parallelism issues.
        step_results = list(tf.scan(
            fn=scan_func,
            elems=tf.range(self.num_steps, dtype="int32"),
            initializer=tuple(initializer),
            parallel_iterations=1,
            back_prop=False
        ))

        # Store the time-step increment, return so far, current terminal and current state.
        assigns = [
            tf.assign_add(self.time_step, self.num_steps),
            self.assign_variable(self.current_terminal, step_results[0][-1])
        ]

        # Concatenate first and rest.
        full_results = []
        for first_values, rest_values in zip(initializer, step_results):
            full_results.append(nest.map_structure(
                lambda first, rest: tf.concat([[first], rest], axis=0), first_values, rest_values
            ))

        # Re-build DataOpDicts from preprocessed-states and states (from tuple right now).
        rebuild_s = DataOpDict()
        for flat_key, var_ref, s_comp in zip(
                self.state_space_actor_flattened.keys(), self.current_state.values(), full_results[1]):
            assigns.append(self.assign_variable(var_ref, s_comp[-1]))  # -1: current state (last observed)
            rebuild_s[flat_key] = s_comp
        rebuild_s = unflatten_op(rebuild_s)
        full_results[1] = rebuild_s

        # Remove batch rank from internal states again.
        if self.current_internal_states is not None:
            # TODO: What if internal states is not the last item in the list anymore due to some change.
            slot = -1
            # TODO: What if internal states is a dict? Right now assume some tuple.
            internal_states_wo_batch = list()
            for i in range(len(full_results[slot])):
                # 1=batch axis (which is 1); 0=time axis.
                internal_states_wo_batch.append(tf.squeeze(full_results[-1][i], axis=1))
            full_results[slot] = DataOpTuple(internal_states_wo_batch)

        with tf.control_dependencies(control_inputs=assigns):
            # Let the auto-infer system know what time rank we have.
            full_results = DataOpTuple(full_results)
            for o in flatten_op(full_results).values():
                o._time_rank = 0  # which position in the shape is the time-rank?
            step_op = tf.no_op()

        return step_op, full_results
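# Reduced sketch of the tf.scan pattern above (assumes TF 1.x): carry a state tuple
# through n steps, then prepend the initializer to recover the full trajectory, as in
# the "Concatenate first and rest" block. The step logic here is a toy stand-in.
import tensorflow as tf

def step(accum, _):
    terminal, value = accum
    value = value + 1.0
    return tf.logical_or(terminal, value > 3.0), value

init = (tf.constant(False), tf.constant(0.0))
terminals, values = tf.scan(step, tf.range(5), initializer=init, back_prop=False)
full_values = tf.concat([[init[1]], values], axis=0)  # length num_steps + 1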
def observe(self, preprocessed_states, actions, internals, rewards, next_states, terminals, env_id=None,
            batched=False):
    """
    Observes an experience tuple or a batch of experience tuples. Note: If configured,
    first uses buffers and then internally calls _observe_graph() to actually run the computation graph.
    If buffering is disabled, this just routes the call to the respective `_observe_graph()` method of the
    child Agent.

    Args:
        preprocessed_states (Union[dict,ndarray]): Preprocessed states dict or array.
        actions (Union[dict,ndarray]): Actions dict or array containing actions performed for the given state(s).
        internals (Optional[list]): Internal state(s) returned by agent for the given states. Must be empty list
            if no internals available.
        rewards (Union[float,List[float]]): Scalar reward(s) observed.
        next_states (Union[dict,ndarray]): Preprocessed next states dict or array.
        terminals (Union[bool,List[bool]]): Boolean indicating terminal.
        env_id (Optional[str]): Environment id to observe for. When using vectorized execution and
            buffering, using environment ids is necessary to ensure correct trajectories are inserted.
            See `SingleThreadedWorker` for example usage.
        batched (bool): Whether given data (states, actions, etc..) is already batched or not.
    """
    # Check for illegal internals.
    if internals is None:
        internals = []

    if self.observe_spec["buffer_enabled"] is True:
        if env_id is None:
            env_id = self.default_env

        # If data is already batched, just have to extend our buffer lists.
        if batched:
            if self.flat_state_space is not None:
                for i, flat_key in enumerate(self.flat_state_space.keys()):
                    self.states_buffer[env_id][i].extend(preprocessed_states[flat_key])
                    self.next_states_buffer[env_id][i].extend(next_states[flat_key])
            else:
                self.states_buffer[env_id].extend(preprocessed_states)
                self.next_states_buffer[env_id].extend(next_states)
            if self.flat_action_space is not None:
                flat_action = flatten_op(actions)
                for i, flat_key in enumerate(self.flat_action_space.keys()):
                    self.actions_buffer[env_id][i].append(flat_action[flat_key])
            else:
                self.actions_buffer[env_id].extend(actions)
            self.internals_buffer[env_id].extend(internals)
            self.rewards_buffer[env_id].extend(rewards)
            self.terminals_buffer[env_id].extend(terminals)
        # Data is not batched, append single items (without creating new lists first!) to buffer lists.
        else:
            if self.flat_state_space is not None:
                for i, flat_key in enumerate(self.flat_state_space.keys()):
                    self.states_buffer[env_id][i].append(preprocessed_states[flat_key])
                    self.next_states_buffer[env_id][i].append(next_states[flat_key])
            else:
                self.states_buffer[env_id].append(preprocessed_states)
                self.next_states_buffer[env_id].append(next_states)
            if self.flat_action_space is not None:
                flat_action = flatten_op(actions)
                for i, flat_key in enumerate(self.flat_action_space.keys()):
                    self.actions_buffer[env_id][i].append(flat_action[flat_key])
            else:
                self.actions_buffer[env_id].append(actions)
            self.internals_buffer[env_id].append(internals)
            self.rewards_buffer[env_id].append(rewards)
            self.terminals_buffer[env_id].append(terminals)

        buffer_is_full = len(self.rewards_buffer[env_id]) >= self.observe_spec["buffer_size"]

        # If the buffer (per environment) is full OR the episode was aborted:
        # Change terminal of last record artificially to True (also give warning "buffer too small"),
        # insert and flush the buffer.
        if buffer_is_full or self.terminals_buffer[env_id][-1]:
            # Warn if full and last terminal is False.
            if buffer_is_full and not self.terminals_buffer[env_id][-1]:
                self.logger.warning(
                    "Buffer of size {} of Agent '{}' may be too small! Had to add artificial terminal=True "
                    "to end.".format(self.observe_spec["buffer_size"], self)
                )
                self.terminals_buffer[env_id][-1] = True

            # TODO: Apply n-step post-processing if necessary.
            if self.flat_state_space is not None:
                states_ = {}
                next_states_ = {}
                for i, key in enumerate(self.flat_state_space.keys()):
                    states_[key] = np.asarray(self.states_buffer[env_id][i])
                    next_states_[key] = np.asarray(self.next_states_buffer[env_id][i])
                    # Squeeze, but do not squeeze (1,) to ().
                    if len(states_[key]) > 1:
                        states_[key] = np.squeeze(states_[key])
                        next_states_[key] = np.squeeze(next_states_[key])
            else:
                states_ = np.asarray(self.states_buffer[env_id])
                next_states_ = np.asarray(self.next_states_buffer[env_id])

            if self.flat_action_space is not None:
                actions_ = {}
                for i, key in enumerate(self.flat_action_space.keys()):
                    actions_[key] = np.asarray(self.actions_buffer[env_id][i])
                    # Squeeze, but do not squeeze (1,) to ().
                    if len(actions_[key]) > 1:
                        actions_[key] = np.squeeze(actions_[key])
                    else:
                        actions_[key] = np.reshape(actions_[key], (1,))
            else:
                actions_ = np.asarray(self.actions_buffer[env_id])

            self._write_rewards_summary(
                rewards=self.rewards_buffer[env_id],  # No need to be converted to np
                terminals=self.terminals_buffer[env_id],
                env_id=env_id
            )

            self._observe_graph(
                preprocessed_states=states_,
                actions=actions_,
                internals=np.asarray(self.internals_buffer[env_id]),
                rewards=np.asarray(self.rewards_buffer[env_id]),
                next_states=next_states_,
                terminals=np.asarray(self.terminals_buffer[env_id])
            )
            self.reset_env_buffers(env_id)
    else:
        if not batched:
            preprocessed_states, _ = self.preprocessed_state_space.force_batch(preprocessed_states)
            next_states, _ = self.preprocessed_state_space.force_batch(next_states)
            actions, _ = self.action_space.force_batch(actions)
            rewards = [rewards]
            terminals = [terminals]

        self._write_rewards_summary(
            rewards=rewards,  # No need to be converted to np
            terminals=terminals,
            env_id=env_id
        )

        self._observe_graph(preprocessed_states, actions, internals, rewards, next_states, terminals)
def create_variables(self, input_spaces, action_space=None):
    in_space = input_spaces["inputs"]

    self.output_spaces = flatten_op(self.get_preprocessed_space(in_space))
def _graph_fn_update_from_external_batch(
        root, preprocessed_states, actions, rewards, terminals, sequence_indices,
        apply_postprocessing=True, time_percentage=None
):
    """
    Calls iterative optimization by repeatedly sub-sampling.
    """
    multi_gpu_sync_optimizer = root.sub_components.get("multi-gpu-synchronizer")

    # Return values.
    loss, loss_per_item, vf_loss, vf_loss_per_item = None, None, None, None

    policy = root.get_sub_component_by_name(agent.policy.scope)
    value_function = root.get_sub_component_by_name(agent.value_function.scope)
    optimizer = root.get_sub_component_by_name(agent.optimizer.scope)
    loss_function = root.get_sub_component_by_name(agent.loss_function.scope)
    value_function_optimizer = root.get_sub_component_by_name(agent.value_function_optimizer.scope)
    vars_merger = root.get_sub_component_by_name(agent.vars_merger.scope)
    gae_function = root.get_sub_component_by_name(agent.gae_function.scope)

    prev_log_probs = policy.get_log_likelihood(preprocessed_states, actions)["log_likelihood"]
    prev_state_values = value_function.value_output(preprocessed_states)

    if get_backend() == "tf":
        batch_size = tf.shape(list(flatten_op(preprocessed_states).values())[0])[0]

        # Log probs before update (stop-gradient as these are used in target term).
        prev_log_probs = tf.stop_gradient(prev_log_probs)
        # State values before update (stop-gradient as these are used in target term).
        prev_state_values = tf.stop_gradient(prev_state_values)

        # Advantages are based on previous state values.
        advantages = tf.cond(
            pred=apply_postprocessing,
            true_fn=lambda: gae_function.calc_gae_values(prev_state_values, rewards, terminals, sequence_indices),
            false_fn=lambda: rewards
        )

        if self.standardize_advantages:
            # NOTE: `tf.nn.moments` returns mean and *variance*; take the square root before dividing.
            mean, variance = tf.nn.moments(x=advantages, axes=[0])
            advantages = (advantages - mean) / tf.sqrt(variance)

        def opt_body(index_, loss_, loss_per_item_, vf_loss_, vf_loss_per_item_):
            start = tf.random_uniform(shape=(), minval=0, maxval=batch_size, dtype=tf.int32)
            indices = tf.range(start=start, limit=start + agent.sample_size) % batch_size
            # Use `map` here in case we have container states/actions.
            sample_states = preprocessed_states.map(lambda k, v: tf.gather(v, indices))
            sample_actions = actions.map(lambda k, v: tf.gather(v, indices))

            sample_prev_log_probs = tf.gather(params=prev_log_probs, indices=indices)
            sample_rewards = tf.gather(params=rewards, indices=indices)
            sample_terminals = tf.gather(params=terminals, indices=indices)
            sample_sequence_indices = tf.gather(params=sequence_indices, indices=indices)
            sample_advantages = tf.gather(params=advantages, indices=indices)
            sample_advantages.set_shape((agent.sample_size,))

            sample_state_values = value_function.value_output(sample_states)
            sample_prev_state_values = tf.gather(params=prev_state_values, indices=indices)

            # If we are a multi-GPU root:
            # Simply feeds everything into the multi-GPU sync optimizer's method and return.
            if multi_gpu_sync_optimizer is not None:
                main_policy_vars = agent.policy.variables()
                main_vf_vars = agent.value_function.variables()
                all_vars = agent.vars_merger.merge(main_policy_vars, main_vf_vars)
                out = multi_gpu_sync_optimizer.calculate_update_from_external_batch(
                    all_vars,
                    sample_states, sample_actions, sample_rewards,
                    sample_terminals, sample_sequence_indices,
                    apply_postprocessing=apply_postprocessing
                )
                avg_grads_and_vars_policy, avg_grads_and_vars_vf = agent.vars_splitter.call(
                    out["avg_grads_and_vars_by_component"]
                )
                policy_step_op = agent.optimizer.apply_gradients(avg_grads_and_vars_policy)
                vf_step_op = agent.value_function_optimizer.apply_gradients(avg_grads_and_vars_vf)
                step_op = root._graph_fn_group(policy_step_op, vf_step_op)
                step_and_sync_op = multi_gpu_sync_optimizer.sync_variables_to_towers(step_op, all_vars)
                loss_vf, loss_per_item_vf = out["additional_return_0"], out["additional_return_1"]

                # Have to set all shapes here due to strict loop-var shape requirements.
                out["loss"].set_shape(())
                loss_vf.set_shape(())
                loss_per_item_vf.set_shape((agent.sample_size,))
                out["loss_per_item"].set_shape((agent.sample_size,))

                with tf.control_dependencies([step_and_sync_op]):
                    if index_ == 0:
                        # Increase the global training step counter.
                        out["loss"] = root._graph_fn_training_step(out["loss"])
                    return index_ + 1, out["loss"], out["loss_per_item"], loss_vf, loss_per_item_vf

            sample_log_probs = policy.get_log_likelihood(sample_states, sample_actions)["log_likelihood"]
            entropy = policy.get_entropy(sample_states)["entropy"]

            loss, loss_per_item, vf_loss, vf_loss_per_item = loss_function.loss(
                sample_log_probs, sample_prev_log_probs,
                sample_state_values, sample_prev_state_values, sample_advantages, entropy, time_percentage
            )

            if hasattr(root, "is_multi_gpu_tower") and root.is_multi_gpu_tower is True:
                policy_grads_and_vars = optimizer.calculate_gradients(policy.variables(), loss, time_percentage)
                vf_grads_and_vars = value_function_optimizer.calculate_gradients(
                    value_function.variables(), vf_loss, time_percentage
                )
                grads_and_vars_by_component = vars_merger.merge(policy_grads_and_vars, vf_grads_and_vars)
                return grads_and_vars_by_component, loss, loss_per_item, vf_loss, vf_loss_per_item
            else:
                step_op = optimizer.step(policy.variables(), loss, loss_per_item, time_percentage)
                loss.set_shape(())
                loss_per_item.set_shape((agent.sample_size,))

                vf_step_op = value_function_optimizer.step(
                    value_function.variables(), vf_loss, vf_loss_per_item, time_percentage
                )
                vf_loss.set_shape(())
                vf_loss_per_item.set_shape((agent.sample_size,))

                with tf.control_dependencies([step_op, vf_step_op]):
                    return index_ + 1, loss, loss_per_item, vf_loss, vf_loss_per_item

        def cond(index_, loss_, loss_per_item_, v_loss_, v_loss_per_item_):
            return index_ < agent.iterations

        init_loop_vars = [
            0,
            tf.zeros(shape=(), dtype=tf.float32),
            tf.zeros(shape=(agent.sample_size,)),
            tf.zeros(shape=(), dtype=tf.float32),
            tf.zeros(shape=(agent.sample_size,))
        ]

        if hasattr(root, "is_multi_gpu_tower") and root.is_multi_gpu_tower is True:
            return opt_body(*init_loop_vars)
        else:
            index, loss, loss_per_item, vf_loss, vf_loss_per_item = tf.while_loop(
                cond=cond,
                body=opt_body,
                loop_vars=init_loop_vars,
                parallel_iterations=1
            )
            # Increase the global training step counter.
            loss = root._graph_fn_training_step(loss)
            return loss, loss_per_item, vf_loss, vf_loss_per_item

    elif get_backend() == "pytorch":
        batch_size = list(flatten_op(preprocessed_states).values())[0].shape[0]
        sample_size = min(batch_size, agent.sample_size)

        if isinstance(prev_log_probs, dict):
            for name in actions.keys():
                prev_log_probs[name] = prev_log_probs[name].detach()
        else:
            prev_log_probs = prev_log_probs.detach()
        prev_state_values = value_function.value_output(preprocessed_states).detach()

        if apply_postprocessing:
            advantages = gae_function.calc_gae_values(prev_state_values, rewards, terminals, sequence_indices)
        else:
            advantages = rewards

        if self.standardize_advantages:
            std = torch.std(advantages)
            if not np.isnan(std):
                advantages = (advantages - torch.mean(advantages)) / std

        for _ in range(agent.iterations):
            start = int(torch.rand(1) * (batch_size - 1))
            indices = torch.arange(start=start, end=start + sample_size, dtype=torch.long) % batch_size
            sample_states = torch.index_select(preprocessed_states, 0, indices)

            if isinstance(actions, dict):
                sample_actions = DataOpDict()
                sample_prev_log_probs = DataOpDict()
                for name, action in define_by_run_flatten(actions, scope_separator_at_start=False).items():
                    sample_actions[name] = torch.index_select(action, 0, indices)
                    sample_prev_log_probs[name] = torch.index_select(prev_log_probs[name], 0, indices)
            else:
                sample_actions = torch.index_select(actions, 0, indices)
                sample_prev_log_probs = torch.index_select(prev_log_probs, 0, indices)

            sample_advantages = torch.index_select(advantages, 0, indices)
            sample_prev_state_values = torch.index_select(prev_state_values, 0, indices)

            sample_log_probs = policy.get_log_likelihood(sample_states, sample_actions)["log_likelihood"]
            sample_state_values = value_function.value_output(sample_states)
            entropy = policy.get_entropy(sample_states)["entropy"]

            loss, loss_per_item, vf_loss, vf_loss_per_item = loss_function.loss(
                sample_log_probs, sample_prev_log_probs,
                sample_state_values, sample_prev_state_values, sample_advantages, entropy, time_percentage
            )

            # Do not need step op.
            optimizer.step(policy.variables(), loss, loss_per_item, time_percentage)
            value_function_optimizer.step(value_function.variables(), vf_loss, vf_loss_per_item, time_percentage)

        return loss, loss_per_item, vf_loss, vf_loss_per_item
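# The sub-sampling trick shared by both backends above, in plain numpy: pick a random
# start and take `sample_size` consecutive indices, wrapping around the batch with a
# modulo so the minibatch is always full-length.
import numpy as np

batch_size, sample_size = 10, 4
start = np.random.randint(0, batch_size)
indices = np.arange(start, start + sample_size) % batch_size  # e.g. [8, 9, 0, 1]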
def opt_body(index_, loss_, loss_per_item_, vf_loss_, vf_loss_per_item_):
    start = tf.random_uniform(shape=(), minval=0, maxval=batch_size - 1, dtype=tf.int32)
    indices = tf.range(start=start, limit=start + agent.sample_size) % batch_size
    sample_states = tf.gather(params=preprocessed_states, indices=indices)

    if isinstance(actions, ContainerDataOp):
        sample_actions = FlattenedDataOp()
        for name, action in flatten_op(actions).items():
            sample_actions[name] = tf.gather(params=action, indices=indices)
        sample_actions = unflatten_op(sample_actions)
    else:
        sample_actions = tf.gather(params=actions, indices=indices)

    sample_prior_log_probs = tf.gather(params=prev_log_probs, indices=indices)
    sample_rewards = tf.gather(params=rewards, indices=indices)
    sample_terminals = tf.gather(params=terminals, indices=indices)
    sample_sequence_indices = tf.gather(params=sequence_indices, indices=indices)
    sample_advantages = tf.gather(params=advantages, indices=indices)
    sample_advantages.set_shape((self.sample_size,))

    sample_baseline_values = value_function.value_output(sample_states)
    sample_prior_baseline_values = tf.gather(params=prior_baseline_values, indices=indices)

    # If we are a multi-GPU root:
    # Simply feeds everything into the multi-GPU sync optimizer's method and return.
    if multi_gpu_sync_optimizer is not None:
        main_policy_vars = agent.policy.variables()
        main_vf_vars = agent.value_function.variables()
        all_vars = agent.vars_merger.merge(main_policy_vars, main_vf_vars)
        out = multi_gpu_sync_optimizer.calculate_update_from_external_batch(
            all_vars,
            sample_states, sample_actions, sample_rewards,
            sample_terminals, sample_sequence_indices,
            apply_postprocessing=apply_postprocessing
        )
        avg_grads_and_vars_policy, avg_grads_and_vars_vf = agent.vars_splitter.call(
            out["avg_grads_and_vars_by_component"]
        )
        policy_step_op = agent.optimizer.apply_gradients(avg_grads_and_vars_policy)
        vf_step_op = agent.value_function_optimizer.apply_gradients(avg_grads_and_vars_vf)
        step_op = root._graph_fn_group(policy_step_op, vf_step_op)
        step_and_sync_op = multi_gpu_sync_optimizer.sync_variables_to_towers(step_op, all_vars)
        loss_vf, loss_per_item_vf = out["additional_return_0"], out["additional_return_1"]

        # Have to set all shapes here due to strict loop-var shape requirements.
        out["loss"].set_shape(())
        loss_vf.set_shape(())
        loss_per_item_vf.set_shape((agent.sample_size,))
        out["loss_per_item"].set_shape((agent.sample_size,))

        with tf.control_dependencies([step_and_sync_op]):
            if index_ == 0:
                # Increase the global training step counter.
                out["loss"] = root._graph_fn_training_step(out["loss"])
            return index_ + 1, out["loss"], out["loss_per_item"], loss_vf, loss_per_item_vf

    policy_probs = policy.get_log_likelihood(sample_states, sample_actions)["log_likelihood"]
    baseline_values = value_function.value_output(tf.stop_gradient(sample_states))
    sample_rewards = tf.cond(
        pred=apply_postprocessing,
        true_fn=lambda: gae_function.calc_gae_values(
            baseline_values, sample_rewards, sample_terminals, sample_sequence_indices
        ),
        false_fn=lambda: sample_rewards
    )
    sample_rewards.set_shape((agent.sample_size,))
    entropy = policy.get_entropy(sample_states)["entropy"]

    loss, loss_per_item, vf_loss, vf_loss_per_item = loss_function.loss(
        policy_probs, sample_prior_log_probs,
        sample_baseline_values, sample_prior_baseline_values, sample_advantages, entropy
    )

    if hasattr(root, "is_multi_gpu_tower") and root.is_multi_gpu_tower is True:
        policy_grads_and_vars = optimizer.calculate_gradients(policy.variables(), loss)
        vf_grads_and_vars = value_function_optimizer.calculate_gradients(value_function.variables(), vf_loss)
        grads_and_vars_by_component = vars_merger.merge(policy_grads_and_vars, vf_grads_and_vars)
        return grads_and_vars_by_component, loss, loss_per_item, vf_loss, vf_loss_per_item
    else:
        step_op, loss, loss_per_item = optimizer.step(policy.variables(), loss, loss_per_item)
        loss.set_shape(())
        loss_per_item.set_shape((agent.sample_size,))

        vf_step_op, vf_loss, vf_loss_per_item = value_function_optimizer.step(
            value_function.variables(), vf_loss, vf_loss_per_item
        )
        vf_loss.set_shape(())
        vf_loss_per_item.set_shape((agent.sample_size,))

        with tf.control_dependencies([step_op, vf_step_op]):
            return index_ + 1, loss, loss_per_item, vf_loss, vf_loss_per_item