def _expand_leaf(self, leaf, observation):
    """Expands a leaf and returns the quality of one of its new children.

    Ratings for the leaf's children are obtained from
    ``self._new_leaf_rater``. One ``TreeNode`` is created per rating, with
    the rated quality rescaled by ``self._leaf_quality_dampening`` and
    shifted by ``self._leaf_quality_bias``. A child is then selected with
    ``self._choose_action`` in exploratory mode and its quality is returned
    to the caller.

    Only ``leaf`` is modified - it receives the freshly created children.

    Args:
        leaf (TreeNode): Leaf to expand.
        observation (np.ndarray): Observation received at leaf.

    Yields:
        Network prediction requests.

    Returns:
        float: Quality of the chosen child of the expanded leaf.
    """
    ratings = yield from self._new_leaf_rater(observation, self._model)

    # One rating per action is required, so dynamic action spaces are not
    # supported here. TODO(koz4k): Fix.
    assert len(ratings) == space_utils.max_size(self._action_space)

    dampening = self._leaf_quality_dampening
    bias = self._leaf_quality_bias
    new_children = []
    for (rated_quality, prior) in ratings:
        new_children.append(TreeNode(rated_quality * dampening + bias, prior))
    leaf.children = new_children

    chosen = self._choose_action(leaf, exploratory=True)
    return leaf.children[chosen].quality
def _on_new_root(self, root):
    """Blends Dirichlet noise into the priors of the root's children.

    Every child's ``prior_probability`` is overwritten in place with
    ``(1 - w) * prior + w * noise``, where ``w`` is
    ``self._prior_noise_weight`` and ``noise`` is one sample drawn from a
    symmetric Dirichlet distribution with concentration
    ``self._prior_noise_parameter``.

    Args:
        root: Node whose children's priors are perturbed.
    """
    current_prior = np.array(
        [node.prior_probability for node in root.children]
    )

    # Symmetric Dirichlet: the same concentration for every action slot.
    concentration = (
        [self._prior_noise_parameter]
        * space_utils.max_size(self._action_space)
    )
    dirichlet_sample = np.random.dirichlet(concentration)

    mixed_prior = (
        (1 - self._prior_noise_weight) * current_prior
        + self._prior_noise_weight * dirichlet_sample
    )
    for (node, probability) in zip(root.children, mixed_prior):
        node.prior_probability = probability
def network_signature(self, observation_space, action_space):
    """Builds the signature of the network used by this object.

    The input signature is derived from ``observation_space``. The output
    is a tensor with ``space_utils.max_size(action_space)`` entries, or a
    pair of such tensors when ``self._use_policy`` is set.

    Args:
        observation_space: Space the network input signature is built from.
        action_space: Space whose maximum size sets the output length.

    Returns:
        data.NetworkSignature: Input/output signature of the network.
    """
    per_action_sig = data.TensorSignature(
        shape=(space_utils.max_size(action_space),)
    )
    output_sig = (
        (per_action_sig, per_action_sig)
        if self._use_policy
        else per_action_sig
    )
    return data.NetworkSignature(
        input=space_utils.signature(observation_space),
        output=output_sig,
    )
def network_signature(self, observation_space, action_space):
    """Builds the signature of the network used by this object.

    The output is a single tensor with
    ``space_utils.max_size(action_space)`` entries. The input is the
    observation signature, paired with an extra tensor of shape ``(1,)``
    when ``self._inject_log_temperature`` is set.

    Args:
        observation_space: Space the network input signature is built from.
        action_space: Space whose maximum size sets the output length.

    Returns:
        data.NetworkSignature: Input/output signature of the network.
    """
    input_sig = space_utils.signature(observation_space)
    if self._inject_log_temperature:
        # Pair the observation with one extra scalar-shaped input.
        input_sig = (input_sig, data.TensorSignature(shape=(1,)))

    output_sig = data.TensorSignature(
        shape=(space_utils.max_size(action_space),)
    )
    return data.NetworkSignature(input=input_sig, output=output_sig)
def network_signature(self, observation_space, action_space):
    """Builds the signature of the network used by this object.

    The input signature is derived from ``observation_space``. The output
    is a tensor of shape ``(1,)``; when ``self._use_policy`` is set it is
    followed by a second tensor with
    ``space_utils.max_size(action_space)`` entries.

    Args:
        observation_space: Space the network input signature is built from.
        action_space: Space whose maximum size sets the length of the
            second output, if any.

    Returns:
        data.NetworkSignature: Input/output signature of the network.
    """
    n_actions = space_utils.max_size(action_space)
    scalar_sig = data.TensorSignature(shape=(1,))

    output_sig = scalar_sig
    if self._use_policy:
        output_sig = (scalar_sig, data.TensorSignature(shape=(n_actions,)))

    return data.NetworkSignature(
        input=space_utils.signature(observation_space),
        output=output_sig,
    )
def _handle_env_feedback(self, agent_info, action, next_observation,
                         reward, done, env_info):
    """Reconciles the search graph with the real environment's feedback.

    Does nothing unless ``self._use_trainable_env`` is set. Otherwise the
    reward stored on the edge that was just taken is overwritten with the
    real one. If the state of ``self._current_node`` differs from the state
    derived from ``next_observation``, the current node is replaced by the
    node of the true state. That node is created first if the state was
    never visited. Finally the ``terminal`` and ``solved`` flags of the
    current node are set from ``done`` and ``env_info``.

    Args:
        agent_info (dict): Agent info of the step; its 'node' entry is the
            node the action was taken from.
        action: Action that was executed.
        next_observation: Observation returned by the real environment.
        reward: Reward returned by the real environment.
        done: Whether the real environment finished the episode.
        env_info (dict): Environment info; 'solved' is read from it and
            defaults to False.

    Yields:
        Request: An agent prediction request, sent only when a node for a
        previously unseen, non-terminal state has to be created.
    """
    if not self._use_trainable_env:
        # With a perfect model there are no mispredictions to repair.
        return

    parent = agent_info['node']
    real_state = self._model.obs2state(next_observation)
    is_solved = env_info.get('solved', False)

    # Overwrite the (possibly mispredicted) reward with the real one.
    parent.rewards[action] = reward

    if self._current_node.state != real_state:
        # The model predicted a wrong state - continue from the true one.
        node = self._state2node.get(real_state)
        if node is None:
            # The true state has not been seen before, so a new graph node
            # is needed, along with an initial value for it.
            if not done:
                # Batch stepper requires all requests submitted at the
                # same time to have equal shape. The other place sending
                # requests, self._expand_leaf(), sends `n_actions`
                # observations, so the same number is sent here.
                #
                # In practice: in batch stepper allow different number of
                # observations to be sent from different agents.
                batch = np.array(
                    [next_observation]
                    * space_utils.max_size(self._model.action_space)
                )
                predictions = yield Request(
                    RequestType.AGENT_PREDICTION, batch
                )
                # Only the first response is used; the rest are ignored.
                [value] = predictions[0]
            else:
                value = self._value_traits.zero
            node = self._initialize_graph_node(
                value, real_state, done, is_solved
            )

        # Fix the edge in the graph so that the same misprediction is not
        # repeated.
        parent.edges[action] = node
        self._current_node = node

    self._current_node.terminal = done
    self._current_node.solved = is_solved
def network_signature(self, observation_space, action_space):
    """Builds network signatures for both request types used by this object.

    Args:
        observation_space: Space the observation signatures are built from.
        action_space: Space whose maximum size sets the length of the
            'action' model input.

    Returns:
        dict: Maps ``data.AgentRequest`` and ``data.ModelRequest`` to their
        ``data.NetworkSignature``. The agent network takes an observation
        and outputs a tensor of shape ``(1,)``. The model network takes an
        'observation' and an 'action' and outputs 'next_observation',
        'reward' and 'done'.
    """
    agent_sig = data.NetworkSignature(
        input=space.signature(observation_space),
        output=data.TensorSignature(shape=(1,)),
    )

    model_input = {
        'observation': space.signature(observation_space),
        'action': data.TensorSignature(
            shape=(space.max_size(action_space),)
        ),
    }
    model_output = {
        'next_observation': space.signature(observation_space),
        'reward': data.TensorSignature(shape=(1,)),
        'done': data.TensorSignature(shape=(1,)),
    }
    model_sig = data.NetworkSignature(input=model_input, output=model_output)

    return {
        data.AgentRequest: agent_sig,
        data.ModelRequest: model_sig,
    }
def _expand_leaf(self, leaf, observation):
    """Expands a leaf and returns the quality of one of its new children.

    The children are created by ``self._init_child_nodes`` and attached to
    ``leaf``. Each child's quality and prior probability are checked to be
    scalars (the prior may also be None). If ``leaf`` is the current root,
    ``self._on_new_root`` is invoked on it. A child is then chosen with
    ``self._choose_child`` in exploratory mode.

    Args:
        leaf: Leaf node to expand; its ``children`` attribute is set.
        observation: Observation received at the leaf.

    Yields:
        Whatever ``self._init_child_nodes`` yields.

    Returns:
        Quality of the chosen child, computed with ``self._discount``.

    Raises:
        AssertionError: If a child's quality or prior is not a scalar, or
            if the number of children differs from the maximum size of
            the action space.
    """
    leaf.children = yield from self._init_child_nodes(leaf, observation)

    for node in leaf.children:
        quality = node.quality(self._discount)
        prob = node.prior_probability
        prob_ok = prob is None or np.isscalar(prob)
        # np.shape() is used instead of the .shape attribute because plain
        # Python scalars pass np.isscalar() but have no .shape. With the
        # attribute, building this message could raise AttributeError and
        # hide the real diagnostic.
        assert np.isscalar(quality) and prob_ok, (
            'Invalid shape of node quality or prior probability - expected '
            'scalars, got {} and {}. Check if your network architecture is '
            'appropriate for the observation shape.'.format(
                np.shape(quality),
                np.shape(prob) if prob is not None else None,
            )
        )

    assert len(leaf.children) == space_utils.max_size(self._action_space)

    if leaf is self._root:
        self._on_new_root(leaf)

    (child, _) = self._choose_child(
        leaf, exploratory=True, strict_filter=True
    )
    return child.quality(self._discount)
def act(self, observation):
    """Issues one agent request and one model request, then picks an action.

    The observation is sent as a batch of one in a ``data.AgentRequest``
    and, together with a one-hot encoding of a randomly drawn action, in a
    ``data.ModelRequest``. When ``self._random_order`` is set, the order of
    the two requests is chosen at random; otherwise the agent request goes
    first. The action is derived deterministically from the sum of all
    received predictions.

    Args:
        observation (np.ndarray): Current observation.

    Yields:
        data.AgentRequest and data.ModelRequest: One request of each kind.

    Returns:
        tuple: ``(action, agent_info)``, where ``agent_info`` holds the
        agent network's prediction under the 'value' key.
    """
    agent_request = data.AgentRequest(observation[np.newaxis, :])

    n_actions = space.max_size(self._action_space)
    queried_action = random.randrange(0, n_actions)
    model_request = data.ModelRequest({
        'observation': observation[np.newaxis, :],
        'action': transformations.one_hot_encode(
            [queried_action], n_actions
        ),
    })

    # The coin is tossed only when a random order was asked for.
    agent_first = not self._random_order or random.randrange(0, 2) == 0
    if agent_first:
        agent_response = yield agent_request
        model_response = yield model_request
    else:
        model_response = yield model_request
        agent_response = yield agent_request

    assert agent_response.shape == (1, 1)
    response_shapes = data.ops.nested_map(
        lambda array: array.shape, model_response
    )
    assert response_shapes == {
        'next_observation': (1,) + observation.shape,
        'reward': (1, 1),
        'done': (1, 1),
    }

    value = agent_response.item()

    # Fold every prediction into one number, so that the chosen action
    # depends on all of the responses.
    meaningless_sum = value
    meaningless_sum += np.sum(model_response['next_observation'])
    meaningless_sum += model_response['reward'].item()
    meaningless_sum += model_response['done'].item()

    chosen_action = int(meaningless_sum * 1e9) % n_actions
    return chosen_action, {'value': value}
def solve(self, env, epoch=None, init_state=None, time_limit=None):
    """Runs one episode in ``env`` and returns it as a ``data.Episode``.

    After delegating to the parent's ``solve``, the environment is
    optionally wrapped in ``envs.TimeLimitWrapper`` and then reset, or
    restored to ``init_state``. The agent is reset with the unwrapped
    environment. Actions from ``self.act`` are executed until the
    environment reports ``done``. Registered callbacks are notified at the
    beginning of the episode, after every real step and at the end.

    Args:
        env: Environment to interact with.
        epoch: Current epoch; stored in ``self._epoch`` and passed on to
            the callbacks.
        init_state: State to restore the environment to. When None, the
            environment is reset instead.
        time_limit: Step limit handed to ``envs.TimeLimitWrapper``. When
            None, the environment is not wrapped.

    Yields:
        Whatever the parent's ``solve``, ``self.reset`` and ``self.act``
        yield.

    Returns:
        data.Episode: The stacked transitions, the summed reward, the
        'solved' and 'TimeLimit.truncated' entries of the last step's info
        (None when absent) and the maximum size of the action space.
    """
    yield from super().solve(env, epoch, init_state, time_limit)
    self._epoch = epoch

    # Keep the unwrapped environment; it is the one the agent is reset with.
    model_env = env
    if time_limit is not None:
        env = envs.TimeLimitWrapper(env, time_limit)

    observation = (
        env.reset() if init_state is None
        else env.restore_state(init_state)
    )

    yield from self.reset(model_env, observation)

    for callback in self._callbacks:
        callback.on_episode_begin(env, observation, epoch)

    transitions = []
    info = {}
    while True:
        (action, agent_info) = yield from self.act(observation)
        (next_observation, reward, done, info) = env.step(action)

        for callback in self._callbacks:
            callback.on_real_step(
                agent_info, action, next_observation, reward, done
            )

        step = data.Transition(
            observation=observation,
            action=action,
            reward=reward,
            done=done,
            next_observation=next_observation,
            agent_info=agent_info,
        )
        transitions.append(step)
        observation = next_observation

        if done:
            break

    for callback in self._callbacks:
        callback.on_episode_end()

    transitions = self.postprocess_transitions(transitions)

    # The return is computed on the postprocessed transitions.
    return_ = sum(step.reward for step in transitions)
    solved = info.get('solved')
    truncated = info.get('TimeLimit.truncated')
    transition_batch = data.nested_stack(transitions)
    action_space_size = space.max_size(model_env.action_space)

    return data.Episode(
        transition_batch=transition_batch,
        return_=return_,
        solved=solved,
        truncated=truncated,
        action_space_size=action_space_size,
    )
def params_signature(action_space):
    """Returns the tensor signature of the parameters for an action space.

    Args:
        action_space: Space whose maximum size sets the tensor length.

    Returns:
        data.TensorSignature: Signature of a one-dimensional tensor with
        ``space_utils.max_size(action_space)`` entries.
    """
    n_actions = space_utils.max_size(action_space)
    return data.TensorSignature(shape=(n_actions,))