Example #1
    def __init__(self, agent_parameters: AgentParameters, has_target: bool, has_global: bool, name: str,
                 spaces: SpacesDefinition, replicated_device=None, worker_device=None):
        self.ap = agent_parameters
        self.network_parameters = self.ap.network_wrappers[name]
        self.has_target = has_target
        self.has_global = has_global
        self.name = name
        self.sess = None

        if self.network_parameters.framework == Frameworks.tensorflow:
            if "tensorflow" not in failed_imports:
                general_network = GeneralTensorFlowNetwork.construct
            else:
                raise Exception('Install tensorflow before using it as framework')
        elif self.network_parameters.framework == Frameworks.mxnet:
            if "mxnet" not in failed_imports:
                general_network = GeneralMxnetNetwork.construct
            else:
                raise Exception('Install mxnet before using it as framework')
        else:
            raise Exception("{} Framework is not supported"
                            .format(Frameworks().to_string(self.network_parameters.framework)))

        variable_scope = "{}/{}".format(self.ap.full_name_id, name)

        # Global network - the main network shared between threads
        self.global_network = None
        if self.has_global:
            # we assign the parameters of this network on the parameters server
            self.global_network = general_network(variable_scope=variable_scope,
                                                  devices=force_list(replicated_device),
                                                  agent_parameters=agent_parameters,
                                                  name='{}/global'.format(name),
                                                  global_network=None,
                                                  network_is_local=False,
                                                  spaces=spaces,
                                                  network_is_trainable=True)

        # Online network - local copy of the main network used for playing
        self.online_network = None
        self.online_network = general_network(variable_scope=variable_scope,
                                              devices=force_list(worker_device),
                                              agent_parameters=agent_parameters,
                                              name='{}/online'.format(name),
                                              global_network=self.global_network,
                                              network_is_local=True,
                                              spaces=spaces,
                                              network_is_trainable=True)

        # Target network - a local, slow updating network used for stabilizing the learning
        self.target_network = None
        if self.has_target:
            self.target_network = general_network(variable_scope=variable_scope,
                                                  devices=force_list(worker_device),
                                                  agent_parameters=agent_parameters,
                                                  name='{}/target'.format(name),
                                                  global_network=self.global_network,
                                                  network_is_local=True,
                                                  spaces=spaces,
                                                  network_is_trainable=False)
Example #2
    def post_training_commands(self):
        online_network = self.networks['main'].online_network

        # remove entropy regularization
        online_network.set_variable_value(
            online_network.output_heads[1].set_beta, 0,
            online_network.output_heads[1].beta_placeholder)

        # set the loss weights to the SIL loss weights
        for output_head_idx, output_head in enumerate(online_network.output_heads):
            online_network.set_variable_value(
                output_head.set_loss_weight,
                force_list(self.ap.network_wrappers['main'].sil_loss_weights[output_head_idx]),
                output_head.loss_weight_placeholder)

        # sil training
        for i in range(self.ap.algorithm.off_policy_training_steps_per_on_policy_training_steps):
            off_policy_loss = self.train_off_policy()

        # add back entropy regularization
        online_network.set_variable_value(
            online_network.output_heads[1].set_beta,
            self.ap.algorithm.beta_entropy,
            online_network.output_heads[1].beta_placeholder)

        # recover the regular loss weights
        for output_head_idx, output_head in enumerate(online_network.output_heads):
            online_network.set_variable_value(
                output_head.set_loss_weight,
                force_list(self.ap.network_wrappers['main'].loss_weights[output_head_idx]),
                output_head.loss_weight_placeholder)
Example #3
 def train_on_batch(self,
                    inputs,
                    targets,
                    scaler=1.,
                    additional_fetches=None,
                    importance_weights=None):
     """
     Given a batch of examples and targets, runs a forward pass & backward pass and then applies the gradients
     :param additional_fetches: Optional tensors to fetch during the training process
     :param inputs: The input for the network
     :param targets: The targets corresponding to the input batch
     :param scaler: A scaling factor that allows rescaling the gradients before applying them
     :param importance_weights: A coefficient for each sample in the batch, which will be used to rescale the loss
                                error of this sample. If it is not given, the sample losses won't be scaled
     :return: The loss of the network
     """
     if additional_fetches is None:
         additional_fetches = []
     additional_fetches = force_list(additional_fetches)
     loss = self.accumulate_gradients(inputs,
                                      targets,
                                      additional_fetches=additional_fetches,
                                      importance_weights=importance_weights)
     self.apply_and_reset_gradients(self.accumulated_gradients, scaler)
     return loss
Example #4
    def step(self, action: Union[List[ActionType], ActionType]) -> List[EnvResponse]:
        """
        Make a single step in the environment using the given action

        :param action: an action to use for stepping the environment. Should follow the definition of the action space.
        :return: the environment response as returned in get_last_env_response
        """
        # allow passing None actions to repeat the previously done action
        if action is None:
            action = self.last_action

        clipped_and_scaled_action = list()
        for agent_action, action_space in zip(force_list(action),
                                              force_list(self.action_space)):
            agent_action = action_space.clip_action_to_space(agent_action)
            if action_space and not action_space.contains(agent_action):
                raise ValueError(
                    "The given action does not match the action space definition. "
                    "Action = {}, action space definition = {}".format(
                        agent_action, action_space))
            if hasattr(action_space, 'scale_action_space') and action_space.scale_action_space:
                agent_action = action_space.scale_action_values(agent_action)
            clipped_and_scaled_action.append(agent_action)
        action = clipped_and_scaled_action

        # store the last action taken
        self.last_action = action

        self.current_episode_steps_counter += 1
        if self.phase != RunPhase.UNDEFINED:
            self.total_steps_counter += 1

        # act
        self._take_action(action)

        # observe
        self._update_state()

        self.total_reward_in_current_episode = [
            total_reward_in_current_episode + reward
            for total_reward_in_current_episode, reward in zip(
                self.total_reward_in_current_episode, self.reward)
        ]

        self.last_env_response = \
            [EnvResponse(
                next_state=state,
                reward=reward,
                game_over=done,
                goal=self.goal,
                info=self.info
            ) for state, reward, done in zip(self.state, self.reward, self.done)]

        return self.last_env_response
Example #5
    def _build_module(self):
        """
        self.state_in: tuple of placeholders containing the initial state
        self.state_out: tuple of output state

        todo: it appears that the shape of the output is (batch, feature).
        The code here seems to be slicing off the first element in the batch,
        which would definitely be wrong. Need to double check the shape.
        """

        self.layers.append(self.input)

        # optionally insert some layers before the LSTM
        for idx, layer_params in enumerate(self.layers_params):
            self.layers.extend(force_list(
                layer_params(self.layers[-1], name='fc{}'.format(idx),
                             is_training=self.is_training)
            ))

        # add the LSTM layer
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self.number_of_lstm_cells, state_is_tuple=True)
        self.c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
        self.h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
        self.state_init = [self.c_init, self.h_init]
        self.c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
        self.h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
        self.state_in = (self.c_in, self.h_in)
        rnn_in = tf.expand_dims(self.layers[-1], [0])
        step_size = tf.shape(self.layers[-1])[:1]
        state_in = tf.nn.rnn_cell.LSTMStateTuple(self.c_in, self.h_in)
        lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
            lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size, time_major=False)
        lstm_c, lstm_h = lstm_state
        self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
        self.output = tf.reshape(lstm_outputs, [-1, self.number_of_lstm_cells])
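A quick, framework-free check of the shape bookkeeping above (toy sizes, not from the source): the layers feeding the LSTM produce a (batch, features) tensor, and the expand_dims call turns it into a single sequence of length batch so that dynamic_rnn unrolls over it with a batch size of 1.

import numpy as np

# Toy stand-in for the (batch, features) output of the layers feeding the LSTM.
features = np.zeros((32, 256), dtype=np.float32)
rnn_in = np.expand_dims(features, 0)   # (1, 32, 256): batch of 1, "time" axis of length 32
step_size = features.shape[:1]         # (32,), mirrors tf.shape(self.layers[-1])[:1]
print(rnn_in.shape, step_size)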
Example #6
    def last_env_response(self, val: Union[List[EnvResponse], EnvResponse]):
        """
        Set the last environment response

        :param val: the last environment response
        """
        self._last_env_response = force_list(val)
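Every example on this page funnels possibly-scalar values through force_list before iterating over them. The helper itself is not shown here; below is a minimal sketch of its presumed behavior (the real implementation lives in rl_coach.utils and may differ):

def force_list(values):
    # Presumed behavior: leave lists untouched, wrap anything else in a single-element list.
    if isinstance(values, list):
        return values
    return [values]

With such a helper, force_list(3) == [3] and force_list([3]) == [3], which is why these methods can accept either a single item or a list of items.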
Example #7
 def should_dump_video_of_the_current_episode(self, episode_terminated=False):
     if self.visualization_parameters.video_dump_methods:
         for video_dump_method in force_list(self.visualization_parameters.video_dump_methods):
             if not video_dump_method.should_dump(episode_terminated, **self.__dict__):
                 return False
         return True
     return False
Example #8
    def _build_module(self) -> None:
        """
        Builds the graph of the module
        This method is called early on from __call__. It is expected to store the graph
        in self.output.
        :return: None
        """
        # NOTE: for image inputs, we expect the data format to be uint8 in order to be memory efficient. We chose not
        #  to implement the rescaling as an input filters.observation.observation_filter, as this would have caused the
        #  input to the network to be float, which is 4x more expensive in memory,
        #  thus making each saved transition in the memory 4x more expensive as well.

        input_layer = self.input / self.input_rescaling
        input_layer -= self.input_offset
        # clip the input using the given range
        if self.input_clipping is not None:
            input_layer = tf.clip_by_value(input_layer, self.input_clipping[0],
                                           self.input_clipping[1])

        self.layers.append(input_layer)

        for idx, layer_params in enumerate(self.layers_params):
            self.layers.extend(
                force_list(
                    layer_params(input_layer=self.layers[-1],
                                 name='{}_{}'.format(
                                     layer_params.__class__.__name__, idx),
                                 is_training=self.is_training)))

        self.output = tf.contrib.layers.flatten(self.layers[-1])
Example #9
 def __init__(self,
              agent_parameters: AgentParameters,
              spaces: SpacesDefinition,
              network_name: str,
              head_idx: int = 0,
              loss_weight: float = 1.,
              is_local: bool = True,
              activation_function: str = 'relu'):
     self.head_idx = head_idx
     self.network_name = network_name
     self.network_parameters = agent_parameters.network_wrappers[
         self.network_name]
     self.name = "head"
     self.output = []
     self.loss = []
     self.loss_type = []
     self.regularizations = []
     self.loss_weight = force_list(loss_weight)
     self.target = []
     self.importance_weight = []
     self.input = []
     self.is_local = is_local
     self.ap = agent_parameters
     self.spaces = spaces
     self.return_type = None
     self.activation_function = activation_function
Example #10
 def __init__(self,
              agent_parameters: AgentParameters,
              spaces: SpacesDefinition,
              network_name: str,
              head_idx: int = 0,
              loss_weight: float = 1.,
              is_local: bool = True,
              activation_function: str = 'relu',
              dense_layer=Dense):
     self.head_idx = head_idx
     self.network_name = network_name
     self.network_parameters = agent_parameters.network_wrappers[
         self.network_name]
     self.name = "head"
     self.output = []
     self.loss = []
     self.loss_type = []
     self.regularizations = []
     self.loss_weight = tf.Variable(
         [float(w) for w in force_list(loss_weight)],
         trainable=False,
         collections=[tf.GraphKeys.LOCAL_VARIABLES])
     self.target = []
     self.importance_weight = []
     self.input = []
     self.is_local = is_local
     self.ap = agent_parameters
     self.spaces = spaces
     self.return_type = None
     self.activation_function = activation_function
     self.dense_layer = dense_layer
     if self.dense_layer is None:
         self.dense_layer = Dense
     else:
         self.dense_layer = convert_layer_class(self.dense_layer)
Example #11
    def _build_module(self, input_layer):
        self.layers.append(input_layer)
        for idx, layer_params in enumerate(self.scheme):
            self.layers.extend(force_list(
                layer_params(input_layer=self.layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, idx))
            ))

        self.layers.append(self.dense_layer(self.num_actions)(self.layers[-1], name='output'))
        self.output = self.layers[-1]
Example #12
    def train_value_network(self, dataset, epochs):
        loss = []
        batch = Batch(dataset)
        network_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()

        # * Found not to have any impact *
        # add a timestep to the observation
        # current_states_with_timestep = self.concat_state_and_timestep(dataset)

        mix_fraction = self.ap.algorithm.value_targets_mix_fraction
        total_returns = batch.n_step_discounted_rewards(True)
        for j in range(epochs):
            curr_batch_size = batch.size
            if self.networks['critic'].online_network.optimizer_type != 'LBFGS':
                curr_batch_size = self.ap.network_wrappers['critic'].batch_size
            for i in range(batch.size // curr_batch_size):
                # split to batches for first order optimization techniques
                current_states_batch = {
                    k: v[i * curr_batch_size:(i + 1) * curr_batch_size]
                    for k, v in batch.states(network_keys).items()
                }
                total_return_batch = total_returns[i * curr_batch_size:(i + 1) * curr_batch_size]
                old_policy_values = force_list(
                    self.networks['critic'].target_network.predict(current_states_batch).squeeze())
                if self.networks['critic'].online_network.optimizer_type != 'LBFGS':
                    targets = total_return_batch
                else:
                    current_values = self.networks['critic'].online_network.predict(current_states_batch)
                    targets = current_values * (1 - mix_fraction) + total_return_batch * mix_fraction

                inputs = copy.copy(current_states_batch)
                for input_index, input in enumerate(old_policy_values):
                    name = 'output_0_{}'.format(input_index)
                    if name in self.networks['critic'].online_network.inputs:
                        inputs[name] = input

                value_loss = self.networks['critic'].online_network.accumulate_gradients(inputs, targets)

                self.networks['critic'].apply_gradients_to_online_network()
                if isinstance(self.ap.task_parameters, DistributedTaskParameters):
                    self.networks['critic'].apply_gradients_to_global_network()
                self.networks['critic'].online_network.reset_accumulated_gradients()

                loss.append([value_loss[0]])
        loss = np.mean(loss, 0)
        return loss
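The LBFGS branch above blends the critic's current predictions with the empirical returns using value_targets_mix_fraction. A toy numpy illustration of just that mixing step (all values are made up):

import numpy as np

mix_fraction = 0.1
current_values = np.array([1.0, 2.0, 3.0])      # critic predictions for the batch
total_return_batch = np.array([1.5, 1.5, 1.5])  # discounted returns for the same batch
targets = current_values * (1 - mix_fraction) + total_return_batch * mix_fraction
print(targets)  # [1.05 1.95 2.85]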
Example #13
    def filter(self, unfiltered_data: Union[EnvResponse, List[EnvResponse], Transition, List[Transition]],
               update_internal_state: bool=True, deep_copy: bool=True) -> Union[List[EnvResponse], List[Transition]]:
        """
        A wrapper around _filter which first copies the env_response so that we don't change the original one.
        This function should not be updated!
        :param unfiltered_data: the input data
        :param update_internal_state: should the filter's internal state change due to this call
        :param deep_copy: if True, deep copy the input data before filtering; otherwise only shallow copy it
        :return: the filtered env_response
        """
        if self.i_am_a_reference_filter:
            raise Exception("The filter being used is a reference filter. It is not to be used directly. "
                            "Instead get a duplicate from it by calling __call__.")
        if deep_copy:
            filtered_data = copy.deepcopy(unfiltered_data)
        else:
            filtered_data = [copy.copy(t) for t in force_list(unfiltered_data)]
        filtered_data = force_list(filtered_data)

        # TODO: implement observation space validation
        # filter observations
        if isinstance(filtered_data[0], Transition):
            state_objects_to_filter = [[f.state for f in filtered_data],
                                       [f.next_state for f in filtered_data]]
        elif isinstance(filtered_data[0], EnvResponse):
            state_objects_to_filter = [[f.next_state for f in filtered_data]]
        else:
            raise ValueError("unfiltered_data should be either of type EnvResponse or Transition. ")

        for state_object_list in state_objects_to_filter:
            for observation_name, filters in self._observation_filters.items():
                if observation_name in state_object_list[0].keys():
                    for filter in filters.values():
                        data_to_filter = [state_object[observation_name] for state_object in state_object_list]
                        if filter.supports_batching:
                            filtered_observations = filter.filter(
                                data_to_filter, update_internal_state=update_internal_state)
                        else:
                            filtered_observations = []
                            for data_point in data_to_filter:
                                filtered_observations.append(filter.filter(
                                    data_point, update_internal_state=update_internal_state))

                        for i, state_object in enumerate(state_object_list):
                            state_object[observation_name] = filtered_observations[i]

        # filter reward
        for filter in self._reward_filters.values():
            if filter.supports_batching:
                filtered_rewards = filter.filter([f.reward for f in filtered_data], update_internal_state)
                for d, filtered_reward in zip(filtered_data, filtered_rewards):
                    d.reward = filtered_reward
            else:
                for d in filtered_data:
                    d.reward = filter.filter(d.reward, update_internal_state)

        return filtered_data
Example #14
    def _build_module(self):
        self.layers.append(self.input)

        for idx, layer_params in enumerate(self.layers_params):
            self.layers.extend(force_list(
                layer_params(self.layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, idx),
                             is_training=self.is_training)
            ))

        self.output = self.layers[-1]
Example #15
    def __call__(self, input_layer):
        """
        Wrapper for building the module graph including scoping and loss creation
        :param input_layer: the input to the graph
        :return: the output of the last layer and the target placeholder
        """
        with tf.variable_scope(
                self.get_name(),
                initializer=tf.contrib.layers.xavier_initializer()):
            self._build_module(input_layer)

            self.output = force_list(self.output)
            self.target = force_list(self.target)
            self.input = force_list(self.input)
            self.loss_type = force_list(self.loss_type)
            self.loss = force_list(self.loss)
            self.regularizations = force_list(self.regularizations)
            if self.is_local:
                self.set_loss()
            self._post_build()

        if self.is_local:
            return self.output, self.target, self.input, self.importance_weight
        else:
            return self.output, self.input
Example #16
    def _build_module(self):
        self.output = []

        for stream_idx in range(self.num_streams):
            layers = [self.input]

            for idx, layer_params in enumerate(self.layers_params):
                layers.extend(force_list(
                    layer_params(layers[-1], name='{}_{}'.format(layer_params.__class__.__name__,
                                                                 idx + stream_idx * len(self.layers_params)),
                                 is_training=self.is_training)
                ))
            self.output.append((layers[-1]))
Example #17
    def prepare_batch_for_inference(self,
                                    states: Union[Dict[str, np.ndarray],
                                                  List[Dict[str, np.ndarray]]],
                                    network_name: str):
        """
        Convert curr_state into the input tensors tensorflow is expecting, i.e. if we have several input states,
        stack all observations together, measurements together, etc.
        """
        # convert to batch so we can run it through the network
        states = force_list(states)
        batches_dict = {}
        for key in self.ap.network_wrappers[network_name].input_embedders_parameters.keys():
            # there are cases (e.g. ddpg) where the state does not contain all the information needed for running
            # through the network and this has to be added externally (e.g. ddpg where the action needs to be given in
            # addition to the current_state, so that all the inputs of the network will be filled)
            if key in states[0].keys():
                batches_dict[key] = np.array([np.array(state[key]) for state in states])

        return batches_dict
Example #18
    def prepare_batch_for_inference(self, states: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]],
                                    network_name: str) -> Dict[str, np.array]:
        """
        Convert curr_state into the input tensors tensorflow is expecting, i.e. if we have several input states,
        stack all observations together, measurements together, etc.

        :param states: A list of environment states, where each one is a dict mapping from an observation name to its
                       corresponding observation
        :param network_name: The agent network name to prepare the batch for. this is needed in order to extract only
                             the observation relevant for the network from the states.
        :return: A dictionary containing a list of values from all the given states for each of the observations
        """
        # convert to batch so we can run it through the network
        states = force_list(states)
        batches_dict = {}
        for key in self.ap.network_wrappers[network_name].input_embedders_parameters.keys():
            # there are cases (e.g. ddpg) where the state does not contain all the information needed for running
            # through the network and this has to be added externally (e.g. ddpg where the action needs to be given in
            # addition to the current_state, so that all the inputs of the network will be filled)
            if key in states[0].keys():
                batches_dict[key] = np.array([np.array(state[key]) for state in states])

        return batches_dict
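A self-contained illustration of what prepare_batch_for_inference produces: one ndarray per observation name, stacked across the given states (the observation names and shapes below are made up):

import numpy as np

states = [{'observation': np.zeros((84, 84)), 'measurements': np.array([0.0, 1.0])}
          for _ in range(3)]
batches_dict = {key: np.array([np.array(state[key]) for state in states])
                for key in states[0].keys()}
print(batches_dict['observation'].shape, batches_dict['measurements'].shape)  # (3, 84, 84) (3, 2)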
Example #19
    def accumulate_gradients(self,
                             inputs: Dict[str, np.ndarray],
                             targets: List[np.ndarray],
                             additional_fetches: List[Tuple[int, str]] = None,
                             importance_weights: np.ndarray = None,
                             no_accumulation: bool = False) -> Tuple[float, List[float], float, list]:
        """
        Runs a forward & backward pass, clips gradients if needed and accumulates them into the accumulation
        :param inputs: environment states (observation, etc.) as well as extra inputs required by the loss. Shape of
            ndarray is (batch_size, observation_space_size) or (batch_size, observation_space_size, stack_size)
        :param targets: targets required by the loss (e.g. sum of discounted rewards)
        :param additional_fetches: additional fetches to calculate and return. Each fetch is specified as (int, str)
            tuple of head-type-index and fetch-name. The tuple is obtained from each head.
        :param importance_weights: ndarray of shape (batch_size,) to multiply with batch loss.
        :param no_accumulation: if True, set gradient values to the new gradients, otherwise sum with previously
            calculated gradients
        :return: tuple of total_loss, losses, norm_unclipped_grads, fetched_tensors
            total_loss (float): sum of all head losses
            losses (list of float): list of all losses. The order is the list of target losses followed by the list
                of regularization losses. The specifics of the losses depend on the network parameters
                (number of heads, etc.)
            norm_unclipped_grads (float): global norm of all gradients before any gradient clipping is applied
            fetched_tensors: all values for additional_fetches
        """
        if self.accumulated_gradients is None:
            self.reset_accumulated_gradients()

        embedders = [emb.embedder_name for emb in self.model.nets[0].input_embedders]
        nd_inputs = tuple(nd.array(inputs[emb]) for emb in embedders)

        assert self.middleware.__class__.__name__ != 'LSTMMiddleware', "LSTM middleware not supported"

        targets = force_list(targets)
        with autograd.record():
            out_per_head = utils.split_outputs_per_head(self.model(*nd_inputs), self.model.output_heads)
            tgt_per_loss = utils.split_targets_per_loss(targets, self.losses)

            losses = list()
            regularizations = list()
            additional_fetches = [(k, None) for k in (additional_fetches or [])]
            for h, h_loss, h_out, l_tgt in zip(self.model.output_heads, self.losses, out_per_head, tgt_per_loss):
                l_in = utils.get_loss_agent_inputs(inputs, head_type_idx=h.head_type_idx, loss=h_loss)
                # Align arguments with loss.loss_forward and convert to NDArray
                l_args = utils.to_mx_ndarray(utils.align_loss_args(h_out, l_in, l_tgt, h_loss))
                # Calculate loss and all auxiliary outputs
                loss_outputs = utils.loss_output_dict(utils.to_list(h_loss(*l_args)), h_loss.output_schema)
                if LOSS_OUT_TYPE_LOSS in loss_outputs:
                    losses.extend(loss_outputs[LOSS_OUT_TYPE_LOSS])
                if LOSS_OUT_TYPE_REGULARIZATION in loss_outputs:
                    regularizations.extend(loss_outputs[LOSS_OUT_TYPE_REGULARIZATION])
                # Set additional fetches
                for i, fetch in enumerate(additional_fetches):
                    head_type_idx, fetch_name = fetch[0]  # fetch key is a tuple of (head_type_index, fetch_name)
                    if head_type_idx == h.head_type_idx:
                        assert fetch[1] is None  # sanity check that fetch is None
                        additional_fetches[i] = (fetch[0], loss_outputs[fetch_name])

            # Total loss is losses and regularization (NOTE: order is important)
            total_loss_list = losses + regularizations
            total_loss = nd.add_n(*total_loss_list)

        # Calculate gradients
        total_loss.backward()

        assert self.optimizer_type != 'LBFGS', 'LBFGS not supported'

        # allreduce gradients from all contexts
        self.trainer.allreduce_grads()

        # Calculate global norm of gradients
        # FIXME global norm is returned even when not used for clipping! Is this necessary?
        # FIXME global norm might be calculated twice if clipping method is global norm
        norm_unclipped_grads = utils.global_norm(self._model_grads)

        # Clip gradients
        if self.network_parameters.clip_gradients:
            utils.clip_grad(
                self._model_grads,
                clip_method=self.network_parameters.gradients_clipping_method,
                clip_val=self.network_parameters.clip_gradients,
                inplace=True)

        # Update self.accumulated_gradients depending on no_accumulation flag
        if no_accumulation:
            for acc_grad, model_grad in zip(self.accumulated_gradients, self._model_grads):
                acc_grad[:] = model_grad
        else:
            for acc_grad, model_grad in zip(self.accumulated_gradients, self._model_grads):
                acc_grad += model_grad

        # result of additional fetches
        fetched_tensors = [fetch[1] for fetch in additional_fetches]

        # convert everything to numpy or scalar before returning
        result = utils.asnumpy_or_asscalar((total_loss, total_loss_list, norm_unclipped_grads, fetched_tensors))
        return result
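The no_accumulation switch at the end of the method boils down to an accumulate-or-overwrite choice per gradient array. A toy numpy version of just that step (gradient values are made up):

import numpy as np

accumulated_gradients = [np.zeros(3), np.zeros(2)]
model_grads = [np.array([0.1, 0.2, 0.3]), np.array([1.0, -1.0])]
no_accumulation = False

for acc_grad, model_grad in zip(accumulated_gradients, model_grads):
    if no_accumulation:
        acc_grad[:] = model_grad   # overwrite with the latest gradients
    else:
        acc_grad += model_grad     # sum with previously accumulated gradients
print(accumulated_gradients)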
Example #20
    def train_policy_network(self, dataset, epochs):
        loss = []
        for j in range(epochs):
            loss = {
                'total_loss': [],
                'policy_losses': [],
                'unclipped_grads': [],
                'fetch_result': []
            }
            #shuffle(dataset)
            actor_batch_size = self.ap.network_wrappers['actor'].batch_size
            for i in range(len(dataset) // actor_batch_size):
                batch = Batch(dataset[i * actor_batch_size:(i + 1) * actor_batch_size])

                network_keys = self.ap.network_wrappers['actor'].input_embedders_parameters.keys()

                advantages = batch.info('advantage')
                actions = batch.actions()
                if not isinstance(self.spaces.action, DiscreteActionSpace) and len(actions.shape) == 1:
                    actions = np.expand_dims(actions, -1)

                # get old policy probabilities and distribution
                old_policy = force_list(
                    self.networks['actor'].target_network.predict(batch.states(network_keys)))

                # calculate gradients and apply on both the local policy network and on the global policy network
                fetches = [
                    self.networks['actor'].online_network.output_heads[0].kl_divergence,
                    self.networks['actor'].online_network.output_heads[0].entropy
                ]

                inputs = copy.copy(batch.states(network_keys))
                inputs['output_0_0'] = actions

                # old_policy_distribution needs to be represented as a list, because in the event of discrete controls,
                # it has just a mean. otherwise, it has both a mean and standard deviation
                for input_index, input in enumerate(old_policy):
                    inputs['output_0_{}'.format(input_index + 1)] = input

                total_loss, policy_losses, unclipped_grads, fetch_result =\
                    self.networks['actor'].online_network.accumulate_gradients(
                        inputs, [advantages], additional_fetches=fetches)

                self.networks['actor'].apply_gradients_to_online_network()
                if isinstance(self.ap.task_parameters, DistributedTaskParameters):
                    self.networks['actor'].apply_gradients_to_global_network()

                self.networks['actor'].online_network.reset_accumulated_gradients()

                loss['total_loss'].append(total_loss)
                loss['policy_losses'].append(policy_losses)
                loss['unclipped_grads'].append(unclipped_grads)
                loss['fetch_result'].append(fetch_result)

                self.unclipped_grads.add_sample(unclipped_grads)

            for key in loss.keys():
                loss[key] = np.mean(loss[key], 0)

            if self.ap.network_wrappers['critic'].learning_rate_decay_rate != 0:
                curr_learning_rate = self.networks['critic'].online_network.get_variable_value(
                    self.ap.learning_rate)
                self.curr_learning_rate.add_sample(curr_learning_rate)
            else:
                curr_learning_rate = self.ap.network_wrappers['critic'].learning_rate

            # log training parameters
            screen.log_dict(OrderedDict([
                ("Surrogate loss", loss['policy_losses'][0]),
                ("KL divergence", loss['fetch_result'][0]),
                ("Entropy", loss['fetch_result'][1]),
                ("training epoch", j),
                ("learning_rate", curr_learning_rate)
            ]), prefix="Policy training")

        self.total_kl_divergence_during_training_process = loss['fetch_result'][0]
        self.entropy.add_sample(loss['fetch_result'][1])
        self.kl_divergence.add_sample(loss['fetch_result'][0])
        return loss['total_loss']
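The inner loop above walks the dataset in fixed-size slices and silently drops the trailing remainder; a toy illustration of that slicing (numbers are made up):

dataset = list(range(10))
batch_size = 4
for i in range(len(dataset) // batch_size):
    print(dataset[i * batch_size:(i + 1) * batch_size])
# [0, 1, 2, 3]
# [4, 5, 6, 7]  -> the last 2 samples are dropped, exactly as in the loop above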
Example #21
    def accumulate_gradients(self, inputs, targets, additional_fetches=None, importance_weights=None,
                             no_accumulation=False):
        """
        Runs a forward pass & backward pass, clips gradients if needed and accumulates them into the accumulation
        placeholders
        :param additional_fetches: Optional tensors to fetch during gradients calculation
        :param inputs: The input batch for the network
        :param targets: The targets corresponding to the input batch
        :param importance_weights: A coefficient for each sample in the batch, which will be used to rescale the loss
                                   error of this sample. If it is not given, the sample losses won't be scaled
        :param no_accumulation: If set to True, the gradients in the accumulated gradients placeholder will be
                                replaced by the newly calculated gradients instead of accumulating the new gradients.
                                This can speed up the function runtime by around 10%.
        :return: A list containing the total loss and the individual network heads losses
        """

        if self.accumulated_gradients is None:
            self.reset_accumulated_gradients()

        # feed inputs
        if additional_fetches is None:
            additional_fetches = []
        feed_dict = self.create_feed_dict(inputs)
        #var_list = self.create_variable_list()
        # feed targets
        targets = force_list(targets)
        for placeholder_idx, target in enumerate(targets):
            feed_dict[self.targets[placeholder_idx]] = target

        # feed importance weights
        importance_weights = force_list(importance_weights)
        for placeholder_idx, target_ph in enumerate(targets):
            if len(importance_weights) <= placeholder_idx or importance_weights[placeholder_idx] is None:
                importance_weight = np.ones(target_ph.shape[0])
            else:
                importance_weight = importance_weights[placeholder_idx]
            importance_weight = np.reshape(importance_weight, (-1,) + (1,)*(len(target_ph.shape)-1))

            feed_dict[self.importance_weights[placeholder_idx]] = importance_weight

        if self.optimizer_type != 'LBFGS':

            # feed the lstm state if necessary
            if self.middleware.__class__.__name__ == 'LSTMMiddleware':
                # we can't always assume that we are starting from scratch here can we?
                feed_dict[self.middleware.c_in] = self.middleware.c_init
                feed_dict[self.middleware.h_in] = self.middleware.h_init

            fetches = self.train_fetches + additional_fetches
            if self.ap.visualization.tensorboard:
                fetches += [self.merged]

            # get grads
            result = self.sess.run(fetches, feed_dict=feed_dict)
            if hasattr(self, 'train_writer') and self.train_writer is not None:
                self.train_writer.add_summary(result[-1], self.sess.run(self.global_step))

            # extract the fetches
            norm_unclipped_grads, grads, total_loss, losses = result[:4]
            if self.middleware.__class__.__name__ == 'LSTMMiddleware':
                (self.curr_rnn_c_in, self.curr_rnn_h_in) = result[4]
            fetched_tensors = []
            if len(additional_fetches) > 0:
                fetched_tensors = result[self.additional_fetches_start_idx:self.additional_fetches_start_idx +
                                                                      len(additional_fetches)]

            # accumulate the gradients
            for idx, grad in enumerate(grads):
                if no_accumulation:
                    self.accumulated_gradients[idx] = grad
                else:
                    self.accumulated_gradients[idx] += grad

            return total_loss, losses, norm_unclipped_grads, fetched_tensors

        else:
            self.optimizer.minimize(session=self.sess, feed_dict=feed_dict)

            return [0]
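The reshape applied to the importance weights above exists purely for broadcasting: one coefficient per sample is padded with singleton axes so it multiplies every element of that sample's loss term. A toy numpy check (shapes are made up):

import numpy as np

targets = np.zeros((4, 5, 3))                         # stands in for one target placeholder's batch
importance_weight = np.array([0.5, 1.0, 2.0, 0.25])   # one coefficient per sample
importance_weight = np.reshape(importance_weight, (-1,) + (1,) * (targets.ndim - 1))
print(importance_weight.shape)                        # (4, 1, 1): broadcasts over each sample's loss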
Example #22
    # if no arg is given
    if len(sys.argv) == 1:
        parser.print_help()
        exit(0)

    dir_prefix = args.dir_prefix
    preset = args.preset
    levels = args.level.split(',') if args.level is not None else [None]
    num_seeds = args.seeds
    num_workers = args.num_workers
    gpu = [int(gpu) for gpu in args.gpu.split(',')]
    level_as_sub_dir = args.level_as_sub_dir

    processes = []
    gpu_list = force_list(gpu)
    curr_gpu_idx = 0
    for level in levels:
        for seed in range(num_seeds):
            # select the next gpu for this run
            set_gpu(gpu_list[curr_gpu_idx])

            command = [
                'python3', 'rl_coach/coach.py', '-ns', '-p',
                '{}'.format(preset), '--seed', '{}'.format(seed), '-n',
                '{}'.format(num_workers)
            ]
            if dir_prefix != "":
                dir_prefix += "_"
            if args.use_cpu:
                command.append("-c")
Example #23
    def _build_module(self):
        self.layers.append(self.input)

        self.activation_function = tf.nn.relu
        initializer = tf.keras.initializers.VarianceScaling(scale=2.0)
        window_size = (3, 3)
        self.layers.append(
            Conv2D(64,
                   window_size,
                   padding='same',
                   activation=self.activation_function,
                   kernel_initializer=initializer)(self.layers[-1]))
        self.layers.append(
            Conv2D(64,
                   window_size,
                   padding='same',
                   activation=self.activation_function,
                   kernel_initializer=initializer)(self.layers[-1]))
        self.layers.append(MaxPooling2D()(self.layers[-1]))
        self.layers.append(
            Conv2D(128,
                   window_size,
                   padding='same',
                   activation=self.activation_function,
                   kernel_initializer=initializer)(self.layers[-1]))
        self.layers.append(
            Conv2D(128,
                   window_size,
                   padding='same',
                   activation=self.activation_function,
                   kernel_initializer=initializer)(self.layers[-1]))
        self.layers.append(MaxPooling2D()(self.layers[-1]))
        self.layers.append(
            Conv2D(256,
                   window_size,
                   padding='same',
                   activation=self.activation_function,
                   kernel_initializer=initializer)(self.layers[-1]))
        self.layers.append(
            Conv2D(256,
                   window_size,
                   padding='same',
                   activation=self.activation_function,
                   kernel_initializer=initializer)(self.layers[-1]))
        self.layers.append(
            Conv2D(256,
                   window_size,
                   padding='same',
                   activation=self.activation_function,
                   kernel_initializer=initializer)(self.layers[-1]))
        self.layers.append(MaxPooling2D()(self.layers[-1]))
        self.layers.append(
            Conv2D(512,
                   window_size,
                   padding='same',
                   activation=self.activation_function,
                   kernel_initializer=initializer)(self.layers[-1]))
        self.layers.append(
            Conv2D(512,
                   window_size,
                   padding='same',
                   activation=self.activation_function,
                   kernel_initializer=initializer)(self.layers[-1]))
        self.layers.append(
            Conv2D(512,
                   window_size,
                   padding='same',
                   activation=self.activation_function,
                   kernel_initializer=initializer)(self.layers[-1]))
        self.layers.append(MaxPooling2D()(self.layers[-1]))
        self.layers.append(
            Conv2D(512,
                   window_size,
                   padding='same',
                   activation=self.activation_function,
                   kernel_initializer=initializer)(self.layers[-1]))
        self.layers.append(
            Conv2D(512,
                   window_size,
                   padding='same',
                   activation=self.activation_function,
                   kernel_initializer=initializer)(self.layers[-1]))
        self.layers.append(
            Conv2D(512,
                   window_size,
                   padding='same',
                   activation=self.activation_function,
                   kernel_initializer=initializer)(self.layers[-1]))
        self.layers.append(MaxPooling2D()(self.layers[-1]))
        self.layers.append(Flatten()(self.layers[-1]))

        for idx, layer_params in enumerate(self.layers_params):
            print(idx, layer_params)
            self.layers.extend(
                force_list(
                    layer_params(self.layers[-1],
                                 name='{}_{}'.format(
                                     layer_params.__class__.__name__, idx),
                                 is_training=self.is_training,
                                 kernel_initializer=initializer,
                                 activation=self.activation_function)))

        self.output = self.layers[-1]
Example #24
 def __init__(self, run_phases: Union[RunPhase, List[RunPhase]]):
     self.run_phases = force_list(run_phases)
Example #25
 def __init__(self, params: Union[List, int]):
     """
     :param params: list of [num_output_neurons]
     """
     self.params = force_list(params)
Example #26
 def __init__(self, params: List):
     """
     :param params: list of [num_output_neurons]
     """
     self.params = force_list(params)
     self.sigma0 = 0.5
Example #27
    def __init__(self, level: LevelSelection, seed: int, frame_skip: int,
                 human_control: bool, custom_reward_threshold: Union[int, float],
                 visualization_parameters: VisualizationParameters,
                 server_height: int, server_width: int, camera_height: int,
                 camera_width: int, verbose: bool,
                 experiment_suite: ExperimentSuite, config: str,
                 episode_max_time: int, allow_braking: bool,
                 quality: CarlaEnvironmentParameters.Quality,
                 cameras: List[CameraTypes], weather_id: List[int],
                 experiment_path: str,
                 separate_actions_for_throttle_and_brake: bool,
                 num_speedup_steps: int, max_speed: float, **kwargs):
        super().__init__(level, seed, frame_skip, human_control,
                         custom_reward_threshold, visualization_parameters)

        # server configuration
        self.server_height = server_height
        self.server_width = server_width
        self.port = get_open_port()
        self.host = 'localhost'
        self.map_name = CarlaLevel[level.upper()].value['map_name']
        self.map_path = CarlaLevel[level.upper()].value['map_path']
        self.experiment_path = experiment_path

        # client configuration
        self.verbose = verbose
        self.quality = quality
        self.cameras = cameras
        self.weather_id = weather_id
        self.episode_max_time = episode_max_time
        self.allow_braking = allow_braking
        self.separate_actions_for_throttle_and_brake = separate_actions_for_throttle_and_brake
        self.camera_width = camera_width
        self.camera_height = camera_height

        # setup server settings
        self.experiment_suite = experiment_suite
        self.config = config
        if self.config:
            # load settings from file
            with open(self.config, 'r') as fp:
                self.settings = fp.read()
        else:
            # hard coded settings
            self.settings = CarlaSettings()
            self.settings.set(SynchronousMode=True,
                              SendNonPlayerAgentsInfo=False,
                              NumberOfVehicles=15,
                              NumberOfPedestrians=30,
                              WeatherId=random.choice(
                                  force_list(self.weather_id)),
                              QualityLevel=self.quality.value,
                              SeedVehicles=seed,
                              SeedPedestrians=seed)
            if seed is None:
                self.settings.randomize_seeds()

            self.settings = self._add_cameras(self.settings, self.cameras,
                                              self.camera_width,
                                              self.camera_height)

        # open the server
        self.server = self._open_server()

        logging.disable(40)

        # open the client
        self.game = CarlaClient(self.host, self.port, timeout=99999999)
        self.game.connect()
        if self.experiment_suite:
            self.current_experiment_idx = 0
            self.current_experiment = self.experiment_suite.get_experiments()[
                self.current_experiment_idx]
            self.scene = self.game.load_settings(
                self.current_experiment.conditions)
        else:
            self.scene = self.game.load_settings(self.settings)

        # get available start positions
        self.positions = self.scene.player_start_spots
        self.num_positions = len(self.positions)
        self.current_start_position_idx = 0
        self.current_pose = 0

        # state space
        self.state_space = StateSpace({
            "measurements":
            VectorObservationSpace(
                4, measurements_names=["forward_speed", "x", "y", "z"])
        })
        for camera in self.scene.sensors:
            self.state_space[camera.name] = ImageObservationSpace(
                shape=np.array([self.camera_height, self.camera_width, 3]),
                high=255)

        # action space
        if self.separate_actions_for_throttle_and_brake:
            self.action_space = BoxActionSpace(
                shape=3,
                low=np.array([-1, 0, 0]),
                high=np.array([1, 1, 1]),
                descriptions=["steer", "gas", "brake"])
        else:
            self.action_space = BoxActionSpace(
                shape=2,
                low=np.array([-1, -1]),
                high=np.array([1, 1]),
                descriptions=["steer", "gas_and_brake"])

        # human control
        if self.human_control:
            # convert continuous action space to discrete
            self.steering_strength = 0.5
            self.gas_strength = 1.0
            self.brake_strength = 0.5
            # TODO: reverse order of actions
            self.action_space = PartialDiscreteActionSpaceMap(
                target_actions=[[0., 0.], [0., -self.steering_strength],
                                [0., self.steering_strength],
                                [self.gas_strength, 0.],
                                [-self.brake_strength, 0],
                                [self.gas_strength, -self.steering_strength],
                                [self.gas_strength, self.steering_strength],
                                [self.brake_strength, -self.steering_strength],
                                [self.brake_strength, self.steering_strength]],
                descriptions=[
                    'NO-OP', 'TURN_LEFT', 'TURN_RIGHT', 'GAS', 'BRAKE',
                    'GAS_AND_TURN_LEFT', 'GAS_AND_TURN_RIGHT',
                    'BRAKE_AND_TURN_LEFT', 'BRAKE_AND_TURN_RIGHT'
                ])

            # map keyboard keys to actions
            for idx, action in enumerate(self.action_space.descriptions):
                for key in key_map.keys():
                    if action == key:
                        self.key_to_action[key_map[key]] = idx

        self.num_speedup_steps = num_speedup_steps
        self.max_speed = max_speed

        # measurements
        self.autopilot = None
        self.planner = Planner(self.map_name)

        # env initialization
        self.reset_internal_state(True)

        # render
        if self.is_rendered:
            image = self.get_rendered_image()
            self.renderer.create_screen(image.shape[1], image.shape[0])