Example #1
    def get_q_values(self, states: str, actions: str, use_target_network: bool) -> str:
        state_action_pairs, _ = C2.Concat(states, actions, axis=1)
        q_values = C2.NextBlob("q_values")
        if use_target_network:
            self.target_network.make_forward_pass_ops(
                C2.model(), state_action_pairs, q_values, True
            )
        else:
            self.ml_trainer.make_forward_pass_ops(
                C2.model(), state_action_pairs, q_values, True
            )
        return q_values
Example #2
    def update_model(self, states: str, actions: str,
                     q_vals_target: str) -> None:
        """
        Takes in states, actions, and target q values. Updates the model:

            Runs the forward pass, computing Q(states, actions).
                Q(states, actions)[i][j] is an approximation of Q*(states[i], action_j).
            Computes the loss of Q(states, actions) with respect to q_vals_target
            Updates the Q Network's weights according to the loss and optimizer

        :param states: Numpy array with shape (batch_size, state_dim). The ith
            row is a representation of the ith transition's state.
        :param actions: Numpy array with shape (batch_size, action_dim). The ith
            row is a representation of the ith transition's action.
        :param q_vals_target: Numpy array with shape (batch_size, 1). The ith
            row is the label to train against for the data from the ith transition.
        """
        model = C2.model()
        q_vals_target = C2.StopGradient(q_vals_target)
        q_values = C2.NextBlob("train_output")
        state_action_pairs, _ = C2.Concat(states, actions, axis=1)
        self.ml_trainer.make_forward_pass_ops(model, state_action_pairs,
                                              q_values, False)

        self.loss_blob = self.ml_trainer.generateLossOps(
            model, q_values, q_vals_target)
        model.AddGradientOperators([self.loss_blob])
        for param in model.params:
            if param in model.param_to_grad:
                param_grad = model.param_to_grad[param]
                param_grad = C2.NanCheck(param_grad)
        self.ml_trainer.addParameterUpdateOps(model)
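
A minimal numpy sketch of the update described in the docstring above, assuming a single linear layer, a mean-squared-error loss, and plain SGD; sketch_update, W, b, and lr are illustrative names, not part of the trainer's API.

import numpy as np

def sketch_update(W, b, states, actions, q_vals_target, lr=1e-3):
    # C2.Concat(states, actions, axis=1): build the state-action input
    x = np.concatenate([states, actions], axis=1)
    # Forward pass: Q(states, actions), shape (batch_size, 1)
    q = x @ W + b
    # q_vals_target is held constant (the StopGradient above), so the loss
    # only backpropagates through q
    diff = q - q_vals_target
    loss = np.mean(diff ** 2)
    grad_q = 2.0 * diff / diff.shape[0]
    # Plain SGD step on the parameters
    W -= lr * (x.T @ grad_q)
    b -= lr * grad_q.sum(axis=0)
    return loss
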
Example #3
    def update_model(self, states: str, actions: str, q_vals_target: str) -> None:
        """
        Takes in states, actions, and target q values. Updates the model:

            Runs the forward pass, computing Q(states, actions).
                Q(states, actions)[i][j] is an approximation of Q*(states[i], action_j).
            Computes the loss of Q(states, actions) with respect to q_vals_target
            Updates the Q Network's weights according to the loss and optimizer

        :param states: Numpy array with shape (batch_size, state_dim). The ith
            row is a representation of the ith transition's state.
        :param actions: Numpy array with shape (batch_size, action_dim). The ith
            row contains the one-hotted representation of the ith action.
        :param q_vals_target: Numpy array with shape (batch_size, 1). The ith
            row is the label to train against for the data from the ith transition.
        """
        model = C2.model()
        q_vals_target = C2.StopGradient(q_vals_target)
        output_blob = C2.NextBlob("train_output")
        if self.conv_ml_trainer is not None:
            conv_output_blob = C2.NextBlob("conv_output")
            self.conv_ml_trainer.make_conv_pass_ops(model, states, conv_output_blob)
            states = conv_output_blob

        self.ml_trainer.make_forward_pass_ops(model, states, output_blob, False)
        q_val_select = C2.ReduceBackSum(C2.Mul(output_blob, actions))
        q_values = C2.ExpandDims(q_val_select, dims=[1])

        self.loss_blob = self.ml_trainer.generateLossOps(model, q_values, q_vals_target)
        model.AddGradientOperators([self.loss_blob])
        for param in model.params:
            if param in model.param_to_grad:
                param_grad = model.param_to_grad[param]
                param_grad = C2.NanCheck(param_grad)
        self.ml_trainer.addParameterUpdateOps(model)
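
The Mul / ReduceBackSum / ExpandDims sequence above selects, for each row, the Q value of the action that was actually taken. A numpy equivalent for illustration; select_taken_q is a hypothetical name, not part of the trainer.

import numpy as np

def select_taken_q(all_q, actions_onehot):
    # all_q: (batch_size, action_dim) network outputs
    # actions_onehot: same shape, with a 1 in the column of the taken action
    # Mul zeroes out every other column, ReduceBackSum collapses each row,
    # and ExpandDims restores the trailing axis to match (batch_size, 1) targets
    q_val_select = (all_q * actions_onehot).sum(axis=1)
    return q_val_select[:, None]
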
Example #4
    def update_model(
        self,
        states: str,
        actions: str,
        q_vals_target: str,
    ) -> None:
        """
        Takes in states, actions, and target q values. Updates the model:

            Runs the forward pass, computing Q(states, actions).
                Q(states, actions)[i][j] is an approximation of Q*(states[i], action_j).
            Computes the loss of Q(states, actions) with respect to q_vals_target
            Updates the Q Network's weights according to the loss and optimizer

        :param states: Numpy array with shape (batch_size, state_dim). The ith
            row is a representation of the ith transition's state.
        :param actions: Numpy array with shape (batch_size, action_dim). The ith
            row contains the one-hotted representation of the ith action.
        :param q_vals_target: Numpy array with shape (batch_size, 1). The ith
            row is the label to train against for the data from the ith transition.
        """
        model = C2.model()
        q_vals_target = C2.StopGradient(q_vals_target)
        output_blob = C2.NextBlob("train_output")
        MakeForwardPassOps(
            model,
            self.model_id,
            states,
            output_blob,
            self.weights,
            self.biases,
            self.activations,
            self.layers,
            self.dropout_ratio,
            False,
        )
        q_val_select = C2.ReduceBackSum(C2.Mul(output_blob, actions))
        q_values = C2.ExpandDims(q_val_select, dims=[1])

        self.loss_blob = GenerateLossOps(
            model,
            q_values,
            q_vals_target,
        )
        model.AddGradientOperators([self.loss_blob])
        for param in model.params:
            if param in model.param_to_grad:
                param_grad = model.param_to_grad[param]
                param_grad = C2.NanCheck(param_grad)
        AddParameterUpdateOps(
            model,
            optimizer_input=self.optimizer,
            base_learning_rate=self.learning_rate,
            gamma=self.gamma,
            policy=self.lr_policy,
        )
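
The loop over model.params above attaches a NanCheck op to each gradient blob before the parameter update. A rough numpy illustration of what that guards against; check_grads and the dict layout are assumptions, not the C2 operator itself.

import numpy as np

def check_grads(grads):
    # Fail fast if any gradient picked up NaNs during the backward pass
    for name, grad in grads.items():
        assert not np.isnan(grad).any(), "NaN in gradient of " + name
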
    def get_q_values_all_actions(self, states: str, use_target_network: bool) -> str:
        """
        Takes in a set of states and runs the test Q Network on them.

        Creates Q(states, actions), a blob with shape (batch_size, action_dim).
        Q(states, actions)[i][j] is an approximation of Q*(states[i], action_j).
        Note that action_j takes on every possible action (of which there are
        self.action_dim_). Returns the blob holding these values.

        :param states: Numpy array with shape (batch_size, state_dim). Each row
            contains a representation of a state.
        :param use_target_network: Boolean that indicates whether or not to use this
            trainer's TargetNetwork to compute Q values.
        """
        all_q_values = C2.NextBlob("all_q_values")
        if use_target_network:
            if self.conv_target_network is not None:
                conv_output_blob = C2.NextBlob("conv_output")
                self.conv_target_network.make_conv_pass_ops(
                    C2.model(), states, conv_output_blob
                )
                states = conv_output_blob
            self.target_network.make_forward_pass_ops(
                C2.model(), states, all_q_values, True
            )
        else:
            if self.conv_ml_trainer is not None:
                conv_output_blob = C2.NextBlob("conv_output")
                self.conv_ml_trainer.make_conv_pass_ops(
                    C2.model(), states, conv_output_blob
                )
                states = conv_output_blob
            self.ml_trainer.make_forward_pass_ops(
                C2.model(), states, all_q_values, True
            )
        return all_q_values
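
When the returned all_q_values blob is used to build targets, only actions that are actually available in each state should be considered. A hedged numpy sketch of that masking, following the possible_next_actions convention mentioned in the docstring; masked_max_q is a hypothetical helper and this downstream use is an assumption, not code from this trainer.

import numpy as np

def masked_max_q(all_q_values, possible_next_actions):
    # possible_next_actions[i][j] == 1 iff action j can be taken from state i;
    # disallowed actions are pushed to -inf before the row-wise max
    masked = np.where(possible_next_actions == 1, all_q_values, -np.inf)
    return masked.max(axis=1, keepdims=True)  # shape (batch_size, 1)
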
Example #6
    def target_values(self, input_blob: str) -> str:
        """ Estimates the values for the given inputs using the target network

        :param input_blob: The blob holding the given inputs
        """
        output_blob = C2.NextBlob("output_blob")
        MakeForwardPassOps(
            C2.model(),
            self.tn_model_id,
            input_blob,
            output_blob,
            self._weights,
            self._biases,
            self._trainer.activations,
            self._trainer.layers,
            self._trainer.dropout_ratio,
            True,
        )
        return output_blob
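
The target network evaluated here is a copy of the trained network that is held fixed while targets are computed. A minimal sketch of that idea, assuming a periodic hard copy of the trained parameters and a single linear layer; SketchTargetNetwork and its sync schedule are assumptions, not this class's actual behaviour.

class SketchTargetNetwork:
    def __init__(self, params):
        # Start from a frozen copy of the trained parameters
        self.params = {k: v.copy() for k, v in params.items()}

    def sync(self, trained_params):
        # Refresh the frozen copy; target_values stays fixed until the next sync
        self.params = {k: v.copy() for k, v in trained_params.items()}

    def target_values(self, inputs):
        # Forward pass through the frozen copy
        return inputs @ self.params["W"] + self.params["b"]
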
Example #7
    def get_q_values_all_actions(
        self,
        states: str,
        use_target_network: bool,
    ) -> str:
        """
        Takes in a set of states and runs the test Q Network on them.

        Creates Q(states, actions), a blob with shape (batch_size, action_dim).
        Q(states, actions)[i][j] is an approximation of Q*(states[i], action_j).
        Note that action_j takes on every possible action (of which there are
        self.action_dim_). Returns the blob holding these values.

        :param states: Numpy array with shape (batch_size, state_dim). Each row
            contains a representation of a state.
        :param use_target_network: Boolean that indicates whether or not to use this
            trainer's TargetNetwork to compute Q values.
        """
        if use_target_network:
            return self.target_network.target_values(states)
        else:
            all_q_values = C2.NextBlob("all_q_values")
            MakeForwardPassOps(
                C2.model(),
                self.model_id + "_score",
                states,
                all_q_values,
                self.weights,
                self.biases,
                self.activations,
                self.layers,
                self.dropout_ratio,
                True,
            )
            return all_q_values
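
The all-action Q values computed with use_target_network=True are typically combined with rewards to form the labels passed to update_model. A standard Q-learning target as a hedged numpy sketch; td_target, not_terminal, and gamma=0.99 are illustrative choices, not this trainer's exact code.

import numpy as np

def td_target(rewards, next_q_all, not_terminal, gamma=0.99):
    # Best achievable Q value in each next state, shape (batch_size, 1)
    max_next_q = next_q_all.max(axis=1, keepdims=True)
    # r + gamma * max_a' Q_target(s', a'), zeroed out at terminal transitions
    return rewards + gamma * not_terminal * max_next_q
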
Example #8
    def get_q_values(
        self,
        states: str,
        actions: str,
        use_target_network: bool,
    ) -> str:
        state_action_pairs, _ = C2.Concat(states, actions, axis=1)
        if use_target_network:
            return self.target_network.target_values(state_action_pairs)
        else:
            q_values = C2.NextBlob("q_values")
            MakeForwardPassOps(
                C2.model(),
                self.model_id,
                state_action_pairs,
                q_values,
                self.weights,
                self.biases,
                self.activations,
                self.layers,
                self.dropout_ratio,
                True,
            )
            return q_values