    @staticmethod
    def clamp_grad(grad_input,
                   clamping_bound,
                   variable_name: str,
                   gradient_computation_mask=None):

        if gradient_computation_mask is not None:
            # print("Applying gradient computation mask " + str(gradient_computation_mask) + " to " +
            #       "grad_output: " + str(grad_input))
            grad_output = TensorUtils.apply_binary_mask(
                grad_input, gradient_computation_mask)
        else:
            grad_output = grad_input

        grad_output = grad_output.clamp(min=-clamping_bound,
                                        max=clamping_bound)

        # if variable_name == "mdlstm - activation_column" or variable_name == "mdlstm - new_memory_state":
        #     print("clamping gradient - " + variable_name)
        #     print("clamp_grad_and_print - grad_input: " + str(grad_input))
        #     print("clamp_grad_and_print - grad_output: " + str(grad_output))

        is_bad_gradient = False

        if InsideModelGradientClamping.is_bad_grad(grad_input):
            print("is_bad_grad - grad_input: " + str(grad_input))
            print("clamping gradient - " + variable_name)
            print("clamp_grad - grad_input: " + str(grad_input))
            print("clamp_grad - grad_output: " + str(grad_output))
            is_bad_gradient = True

        if InsideModelGradientClamping.is_bad_grad(grad_output):
            print("is_bad_grad - grad_output: " + str(grad_output))
            print("clamping gradient - " + variable_name)
            print("clamp_grad - grad_input: " + str(grad_input))
            print("clamp_grad - grad_output: " + str(grad_output))
            is_bad_gradient = True

        if is_bad_gradient:
            raise RuntimeError("Error: found bad gradient for variable \"" +
                               variable_name + "\"")

        return grad_output
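    # --- Hedged sketch (not part of the original source) ---
    # InsideModelGradientClamping.is_bad_grad is used above but not shown in this
    # excerpt. A minimal sketch, assuming a "bad" gradient is one that contains NaN
    # or Inf entries; the name and the criterion are assumptions for illustration.
    @staticmethod
    def is_bad_grad_sketch(grad: torch.Tensor) -> bool:
        # True when any gradient entry is NaN or +/- infinity
        return bool(torch.isnan(grad).any().item() or torch.isinf(grad).any().item())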
    def compute_convolution_result_and_apply_mask(
            self, previous_state_column: torch.Tensor, mask: torch.Tensor):

        # This call appears to be causing a memory leak. The root cause seems to be
        # the call to the 2D convolution with multiple groups; perhaps there are too
        # many groups, e.g. 28 * 2, but it is not clear how to fix this. The bug
        # occurs at least in pytorch 0.4.0 and 0.4.1.
        # print("self.number_of_paired_input_weightings_per_group: " +
        #       str(self.number_of_paired_input_weightings_per_group))
        result = self.compute_convolution_result(previous_state_column)
        # self.parallel_convolution(previous_state_column)
        # result = None
        # return None

        if self.clamp_gradients:
            # print("ParallelMultipleStateWeightingsComputation - register gradient clamping...")
            # Register gradient clamping (with the default clamping bound) on the
            # convolution result, restricted to the optional computation mask
            result = InsideModelGradientClamping.\
                register_gradient_clamping_default_clamping_bound(result,
                                                                  "parallel_multiple_state_weightings_Computation",
                                                                  mask)

        # It is necessary to mask the non-valid entries in the convolution result.
        # If this is not done, the results will be "incorrect", and when using
        # examples packing the first row of the packed matrix will be treated
        # differently from the first rows of the other examples below the vertical
        # row separators. For this reason we must mask not only the states computed
        # for the next iteration during MDLSTM computation, but also the entries of
        # the convolution result that are not valid.
        # print("result.size(): " + str(result.size()))
        # print("mask.size(): " + str(mask.size()))
        # print("self.number_of_paired_input_weighting: " +
        #       str(self.get_number_of_paired_input_weightings()))
        if mask is not None:
            result = TensorUtils.apply_binary_mask(result, mask)

        # print("compute_convolution_result - result.size():" + str(result.size()))

        return result
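    # --- Hedged sketch (not part of the original source) ---
    # TensorUtils.apply_binary_mask is called above but not shown in this excerpt.
    # A minimal sketch, assuming it zeroes out the entries of the tensor wherever
    # the (broadcastable) binary mask is zero; the real helper may differ.
    @staticmethod
    def apply_binary_mask_sketch(tensor: torch.Tensor,
                                 binary_mask: torch.Tensor) -> torch.Tensor:
        # Element-wise multiplication with a {0, 1} mask keeps masked-in entries
        # and sets masked-out entries to zero
        return tensor * binary_mask.to(dtype=tensor.dtype)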
    @staticmethod
    def clamp_grad_and_print(grad_input,
                             clamping_bound,
                             variable_name: str,
                             gradient_computation_mask=None):
        # print("clamping gradient - " + variable_name)
        # print("number of non-zeros: " + str(TensorUtils.number_of_non_zeros(grad_input)))
        # torch.set_printoptions(precision=10)
        # print("maximum element: " + str(torch.max(grad_input)))
        # print("sum of all elements: " + str(torch.sum(grad_input)))
        # print("tensor norm: " + str(torch.norm(grad_input) * 10000000000))
        # print("clamp_grad_and_print - grad_input: " + str(grad_input))

        # nearly_zero_element_mask = grad_input.abs().lt(0.0000000001)
        # print("nearly_zero_element_mask: " + str(nearly_zero_element_mask))
        # grad_output = grad_input
        # zero_element_indices = torch.masked_select(nearly_zero_element_mask)
        # print("zero element indices: " + str(zero_element_indices))
        # grad_output.view(-1)[zero_element_indices] = 0

        # https://stackoverflow.com/questions/45384684/
        # replace-all-nonzero-values-by-zero-and-all-zero-values-by-a-specific-value/45386834
        # Zero out gradient entries whose absolute value is (nearly) zero
        grad_output = grad_input.clone()
        grad_output[grad_input.abs() < 1e-10] = 0
        # print("grad_output: " + str(grad_output))
        # print("grad_output.size(): " + str(grad_output.size()))
        # print("number of non-zeros after: " + str(TensorUtils.number_of_non_zeros(grad_output)))

        if gradient_computation_mask is not None:
            # print("Applying gradient computation mask " + str(gradient_computation_mask) + " to " +
            #       "grad_output: " + str(grad_output))
            grad_output = TensorUtils.apply_binary_mask(
                grad_output, gradient_computation_mask)

        grad_output = grad_output.clamp(min=-clamping_bound,
                                        max=clamping_bound)

        # print("clamp_grad_and_print - grad_output: " + str(grad_output))
        return grad_output
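    # --- Hedged usage sketch (not part of the original source) ---
    # clamp_grad_and_print takes the gradient as its first argument, which matches
    # the signature of a PyTorch backward hook, so it can be attached to a tensor
    # with functools.partial. The helper name below is illustrative; presumably
    # register_gradient_clamping_default_clamping_bound wraps a registration like
    # this with a default clamping bound. Assumes "import functools" at module level.
    @staticmethod
    def register_gradient_clamping_sketch(tensor: torch.Tensor,
                                          clamping_bound: float,
                                          variable_name: str,
                                          gradient_computation_mask=None):
        # register_hook requires a tensor that participates in autograd
        if tensor.requires_grad:
            tensor.register_hook(functools.partial(
                InsideModelGradientClamping.clamp_grad_and_print,
                clamping_bound=clamping_bound,
                variable_name=variable_name,
                gradient_computation_mask=gradient_computation_mask))
        return tensor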