def fit_mean_std_with_tobit(self, y_intervals):
    y_single_valued, y_left_censored, y_right_censored, data_mean, data_std = self.read_tensors_from_intervals(y_intervals)
    delta = to_torch(0, grad=True)
    # tuple for single valued, left censored, right censored
    x_tuple = (delta, delta, delta)
    y_tuple = (y_single_valued, y_left_censored, y_right_censored)
    gamma = to_torch(1, device='cpu', grad=True)
    tobit = Reparametrized_Scaled_Tobit_Loss(gamma, device='cpu')
    optimizer = t.optim.SGD([delta, gamma], lr=1e-1)
    patience = 5
    for i in range(10_000):
        prev_delta, prev_gamma = delta.clone(), gamma.clone()
        optimizer.zero_grad()
        loss = tobit(x_tuple, y_tuple)
        loss.backward()
        optimizer.step()
        early_stop = math.fabs(delta - prev_delta) + math.fabs(gamma - prev_gamma) < 1e-5
        if early_stop:
            patience -= 1
            if patience == 0:
                break
        else:
            patience = 5
        if i % 100 == 0:
            print(i, delta, gamma)
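In the reparametrized formulation the loss is expressed in terms of delta = mean / std and gamma = 1 / std (see the log-likelihood of the reparametrized forward pass below), so once the loop above converges the optimized tensors still have to be mapped back to the usual mean and standard deviation. A minimal sketch, assuming delta and gamma are the 0-dim tensors optimized above:

with t.no_grad():
    # gamma is optimized unconstrained, so take its absolute value, mirroring the loss
    fitted_std = 1 / t.abs(gamma)
    fitted_mean = delta / t.abs(gamma)
print(fitted_mean.item(), fitted_std.item())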
def forward(self, x: Tuple[t.Tensor, t.Tensor, t.Tensor], y: Tuple[t.Tensor, t.Tensor, t.Tensor]) -> t.Tensor:
    x_single_value, x_left_censored, x_right_censored = x
    y_single_value, y_left_censored, y_right_censored = y
    N = len(y_single_value) + len(y_left_censored) + len(y_right_censored)
    sigma = t.abs(self.sigma)

    # Step 1: compute loss for uncensored data based on pdf:
    # -sum(ln(pdf((y - x)/sigma)) - ln(sigma))
    log_likelihood_pdf = to_torch(0, device=self.device, grad=True)
    if len(y_single_value) > 0:
        log_likelihood_pdf = -t.sum(-(((y_single_value - x_single_value) / sigma) ** 2) / 2 - t.log(sigma + self.epsilon))

    # Step 2: compute loss for left censored data:
    # -sum(ln(cdf((y - x)/sigma) - cdf((truncation - x)/sigma)))
    log_likelihood_cdf = to_torch(0, device=self.device, grad=True)
    if len(y_left_censored) > 0:
        truncation_low_penalty = 0 if not self.truncated_low else cdf((self.truncated_low - x_left_censored) / sigma)
        log_likelihood_cdf = -t.sum(t.log(cdf((y_left_censored - x_left_censored) / sigma) - truncation_low_penalty + self.epsilon))

    # Step 3: compute the loss for right censored data:
    # -sum(ln(cdf((x - y)/sigma) - cdf((x - truncation)/sigma)))
    # Notice that log(1 - cdf(z)) = log(cdf(-z)), thus compared to step 2 the signs of y and x are swapped
    log_likelihood_1_minus_cdf = to_torch(0, device=self.device, grad=True)
    if len(y_right_censored) > 0:
        truncation_high_penalty = 0 if not self.truncated_high else cdf((-self.truncated_high + x_right_censored) / sigma)
        log_likelihood_1_minus_cdf = -t.sum(t.log(cdf((-y_right_censored + x_right_censored) / sigma) - truncation_high_penalty + self.epsilon))

    log_likelihood = log_likelihood_pdf + log_likelihood_cdf + log_likelihood_1_minus_cdf
    std_penalty = 0 if not self.std_panalty else self.std_panalty * sigma
    return log_likelihood + std_penalty
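A minimal usage sketch for this forward pass with made-up data; the constructor call mirrors the fitting routine below (Scaled_Tobit_Loss(sigma, device='cpu')), and all values are purely illustrative:

# toy data: two observations are only known to be <= 2 (left censored at 2),
# one is only known to be >= 8 (right censored at 8)
y_uncensored = to_torch([3.1, 4.7, 5.2, 6.0])
y_left = to_torch([2.0, 2.0])
y_right = to_torch([8.0])

mean = to_torch(0, grad=True)    # a single shared location, broadcast over all three groups
sigma = to_torch(1, grad=True)
loss_fn = Scaled_Tobit_Loss(sigma, device='cpu')

loss = loss_fn((mean, mean, mean), (y_uncensored, y_left, y_right))
loss.backward()                  # gradients flow into both mean and sigma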
def fit_mean_std_with_tobit(self, y_intervals):
    y_single_valued, y_left_censored, y_right_censored, data_mean, data_std = self.read_tensors_from_intervals(y_intervals)
    mean = to_torch(0, grad=True)
    # tuple for single valued, left censored, right censored
    x_tuple = (mean, mean, mean)
    y_tuple = (y_single_valued, y_left_censored, y_right_censored)
    std = to_torch(1, device='cpu', grad=True)
    tobit = Scaled_Tobit_Loss(std, device='cpu')
    optimizer = t.optim.SGD([mean, std], lr=1e-1)
    patience = 5
    for i in range(10_000):
        prev_mean, prev_std = mean.clone(), std.clone()
        optimizer.zero_grad()
        loss = tobit(x_tuple, y_tuple)
        loss.backward()
        optimizer.step()
        early_stop = math.fabs(mean - prev_mean) + math.fabs(std - prev_std) < 1e-5
        if early_stop:
            patience -= 1
            if patience == 0:
                break
        else:
            patience = 5
        if i % 100 == 0:
            print(i, mean, std)
def forward(self, x: Tuple[t.Tensor, t.Tensor, t.Tensor], y: Tuple[t.Tensor, t.Tensor, t.Tensor], gamma: Tuple[t.Tensor, t.Tensor, t.Tensor]) -> t.Tensor:
    x_single_value, x_left_censored, x_right_censored = x
    y_single_value, y_left_censored, y_right_censored = y
    gamma_single_value, gamma_left_censored, gamma_right_censored = gamma
    gamma_single_value, gamma_left_censored, gamma_right_censored = t.abs(gamma_single_value), t.abs(gamma_left_censored), t.abs(gamma_right_censored)
    N = len(y_single_value) + len(y_left_censored) + len(y_right_censored)

    # Step 1: compute loss for uncensored data based on pdf:
    # -sum(ln(gamma) + ln(pdf(gamma * y - x)))
    log_likelihood_pdf = to_torch(0, device=self.device, grad=True)
    if len(y_single_value) > 0:
        log_likelihood_pdf = -t.sum(t.log(gamma_single_value + self.epsilon) - ((gamma_single_value * y_single_value - x_single_value) ** 2) / 2)

    # Step 2: compute loss for left censored data:
    # -sum(ln(cdf(gamma * y - x) - cdf(gamma * truncation - x)))
    log_likelihood_cdf = to_torch(0, device=self.device, grad=True)
    if len(y_left_censored) > 0:
        truncation_low_penalty = 0 if not self.truncated_low else cdf(gamma_left_censored * self.truncated_low - x_left_censored)
        log_likelihood_cdf = -t.sum(t.log(cdf(gamma_left_censored * y_left_censored - x_left_censored) - truncation_low_penalty + self.epsilon))

    # Step 3: compute the loss for right censored data:
    # -sum(ln(cdf(x - gamma * y) - cdf(x - gamma * truncation)))
    # Notice that log(1 - cdf(z)) = log(cdf(-z)), thus compared to step 2 the signs of gamma * y and x are swapped
    log_likelihood_1_minus_cdf = to_torch(0, device=self.device, grad=True)
    if len(y_right_censored) > 0:
        truncation_high_penalty = 0 if not self.truncated_high else cdf(-gamma_right_censored * self.truncated_high + x_right_censored)
        log_likelihood_1_minus_cdf = -t.sum(t.log(cdf(-gamma_right_censored * y_right_censored + x_right_censored) - truncation_high_penalty + self.epsilon))

    log_likelihood = log_likelihood_pdf + log_likelihood_cdf + log_likelihood_1_minus_cdf
    return log_likelihood
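When gamma is predicted per sample (for example by a second output head of a network), the same reparametrization applies element-wise, so each observation gets its own mean and standard deviation. A short sketch under that assumption; model_mean_head and model_std_head are hypothetical helpers, not part of the code above:

delta_pred = model_mean_head(batch_x)        # hypothetical head producing delta per sample, shape (N,)
gamma_pred = model_std_head(batch_x)         # hypothetical head producing gamma per sample, shape (N,)
pred_std = 1 / t.abs(gamma_pred)             # std = 1 / gamma
pred_mean = delta_pred / t.abs(gamma_pred)   # mean = delta / gamma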
def forward(self, x: Tuple[t.Tensor, t.Tensor, t.Tensor], y: Tuple[t.Tensor, t.Tensor, t.Tensor]) -> t.Tensor:
    x_single_value, x_left_censored, x_right_censored = x
    y_single_value, y_left_censored, y_right_censored = y
    N = len(y_single_value) + len(y_left_censored) + len(y_right_censored)

    # Step 1: compute loss for uncensored data based on pdf:
    # -sum(ln(pdf(y - x))), which reduces to sum((y - x)^2 / 2) up to an additive constant
    log_likelihood_pdf = to_torch(0, device=self.device, grad=True)
    if len(y_single_value) > 0:
        log_likelihood_pdf = t.sum(((y_single_value - x_single_value) ** 2) / 2)

    # Step 2: compute loss for left censored data:
    # -sum(ln(cdf(y - x) - cdf(truncation - x)))
    log_likelihood_cdf = to_torch(0, device=self.device, grad=True)
    if len(y_left_censored) > 0:
        truncation_low_penalty = 0 if not self.truncated_low else cdf(self.truncated_low - x_left_censored)
        log_likelihood_cdf = -t.sum(t.log(cdf(y_left_censored - x_left_censored) - truncation_low_penalty + self.epsilon))

    # Step 3: compute the loss for right censored data:
    # -sum(ln(cdf(x - y) - cdf(x - truncation)))
    # Notice that log(1 - cdf(z)) = log(cdf(-z)), thus the swapped signs
    log_likelihood_1_minus_cdf = to_torch(0, device=self.device, grad=True)
    if len(y_right_censored) > 0:
        truncation_high_penalty = 0 if not self.truncated_high else cdf(x_right_censored - self.truncated_high)
        log_likelihood_1_minus_cdf = -t.sum(t.log(cdf(x_right_censored - y_right_censored) - truncation_high_penalty + self.epsilon))

    log_likelihood = log_likelihood_pdf + log_likelihood_cdf + log_likelihood_1_minus_cdf
    return log_likelihood
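Because this variant has no scale parameter, only the location x is optimized. A sketch of fitting a single mean with it, analogous to the fitting routines above; the class name and constructor (Unscaled_Tobit_Loss(device='cpu')) are assumptions here, and the y tensors are the censored splits produced by read_tensors_from_intervals:

mean = to_torch(0, grad=True)
loss_fn = Unscaled_Tobit_Loss(device='cpu')   # assumed class name / constructor
optimizer = t.optim.SGD([mean], lr=1e-1)
for i in range(1_000):
    optimizer.zero_grad()
    loss = loss_fn((mean, mean, mean), (y_single_valued, y_left_censored, y_right_censored))
    loss.backward()
    optimizer.step()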
def test_cdf_gradient(self):
    input = [10, 15, 20, 25, 30]

    # manual gradient computing
    x = np.array(input)
    mean, std = x.mean(), x.std()
    x_normalized = normalize(x, mean, std)
    expected_cdf = norm.cdf(x_normalized)
    expected_log_likelihood = np.log(expected_cdf)
    expected_grad_log_likelihood_by_x = norm.pdf(x_normalized) / (expected_cdf * std)

    # automatic gradient computing
    x = to_torch(input, grad=True)
    # in this test mean & std are considered constants
    x_normalized = normalize(x, mean, std)
    cdf_result = cdf(x_normalized)
    assert_almost_equal(to_numpy(cdf_result), expected_cdf)
    log_likelihood_result = t.log(cdf_result)
    assert_almost_equal(to_numpy(log_likelihood_result), expected_log_likelihood)
    loss = t.sum(log_likelihood_result)
    loss.backward()
    assert_almost_equal(to_numpy(x.grad), expected_grad_log_likelihood_by_x)
def forward(ctx, x: t.Tensor) -> t.Tensor:
    type, device = x.dtype, x.device
    _x = to_numpy(x)
    # the derivative of the cdf is the pdf, so cache it for the backward pass
    pdf = to_torch(norm.pdf(_x), type=type, device=device, grad=False)
    ctx.save_for_backward(pdf)
    return to_torch(norm.cdf(_x), type=type, device=device, grad=False)
def backward(ctx, grad_output: t.Tensor) -> t.Tensor:
    pdf, = ctx.saved_tensors
    grad = None
    if ctx.needs_input_grad[0]:
        grad = grad_output * pdf
    return grad

cdf = __CDF.apply

if __name__ == '__main__':
    input = [10, 15, 20, 25, 30]

    # manual gradient computing
    x = np.array(input)
    mean, std = x.mean(), x.std()
    x_normalized = normalize(x, mean, std)
    expected_cdf = norm.cdf(x_normalized)
    expected_log_likelihood = np.log(expected_cdf)
    expected_grad_log_likelihood_by_x = norm.pdf(x_normalized) / (expected_cdf * std)

    # automatic gradient computing
    x = to_torch(input, grad=True)
    # in this test mean & std are considered constants
    x_normalized = normalize(x, mean, std)
    cdf_result = cdf(x_normalized)
    log_likelihood_result = t.log(cdf_result)
    loss = t.sum(log_likelihood_result)
    loss.backward()
    print(x.grad, expected_grad_log_likelihood_by_x)
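Besides the manual comparison against scipy above, the custom backward can also be sanity-checked with torch.autograd.gradcheck, which compares the analytic gradient against finite differences. A sketch, assuming to_torch / to_numpy preserve double precision (gradcheck requires float64 inputs):

x_check = t.tensor([-1.5, -0.3, 0.0, 0.7, 2.1], dtype=t.float64, requires_grad=True)
assert t.autograd.gradcheck(cdf, (x_check,), eps=1e-6, atol=1e-4)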